Skip to content

Instantly share code, notes, and snippets.

@codeslord
Last active November 11, 2025 08:23
Show Gist options
  • Select an option

  • Save codeslord/2953cbef2d67946de2ec5e5aa8d65f19 to your computer and use it in GitHub Desktop.

Select an option

Save codeslord/2953cbef2d67946de2ec5e5aa8d65f19 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Script to automatically remove the references section, square-bracket references,
and the Perplexity image from a Perplexity markdown export.
Examples:
# Process a single file (creates backup)
python perplexity_markdown_cleaner.py file.md
# Process a single file without backup
python perplexity_markdown_cleaner.py file.md --no-backup
# Process a single file and save to new location
python perplexity_markdown_cleaner.py input.md --output cleaned.md
# Process all .md files in a directory
python perplexity_markdown_cleaner.py --directory /path/to/docs
# Process all .txt files in current directory (non-recursive)
python perplexity_markdown_cleaner.py --directory . --pattern "*.txt" --no-recursive
"""
import re
import argparse
import os
from pathlib import Path
def remove_references(text):
    """
    Remove Perplexity citation artifacts from exported markdown text.

    The following are stripped, in this order:

    1. Footnote definition lines such as ``[^1]: https://...``.
    2. Inline citation markers such as ``[^1]`` or ``[^1][^2]``.
    3. ``<img src="..." .../>`` tags (every such tag in the text is removed,
       not only one at the top of the document).
    4. Empty hidden spans like ``<span style="display:none"></span>``.
    5. Everything from the ``<div ...>⁂</div>`` separator to the end of
       the document.

    Runs of three or more line breaks left behind are collapsed to one blank
    line and the result is stripped of leading/trailing whitespace.

    Args:
        text (str): Input text containing references
    Returns:
        str: Text with references removed
    """
    # Footnote definitions must be removed BEFORE the inline markers: once the
    # inline pass has deleted the leading "[^1]", the definition line can no
    # longer be recognised and its ": https://..." tail would be left behind.
    # ".*" (instead of "\s*.*") keeps the match on a single line, so an empty
    # definition cannot swallow the line that follows it.
    cleaned_text = re.sub(r'^\[\^\d+\]:.*$', '', text, flags=re.MULTILINE)

    # Inline markers like [^1], [^2], [^10]. Each marker is matched on its
    # own (consecutive markers are simply consecutive matches), so a closing
    # bracket that belongs to the surrounding text is preserved.
    cleaned_text = re.sub(r'\[\^\d+\]', '', cleaned_text)

    # Image tags such as: <img src="..." style="..."/>
    img_pattern = r'<img\s+src="[^"]*"[^>]*/?>\s*\n?'
    cleaned_text = re.sub(img_pattern, '', cleaned_text, flags=re.IGNORECASE)

    # Hidden spans such as <span style="display:none"></span>. This runs after
    # the inline pass because a span that only wrapped markers is empty by now.
    span_pattern = r'<span\s+style="display:\s*none"[^>]*></span>\s*'
    cleaned_text = re.sub(span_pattern, '', cleaned_text, flags=re.IGNORECASE)

    # References section: everything from the div holding the ⁂ symbol to the
    # end of the document (DOTALL lets "." run across line breaks).
    references_pattern = r'<div[^>]*>⁂</div>\s*\n*(.*?)$'
    cleaned_text = re.sub(references_pattern, '', cleaned_text, flags=re.DOTALL)

    # Collapse the extra blank lines the removals leave behind.
    cleaned_text = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_text)
    return cleaned_text.strip()
def process_file(file_path, output_path=None, backup=True):
    """
    Process a single file to remove references.

    The file is read as UTF-8, cleaned with ``remove_references`` and written
    back as UTF-8, either over the original or to ``output_path``.

    Args:
        file_path (str): Path to the input file
        output_path (str, optional): Path for output file. If None, overwrites original
        backup (bool): Whether to create a backup of the original file. A backup
            (``<name><suffix>.backup`` next to the input) is only made when the
            original is overwritten, i.e. when ``output_path`` is None.
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        file_path = Path(file_path)

        # Check if file exists
        if not file_path.exists():
            print(f"Error: File '{file_path}' does not exist.")
            return False

        # Read the original file
        with open(file_path, 'r', encoding='utf-8') as f:
            original_content = f.read()

        # Remove references
        cleaned_content = remove_references(original_content)

        # Create backup if requested
        if backup and output_path is None:
            backup_path = file_path.with_suffix(file_path.suffix + '.backup')
            # Copy the raw bytes rather than re-writing the decoded text:
            # text-mode reading/writing translates line endings, so a backup
            # produced from `original_content` is not guaranteed to be
            # identical to the file it is supposed to preserve.
            backup_path.write_bytes(file_path.read_bytes())
            print(f"Backup created: {backup_path}")

        # Determine output path
        output_path = file_path if output_path is None else Path(output_path)

        # Write cleaned content
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)

        print(f"References removed from: {file_path}")
        if output_path != file_path:
            print(f"Output written to: {output_path}")
        return True
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch run, and
        # the contract is to report the failure and return False, not raise.
        print(f"Error processing file '{file_path}': {e}")
        return False
def process_directory(directory_path, pattern="*.md", recursive=True, backup=True):
    """
    Process all files in a directory matching the given pattern.

    Each matching regular file is passed to ``process_file`` and overwritten
    in place.

    Args:
        directory_path (str): Path to the directory
        pattern (str): File pattern to match (default: "*.md")
        recursive (bool): Whether to search subdirectories
        backup (bool): Whether to create backups
    Returns:
        tuple: (success_count, total_count); (0, 0) when the directory is
        missing, is not a directory, or contains no matching files
    """
    directory_path = Path(directory_path)
    if not directory_path.exists():
        print(f"Error: Directory '{directory_path}' does not exist.")
        return 0, 0
    if not directory_path.is_dir():
        print(f"Error: '{directory_path}' is not a directory.")
        return 0, 0

    # Find matching files
    matches = directory_path.rglob(pattern) if recursive else directory_path.glob(pattern)
    # glob patterns also match directories (e.g. a folder named "notes.md");
    # keep regular files only so those are not reported as failed files.
    # Sorting makes the processing order deterministic.
    files = sorted(path for path in matches if path.is_file())
    if not files:
        print(f"No files matching pattern '{pattern}' found in '{directory_path}'")
        return 0, 0

    print(f"Found {len(files)} files to process...")
    success_count = 0
    for file_path in files:
        print(f"\nProcessing: {file_path}")
        if process_file(file_path, backup=backup):
            success_count += 1
    return success_count, len(files)
def main(argv=None):
    """
    Handle command line arguments and execute the script.

    Args:
        argv (list[str], optional): Arguments to parse. When None, argparse
            reads them from ``sys.argv[1:]``.
    Returns:
        int: Process exit status: 0 on success, 1 when a file or directory
        could not be processed. Invalid argument combinations exit through
        ``parser.error`` (SystemExit with status 2).
    """
    parser = argparse.ArgumentParser(
        description="Remove square bracket references from markdown files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process a single file (creates backup)
  python perplexity_markdown_cleaner.py file.md
  # Process a single file without backup
  python perplexity_markdown_cleaner.py file.md --no-backup
  # Process a single file and save to new location
  python perplexity_markdown_cleaner.py input.md --output cleaned.md
  # Process all .md files in a directory
  python perplexity_markdown_cleaner.py --directory /path/to/docs
  # Process all .txt files in current directory (non-recursive)
  python perplexity_markdown_cleaner.py --directory . --pattern "*.txt" --no-recursive
"""
    )
    # File or directory input
    parser.add_argument(
        'input',
        nargs='?',
        help='Input file path (if not using --directory)'
    )
    parser.add_argument(
        '--output', '-o',
        help='Output file path (only for single file processing)'
    )
    parser.add_argument(
        '--directory', '-d',
        help='Process all files in directory matching pattern'
    )
    parser.add_argument(
        '--pattern', '-p',
        default='*.md',
        help='File pattern to match when processing directory (default: *.md)'
    )
    # --recursive / --backup are on by default; the --no-* flags switch them off.
    parser.add_argument(
        '--recursive', '-r',
        action='store_true',
        default=True,
        help='Search subdirectories (default: True)'
    )
    parser.add_argument(
        '--no-recursive',
        action='store_true',
        help='Do not search subdirectories'
    )
    parser.add_argument(
        '--backup', '-b',
        action='store_true',
        default=True,
        help='Create backup files (default: True)'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Do not create backup files'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be processed without making changes'
    )
    args = parser.parse_args(argv)

    # Handle conflicting arguments: the negative flag always wins
    if args.no_recursive:
        args.recursive = False
    if args.no_backup:
        args.backup = False

    # Validate arguments
    if not args.directory and not args.input:
        parser.error("Either provide an input file or use --directory option")
    if args.directory and args.input:
        parser.error("Cannot use both input file and --directory option")
    if args.output and args.directory:
        parser.error("Cannot use --output with --directory option")

    # Dry run mode
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print("-" * 50)

    # Process directory
    if args.directory:
        print(f"Processing directory: {args.directory}")
        print(f"Pattern: {args.pattern}")
        print(f"Recursive: {args.recursive}")
        print(f"Backup: {args.backup}")
        print("-" * 50)

        directory_path = Path(args.directory)
        # Checked here so a bad path is reported as a failure (exit status 1)
        # instead of a silent "0 files" result.
        if not directory_path.is_dir():
            print(f"Error: '{directory_path}' is not an existing directory.")
            return 1

        if args.dry_run:
            if args.recursive:
                matches = directory_path.rglob(args.pattern)
            else:
                matches = directory_path.glob(args.pattern)
            # Directories can match the pattern too; list regular files only.
            files = sorted(path for path in matches if path.is_file())
            print(f"Would process {len(files)} files:")
            for file_path in files:
                print(f"  - {file_path}")
            return 0

        success_count, total_count = process_directory(
            args.directory,
            args.pattern,
            args.recursive,
            args.backup
        )
        print(f"\nCompleted: {success_count}/{total_count} files processed successfully")
        return 0 if success_count == total_count else 1

    # Process single file
    print(f"Processing file: {args.input}")
    print(f"Output: {args.output or 'overwrite original'}")
    print(f"Backup: {args.backup}")
    print("-" * 50)

    if args.dry_run:
        if not Path(args.input).is_file():
            print(f"File does not exist: {args.input}")
            return 1
        print(f"Would process: {args.input}")
        with open(args.input, 'r', encoding='utf-8') as f:
            content = f.read()
        # One marker per match; a trailing "+" after "\]" would also swallow
        # closing brackets that belong to the surrounding text.
        references = re.findall(r'\[\^\d+\]', content)
        if references:
            # Sorted by footnote number so the report is stable between runs.
            unique_refs = sorted(set(references), key=lambda ref: int(ref[2:-1]))
            print(f"Found references to remove: {unique_refs}")
        else:
            print("No references found in file")
        return 0

    if process_file(args.input, args.output, args.backup):
        print("File processed successfully!")
        return 0
    print("Failed to process file.")
    return 1


if __name__ == "__main__":
    # Propagate main()'s status so shells and scripts can detect failures.
    raise SystemExit(main())
@codeslord
Copy link
Author

codeslord commented Sep 3, 2025

Updated to include span

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment