Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save heychriszappa/8d06185bb4d368be339b9b6f563086bf to your computer and use it in GitHub Desktop.

Select an option

Save heychriszappa/8d06185bb4d368be339b9b6f563086bf to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Script that automatically removes the references section, square-bracket
references, and the Perplexity image from a Perplexity Markdown export.
Examples:
# Process a single file (creates backup)
python perplexity_markdown_cleaner.py file.md
# Process a single file without backup
python perplexity_markdown_cleaner.py file.md --no-backup
# Process a single file and save to new location
python perplexity_markdown_cleaner.py input.md --output cleaned.md
# Process all .md files in a directory
python perplexity_markdown_cleaner.py --directory /path/to/docs
# Process all .txt files in current directory (non-recursive)
python perplexity_markdown_cleaner.py --directory . --pattern "*.txt" --no-recursive
"""
import re
import argparse
import os
from pathlib import Path
def remove_references(text):
    """
    Strip Perplexity citation artifacts from exported markdown text.

    The following are removed, in this order:

    1. The trailing references section: everything from the ``⁂`` divider
       ``<div>`` to the end of the document.
    2. Footnote definition lines such as ``[^1]: https://...``.
    3. Inline citation markers such as ``[^1]`` or ``[^1][^2]``.
    4. ``<img src="..." .../>`` tags (the image placed at the top of an export).
    5. Empty hidden spans such as ``<span style="display:none"></span>``.

    Finally, runs of blank lines left behind are collapsed to a single blank
    line and leading/trailing whitespace is stripped.

    Args:
        text (str): Input text containing references

    Returns:
        str: Text with references removed
    """
    # Drop the references section: from the "⁂" divider <div> through the end
    # of the document. DOTALL lets "." run across newlines.
    cleaned_text = re.sub(r'<div[^>]*>⁂</div>.*', '', text, flags=re.DOTALL)

    # Remove footnote definitions ("[^1]: https://...") BEFORE the inline
    # markers. If the markers were stripped first, each definition would be
    # reduced to ": https://..." and would no longer be recognised.
    # ".*" (without DOTALL) keeps the match on a single line, so an empty
    # definition cannot swallow the line that follows it.
    cleaned_text = re.sub(
        r'^\[\^\d+\]:.*$', '', cleaned_text, flags=re.MULTILINE
    )

    # Remove inline markers. The whole marker is grouped so that "+" repeats
    # complete markers ("[^1][^2]") rather than only the closing bracket,
    # which would also eat unrelated "]" characters that follow a marker.
    cleaned_text = re.sub(r'(?:\[\^\d+\])+', '', cleaned_text)

    # Remove image tags like: <img src="..." style="..."/>
    img_pattern = r'<img\s+src="[^"]*"[^>]*/?>\s*\n?'
    cleaned_text = re.sub(img_pattern, '', cleaned_text, flags=re.IGNORECASE)

    # Remove hidden span elements like <span style="display:none"></span>.
    # This runs after marker removal so spans that only wrapped markers are
    # empty by now and match.
    span_pattern = r'<span\s+style="display:\s*none"[^>]*></span>\s*'
    cleaned_text = re.sub(span_pattern, '', cleaned_text, flags=re.IGNORECASE)

    # Collapse the extra blank lines left behind by the removals above.
    cleaned_text = re.sub(r'\n\s*\n\s*\n', '\n\n', cleaned_text)

    return cleaned_text.strip()
def process_file(file_path, output_path=None, backup=True):
    """
    Process a single file to remove references.

    The file is read as UTF-8, cleaned with ``remove_references`` and written
    to ``output_path`` (or back over the original when ``output_path`` is
    None). When ``backup`` is true and the original file is about to be
    overwritten, its unmodified content is first saved next to it with an
    extra ``.backup`` suffix (``notes.md`` -> ``notes.md.backup``).

    Args:
        file_path (str): Path to the input file
        output_path (str, optional): Path for output file. If None, overwrites original
        backup (bool): Whether to create a backup of the original file

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        file_path = Path(file_path)

        # Check if file exists
        if not file_path.exists():
            print(f"Error: File '{file_path}' does not exist.")
            return False

        # Read the original file
        with open(file_path, 'r', encoding='utf-8') as f:
            original_content = f.read()

        # Remove references
        cleaned_content = remove_references(original_content)

        # Determine output path
        output_path = file_path if output_path is None else Path(output_path)

        # Back up whenever the original is about to be overwritten. Comparing
        # resolved paths also covers an explicit output path that points back
        # at the input file, which previously skipped the backup.
        overwrites_input = output_path.resolve() == file_path.resolve()
        if backup and overwrites_input:
            backup_path = file_path.with_suffix(file_path.suffix + '.backup')
            with open(backup_path, 'w', encoding='utf-8') as f:
                f.write(original_content)
            print(f"Backup created: {backup_path}")

        # Write cleaned content
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(cleaned_content)

        print(f"References removed from: {file_path}")
        if output_path != file_path:
            print(f"Output written to: {output_path}")
        return True
    except Exception as e:
        # Deliberately broad: any failure is reported and turned into a False
        # result so that batch processing can continue with the next file.
        print(f"Error processing file '{file_path}': {str(e)}")
        return False
def process_directory(directory_path, pattern="*.md", recursive=True, backup=True):
    """
    Process all files in a directory matching the given pattern.

    Each matching regular file is handed to ``process_file`` and rewritten in
    place. Directories whose names happen to match the pattern are ignored.

    Args:
        directory_path (str): Path to the directory
        pattern (str): File pattern to match (default: "*.md")
        recursive (bool): Whether to search subdirectories
        backup (bool): Whether to create backups

    Returns:
        tuple: (success_count, total_count); (0, 0) when the directory is
        missing, is not a directory, or contains no matching files
    """
    directory_path = Path(directory_path)

    if not directory_path.exists():
        print(f"Error: Directory '{directory_path}' does not exist.")
        return 0, 0
    if not directory_path.is_dir():
        print(f"Error: '{directory_path}' is not a directory.")
        return 0, 0

    # Find matching files. glob()/rglob() also yield directories whose names
    # match the pattern; those can never be processed, so keep regular files
    # only. Sorting makes the processing order reproducible.
    find = directory_path.rglob if recursive else directory_path.glob
    files = sorted(path for path in find(pattern) if path.is_file())

    if not files:
        print(f"No files matching pattern '{pattern}' found in '{directory_path}'")
        return 0, 0

    print(f"Found {len(files)} files to process...")

    success_count = 0
    for file_path in files:
        print(f"\nProcessing: {file_path}")
        if process_file(file_path, backup=backup):
            success_count += 1

    return success_count, len(files)
def main():
    """
    Main function to handle command line arguments and execute the script.

    Parses ``sys.argv``, validates the option combination and then either
    processes a single file or every matching file in a directory. With
    ``--dry-run`` nothing is written; the function only reports what would
    be processed.
    """
    parser = argparse.ArgumentParser(
        description="Remove square bracket references from markdown files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Process a single file (creates backup)
python perplexity_markdown_cleaner.py file.md
# Process a single file without backup
python perplexity_markdown_cleaner.py file.md --no-backup
# Process a single file and save to new location
python perplexity_markdown_cleaner.py input.md --output cleaned.md
# Process all .md files in a directory
python perplexity_markdown_cleaner.py --directory /path/to/docs
# Process all .txt files in current directory (non-recursive)
python perplexity_markdown_cleaner.py --directory . --pattern "*.txt" --no-recursive
"""
    )

    # File or directory input
    parser.add_argument(
        'input',
        nargs='?',
        help='Input file path (if not using --directory)'
    )
    parser.add_argument(
        '--output', '-o',
        help='Output file path (only for single file processing)'
    )
    parser.add_argument(
        '--directory', '-d',
        help='Process all files in directory matching pattern'
    )
    parser.add_argument(
        '--pattern', '-p',
        default='*.md',
        help='File pattern to match when processing directory (default: *.md)'
    )
    # --recursive and --backup default to True, so passing them changes
    # nothing; the --no-* switches below are what turn the behaviour off.
    parser.add_argument(
        '--recursive', '-r',
        action='store_true',
        default=True,
        help='Search subdirectories (default: True)'
    )
    parser.add_argument(
        '--no-recursive',
        action='store_true',
        help='Do not search subdirectories'
    )
    parser.add_argument(
        '--backup', '-b',
        action='store_true',
        default=True,
        help='Create backup files (default: True)'
    )
    parser.add_argument(
        '--no-backup',
        action='store_true',
        help='Do not create backup files'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be processed without making changes'
    )

    args = parser.parse_args()

    # Handle conflicting arguments: the --no-* switches win.
    if args.no_recursive:
        args.recursive = False
    if args.no_backup:
        args.backup = False

    # Validate arguments
    if not args.directory and not args.input:
        parser.error("Either provide an input file or use --directory option")
    if args.directory and args.input:
        parser.error("Cannot use both input file and --directory option")
    if args.output and args.directory:
        parser.error("Cannot use --output with --directory option")

    # Dry run mode
    if args.dry_run:
        print("DRY RUN MODE - No files will be modified")
        print("-" * 50)

    # Process directory
    if args.directory:
        print(f"Processing directory: {args.directory}")
        print(f"Pattern: {args.pattern}")
        print(f"Recursive: {args.recursive}")
        print(f"Backup: {args.backup}")
        print("-" * 50)

        if args.dry_run:
            directory_path = Path(args.directory)
            # Report a bad path the same way the real run does instead of
            # silently claiming that zero files would be processed.
            if not directory_path.exists():
                print(f"Error: Directory '{directory_path}' does not exist.")
            elif not directory_path.is_dir():
                print(f"Error: '{directory_path}' is not a directory.")
            else:
                if args.recursive:
                    files = list(directory_path.rglob(args.pattern))
                else:
                    files = list(directory_path.glob(args.pattern))
                print(f"Would process {len(files)} files:")
                for file_path in files:
                    print(f" - {file_path}")
        else:
            success_count, total_count = process_directory(
                args.directory,
                args.pattern,
                args.recursive,
                args.backup
            )
            print(f"\nCompleted: {success_count}/{total_count} files processed successfully")
    # Process single file
    else:
        print(f"Processing file: {args.input}")
        print(f"Output: {args.output or 'overwrite original'}")
        print(f"Backup: {args.backup}")
        print("-" * 50)

        if args.dry_run:
            if Path(args.input).exists():
                print(f"Would process: {args.input}")
                with open(args.input, 'r', encoding='utf-8') as f:
                    content = f.read()
                # Match one complete marker at a time. The previous pattern
                # put "+" on the closing bracket only, so it reported bogus
                # entries such as "[^1]]".
                references = re.findall(r'\[\^\d+\]', content)
                if references:
                    print(f"Found references to remove: {set(references)}")
                else:
                    print("No references found in file")
            else:
                print(f"File does not exist: {args.input}")
        else:
            success = process_file(args.input, args.output, args.backup)
            if success:
                print("File processed successfully!")
            else:
                print("Failed to process file.")
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment