# sumitpore/remove-and-replace-special-characters-added-by-ai-tools-in-generated-text.py

## remove-and-replace-special-characters-added-by-ai-tools-in-generated-text.py
#!/usr/bin/env python3

import re
import sys
import pyperclip

# Character replacements: typographic (non-ASCII) character -> ASCII equivalent.
# Every key is a single character and every value is pure ASCII. Characters
# that are not listed here are deleted outright by the non-ASCII cleanup step
# later in this script, so anything that should survive in some form needs an
# entry in this table.
replacements = {
    # whitespace: without these entries the cleanup step would delete the
    # character and glue the neighbouring words together
    '\u00a0': ' ',  # no-break space
    '\u2002': ' ',  # en space
    '\u2003': ' ',  # em space
    '\u2009': ' ',  # thin space
    '\u200a': ' ',  # hair space
    '\u202f': ' ',  # narrow no-break space
    '\u205f': ' ',  # medium mathematical space
    '\u3000': ' ',  # ideographic space

    # quotes and apostrophes
    '‘': "'",   # left single quotation mark
    '’': "'",   # right single quotation mark
    '“': '"',   # left double quotation mark
    '”': '"',   # right double quotation mark
    '\u201a': "'",  # single low-9 quotation mark
    '\u201b': "'",  # single high-reversed-9 quotation mark
    '\u201e': '"',  # double low-9 quotation mark
    '\u201f': '"',  # double high-reversed-9 quotation mark

    # dashes
    '—': '-',  # em dash
    '–': '-',   # en dash
    '\u2010': '-',  # hyphen
    '\u2011': '-',  # non-breaking hyphen
    '\u2012': '-',  # figure dash
    '\u2015': '-',  # horizontal bar
    '\u2212': '-',  # minus sign

    # ellipsis
    '…': '...', # horizontal ellipsis

    # bullets and dots
    '•': '*',   # bullet
    '·': '*',   # middle dot

    # angle quotes
    '‹': '<',   # single left-pointing angle quotation mark
    '›': '>',   # single right-pointing angle quotation mark
    '«': '<<',  # left-pointing double angle quotation mark
    '»': '>>',  # right-pointing double angle quotation mark

    # currency
    '€': 'EUR', # euro sign
    '£': 'GBP', # pound sign
    '¥': 'YEN', # yen sign
    '¢': 'cents', # cent sign

    # symbols
    '©': '(c)', # copyright
    '®': '(R)', # registered trademark
    '™': '(TM)', # trademark
    '°': 'deg', # degree sign
    '±': '+/-', # plus-minus sign
    '×': 'x',   # multiplication sign
    '÷': '/',   # division sign
    '§': 'section', # section sign
    '¶': 'paragraph', # pilcrow/paragraph sign

    # fractions
    '½': '1/2', # fraction one half
    '¼': '1/4', # fraction one quarter
    '¾': '3/4', # fraction three quarters
    '⅓': '1/3', # fraction one third
    '⅔': '2/3', # fraction two thirds
    '⅛': '1/8', # fraction one eighth
    '⅜': '3/8', # fraction three eighths
    '⅝': '5/8', # fraction five eighths
    '⅞': '7/8', # fraction seven eighths

    # accented letters - lowercase
    'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a', 'ä': 'a', 'å': 'a',
    'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e',
    'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i',
    'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': 'o', 'ö': 'o',
    'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u',
    'ý': 'y', 'ÿ': 'y',
    'ñ': 'n', 'ç': 'c',
    'æ': 'ae', 'œ': 'oe',

    # accented letters - uppercase
    'À': 'A', 'Á': 'A', 'Â': 'A', 'Ã': 'A', 'Ä': 'A', 'Å': 'A',
    'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E',
    'Ì': 'I', 'Í': 'I', 'Î': 'I', 'Ï': 'I',
    'Ò': 'O', 'Ó': 'O', 'Ô': 'O', 'Õ': 'O', 'Ö': 'O',
    'Ù': 'U', 'Ú': 'U', 'Û': 'U', 'Ü': 'U',
    'Ý': 'Y',
    'Ñ': 'N', 'Ç': 'C',
    'Æ': 'AE', 'Œ': 'OE',
}

# ANSI escape sequences used to mark non-ASCII characters in terminal output
HIGHLIGHT = '\033[41m'  # red background
RESET = '\033[0m'

# matches any single character outside the 7-bit ASCII range; compiled once
# here instead of on every pass through the processing loop
NON_ASCII_RE = re.compile(r'[^\x00-\x7F]')


def read_pasted_text():
    """Read a multi-line paste from stdin and return it as one string.

    Input ends at EOF, or at two consecutive empty lines once at least one
    non-empty line has been read. Blank lines before the first content line
    are ignored, a single blank line between content lines is preserved, and
    trailing blank lines (including the terminator) are dropped.

    Returns:
        The pasted lines joined with newlines; "" if no content was entered.

    Raises:
        KeyboardInterrupt: propagated to the caller when the user hits Ctrl+C.
    """
    lines = []
    blank_run = 0
    try:
        while True:
            line = input()
            if line:
                blank_run = 0
                lines.append(line)
                continue
            if not lines:
                # nothing pasted yet: a blank line is neither content nor
                # an end-of-input signal
                continue
            blank_run += 1
            if blank_run >= 2:
                break
            lines.append(line)
    except EOFError:
        # EOF reached, process what we have
        pass

    # drop the blank line(s) that only served to terminate the input
    while lines and not lines[-1]:
        lines.pop()
    return '\n'.join(lines)


def highlight_non_ascii(text):
    """Return *text* with each non-ASCII character on a red background.

    Printable characters are shown as themselves so the user can see what
    was flagged; non-printable ones (no-break space, zero-width characters,
    ...) are rendered as a blank so the red cell is still visible.
    """
    parts = []
    for char in text:
        if ord(char) > 127:
            shown = char if char.isprintable() else ' '
            parts.append(f"{HIGHLIGHT}{shown}{RESET}")
        else:
            parts.append(char)
    return ''.join(parts)


def normalize_text(text, table):
    """Convert *text* to pure ASCII.

    Args:
        text: the string to clean.
        table: mapping of typographic character -> ASCII replacement string.

    Returns:
        A ``(cleaned, removed)`` tuple: the ASCII-only text, and the number
        of non-ASCII characters that had no entry in *table* and were deleted.
    """
    normalized = text
    for typographic, ascii_text in table.items():
        normalized = normalized.replace(typographic, ascii_text)
    # whatever is still non-ASCII has no known equivalent: delete and count it
    return NON_ASCII_RE.subn('', normalized)


def main():
    """Run the interactive paste -> normalize -> copy-to-clipboard loop."""
    while True:
        # get text from user input
        print("\nPaste your text below (can be multiple lines):")
        print("Press Enter twice on empty lines when done, or Ctrl+D (Unix/Mac) / Ctrl+Z (Windows) to finish:")
        print("-" * 60)

        try:
            text = read_pasted_text()
        except KeyboardInterrupt:
            print("\n\nInterrupted. Exiting...")
            break

        if not text:
            break

        print("\n" + "=" * 60)
        print("PROCESSING RESULTS")
        print("=" * 60 + "\n")

        highlighted = highlight_non_ascii(text)
        cleaned, removed_count = normalize_text(text, replacements)

        # display original with highlighting
        print("Original text with non-ASCII highlighted:")
        print(highlighted)

        # display cleaned text
        print("\nCleaned text (ASCII only):")
        print(cleaned)
        print(f"\n{removed_count} non-ASCII characters removed")

        # copy cleaned text to clipboard; a missing clipboard backend should
        # not crash the tool after the cleaned text has already been shown
        try:
            pyperclip.copy(cleaned)
        except pyperclip.PyperclipException as exc:
            print(f"Could not copy to clipboard: {exc}")
        else:
            print("✓ Cleaned text copied to clipboard")

        # ask if user wants to continue
        print("\n" + "-" * 60)
        try:
            answer = input("Do you want to process another text? (yes/no): ")
        except (EOFError, KeyboardInterrupt):
            # stdin is exhausted (e.g. piped input) or the user hit Ctrl+C:
            # treat it as "no" instead of dying with a traceback
            print()
            break
        if answer.strip().lower() not in ('yes', 'y'):
            break

    print("\nThank you for using the text normalizer!")


if __name__ == "__main__":
    main()
# No results found