Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save sumitpore/8f2c49f455a114895ee4c864923a9a97 to your computer and use it in GitHub Desktop.

Select an option

Save sumitpore/8f2c49f455a114895ee4c864923a9a97 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import re
import sys
import pyperclip
# Typographic-to-ASCII lookup tables, one per category.  They are merged, in
# the order listed, into the single ``replacements`` mapping further down.
_QUOTE_MARKS = {
    '‘': "'",  # left single quotation mark
    '’': "'",  # right single quotation mark
    '“': '"',  # left double quotation mark
    '”': '"',  # right double quotation mark
}

_DASHES = {
    '—': '-',  # em dash
    '–': '-',  # en dash
}

_ELLIPSES = {
    '…': '...',  # horizontal ellipsis
}

_BULLETS = {
    '•': '*',  # bullet
    '·': '*',  # middle dot
}

_ANGLE_QUOTES = {
    '‹': '<',  # single left-pointing angle quotation mark
    '›': '>',  # single right-pointing angle quotation mark
    '«': '<<',  # left-pointing double angle quotation mark
    '»': '>>',  # right-pointing double angle quotation mark
}

_CURRENCY = {
    '€': 'EUR',  # euro sign
    '£': 'GBP',  # pound sign
    '¥': 'YEN',  # yen sign
    '¢': 'cents',  # cent sign
}

_SYMBOLS = {
    '©': '(c)',  # copyright
    '®': '(R)',  # registered trademark
    '™': '(TM)',  # trademark
    '°': 'deg',  # degree sign
    '±': '+/-',  # plus-minus sign
    '×': 'x',  # multiplication sign
    '÷': '/',  # division sign
    '§': 'section',  # section sign
    '¶': 'paragraph',  # pilcrow/paragraph sign
}

_FRACTIONS = {
    '½': '1/2',  # fraction one half
    '¼': '1/4',  # fraction one quarter
    '¾': '3/4',  # fraction three quarters
    '⅓': '1/3',  # fraction one third
    '⅔': '2/3',  # fraction two thirds
    '⅛': '1/8',  # fraction one eighth
    '⅜': '3/8',  # fraction three eighths
    '⅝': '5/8',  # fraction five eighths
    '⅞': '7/8',  # fraction seven eighths
}

# Accented letters collapse to their unaccented base; ligatures expand to
# two letters.
_LOWERCASE_LETTERS = {
    'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a', 'ä': 'a', 'å': 'a',
    'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e',
    'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i',
    'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': 'o', 'ö': 'o',
    'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u',
    'ý': 'y', 'ÿ': 'y',
    'ñ': 'n', 'ç': 'c',
    'æ': 'ae', 'œ': 'oe',
}

_UPPERCASE_LETTERS = {
    'À': 'A', 'Á': 'A', 'Â': 'A', 'Ã': 'A', 'Ä': 'A', 'Å': 'A',
    'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E',
    'Ì': 'I', 'Í': 'I', 'Î': 'I', 'Ï': 'I',
    'Ò': 'O', 'Ó': 'O', 'Ô': 'O', 'Õ': 'O', 'Ö': 'O',
    'Ù': 'U', 'Ú': 'U', 'Û': 'U', 'Ü': 'U',
    'Ý': 'Y',
    'Ñ': 'N', 'Ç': 'C',
    'Æ': 'AE', 'Œ': 'OE',
}

# character replacements: typographic to ASCII
replacements = {
    **_QUOTE_MARKS,
    **_DASHES,
    **_ELLIPSES,
    **_BULLETS,
    **_ANGLE_QUOTES,
    **_CURRENCY,
    **_SYMBOLS,
    **_FRACTIONS,
    **_LOWERCASE_LETTERS,
    **_UPPERCASE_LETTERS,
}
# Matches any character outside 7-bit ASCII; compiled once, not per iteration.
NON_ASCII_PATTERN = re.compile(r'[^\x00-\x7F]')

# A red-background blank cell (ANSI escape) shown in place of each non-ASCII
# character when the original text is echoed back.
HIGHLIGHT_CELL = "\033[41m \033[0m"


def read_pasted_text(read_line=input):
    """Collect pasted lines until two consecutive blank lines or end of input.

    Args:
        read_line: Zero-argument callable returning the next line without its
            newline and raising ``EOFError`` when input is exhausted.
            Defaults to the built-in ``input``.

    Returns:
        The collected lines joined with ``"\\n"``, or ``None`` when no
        non-blank line was entered.

    Raises:
        KeyboardInterrupt: Propagated from ``read_line`` so the caller can
            decide how to exit.
    """
    lines = []
    blank_run = 0
    seen_content = False
    try:
        while True:
            line = read_line()
            if line:
                blank_run = 0
                seen_content = True
                lines.append(line)
                continue
            blank_run += 1
            # Two blank lines only end the input once real content exists;
            # blank lines typed before any content must not terminate it.
            if blank_run >= 2 and seen_content:
                # The first blank of the terminator was already stored; drop
                # it so the result carries no spurious trailing newline.
                lines.pop()
                break
            # Keep one blank line per run to preserve paragraph breaks.
            if blank_run == 1:
                lines.append(line)
    except EOFError:
        # End of input: process whatever has been collected so far.
        pass
    if not seen_content:
        return None
    return '\n'.join(lines)


def highlight_non_ascii(text):
    """Return ``text`` with every non-ASCII character shown as a red cell."""
    return ''.join(HIGHLIGHT_CELL if ord(ch) > 127 else ch for ch in text)


def normalize_text(text, table):
    """Replace typographic characters, then strip remaining non-ASCII.

    Args:
        text: The text to clean.
        table: Mapping of source strings to their ASCII replacements.

    Returns:
        A ``(cleaned, removed_count)`` tuple, where ``removed_count`` is the
        number of non-ASCII characters that had no entry in ``table`` and
        were deleted.
    """
    normalized = text
    # Chained str.replace (rather than str.translate) keeps this correct even
    # if the table ever gains multi-character keys.
    for typographic, ascii_equivalent in table.items():
        normalized = normalized.replace(typographic, ascii_equivalent)
    # subn strips and counts in a single pass.
    return NON_ASCII_PATTERN.subn('', normalized)


def main():
    """Run the interactive paste / normalize / copy-to-clipboard loop."""
    while True:
        # get text from user input
        print("\nPaste your text below (can be multiple lines):")
        print("Press Enter twice on empty lines when done, or Ctrl+D (Unix/Mac) / Ctrl+Z (Windows) to finish:")
        print("-" * 60)
        try:
            text = read_pasted_text()
        except KeyboardInterrupt:
            print("\n\nInterrupted. Exiting...")
            break
        if text is None:
            # Nothing was entered: leave the loop instead of processing "".
            break

        print("\n" + "=" * 60)
        print("PROCESSING RESULTS")
        print("=" * 60 + "\n")

        cleaned, removed_count = normalize_text(text, replacements)

        # display original with highlighting
        print("Original text with non-ASCII highlighted:")
        print(highlight_non_ascii(text))
        # display cleaned text
        print("\nCleaned text (ASCII only):")
        print(cleaned)
        print(f"\n{removed_count} non-ASCII characters removed")

        # copy cleaned text to clipboard; a missing clipboard backend (e.g. a
        # headless session) must not discard the results already printed
        try:
            pyperclip.copy(cleaned)
        except pyperclip.PyperclipException as exc:
            print(f"Could not copy to clipboard: {exc}")
        else:
            print("✓ Cleaned text copied to clipboard")

        # ask if user wants to continue
        print("\n" + "-" * 60)
        try:
            answer = input("Do you want to process another text? (yes/no): ")
        except (EOFError, KeyboardInterrupt):
            # stdin is exhausted (piped input / Ctrl+D) or the user pressed
            # Ctrl+C: treat it as "no" rather than dying with a traceback.
            print()
            break
        if answer.strip().lower() not in ('yes', 'y'):
            break

    print("\nThank you for using the text normalizer!")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment