Last active
November 6, 2025 06:13
-
-
Save sumitpore/8f2c49f455a114895ee4c864923a9a97 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import re | |
| import sys | |
| import pyperclip | |
# Character replacements: typographic and accented characters -> ASCII.
#
# Every non-ASCII character that is NOT listed here is deleted outright by the
# non-ASCII strip that runs after these replacements, so characters that carry
# meaning (spaces, hyphens, minus signs) need an explicit entry or words and
# numbers end up glued together.
replacements = {
    # whitespace (written as escapes because the characters are invisible)
    '\u00a0': ' ',  # no-break space
    '\u2002': ' ',  # en space
    '\u2003': ' ',  # em space
    '\u2009': ' ',  # thin space
    '\u202f': ' ',  # narrow no-break space
    # quotes and apostrophes
    '‘': "'",  # left single quotation mark
    '’': "'",  # right single quotation mark
    '“': '"',  # left double quotation mark
    '”': '"',  # right double quotation mark
    '\u201a': "'",  # single low-9 quotation mark
    '\u201e': '"',  # double low-9 quotation mark
    # dashes, hyphens and minus
    '—': '-',  # em dash
    '–': '-',  # en dash
    '\u2010': '-',  # hyphen
    '\u2011': '-',  # non-breaking hyphen
    '\u2212': '-',  # minus sign
    # ellipsis
    '…': '...',  # horizontal ellipsis
    # bullets and dots
    '•': '*',  # bullet
    '·': '*',  # middle dot
    # angle quotes
    '‹': '<',  # single left-pointing angle quotation mark
    '›': '>',  # single right-pointing angle quotation mark
    '«': '<<',  # left-pointing double angle quotation mark
    '»': '>>',  # right-pointing double angle quotation mark
    # currency
    '€': 'EUR',  # euro sign
    '£': 'GBP',  # pound sign
    '¥': 'YEN',  # yen sign
    '¢': 'cents',  # cent sign
    # symbols
    '©': '(c)',  # copyright
    '®': '(R)',  # registered trademark
    '™': '(TM)',  # trademark
    '°': 'deg',  # degree sign
    '±': '+/-',  # plus-minus sign
    '×': 'x',  # multiplication sign
    '÷': '/',  # division sign
    '§': 'section',  # section sign
    '¶': 'paragraph',  # pilcrow/paragraph sign
    # fractions
    '½': '1/2',  # fraction one half
    '¼': '1/4',  # fraction one quarter
    '¾': '3/4',  # fraction three quarters
    '⅓': '1/3',  # fraction one third
    '⅔': '2/3',  # fraction two thirds
    '⅛': '1/8',  # fraction one eighth
    '⅜': '3/8',  # fraction three eighths
    '⅝': '5/8',  # fraction five eighths
    '⅞': '7/8',  # fraction seven eighths
    # accented letters - lowercase
    'à': 'a', 'á': 'a', 'â': 'a', 'ã': 'a', 'ä': 'a', 'å': 'a',
    'è': 'e', 'é': 'e', 'ê': 'e', 'ë': 'e',
    'ì': 'i', 'í': 'i', 'î': 'i', 'ï': 'i',
    'ò': 'o', 'ó': 'o', 'ô': 'o', 'õ': 'o', 'ö': 'o', 'ø': 'o',
    'ù': 'u', 'ú': 'u', 'û': 'u', 'ü': 'u',
    'ý': 'y', 'ÿ': 'y',
    'ñ': 'n', 'ç': 'c',
    'æ': 'ae', 'œ': 'oe', 'ß': 'ss',
    # accented letters - uppercase
    'À': 'A', 'Á': 'A', 'Â': 'A', 'Ã': 'A', 'Ä': 'A', 'Å': 'A',
    'È': 'E', 'É': 'E', 'Ê': 'E', 'Ë': 'E',
    'Ì': 'I', 'Í': 'I', 'Î': 'I', 'Ï': 'I',
    'Ò': 'O', 'Ó': 'O', 'Ô': 'O', 'Õ': 'O', 'Ö': 'O', 'Ø': 'O',
    'Ù': 'U', 'Ú': 'U', 'Û': 'U', 'Ü': 'U',
    'Ý': 'Y', 'Ÿ': 'Y',
    'Ñ': 'N', 'Ç': 'C',
    'Æ': 'AE', 'Œ': 'OE',
}
# Matches every character outside the 7-bit ASCII range; compiled once.
NON_ASCII_RE = re.compile(r'[^\x00-\x7F]')

# ANSI escape sequences used for the terminal preview.
RED_BACKGROUND = '\033[41m'
RESET = '\033[0m'
# A red-background blank stands in for each non-ASCII character, so even
# invisible characters (zero-width, no-break space, ...) show up.
NON_ASCII_MARKER = RED_BACKGROUND + ' ' + RESET


def read_block(read_line=input):
    """Collect one pasted block of text and return it as a list of lines.

    Input ends at two consecutive empty lines that follow some real content,
    or at end-of-file. A single empty line between content lines is kept.
    Empty lines that are not followed by content are dropped, so the
    terminator never leaks into the result as a trailing blank line, and
    blank lines typed before any content cannot end the input early.

    Args:
        read_line: Zero-argument callable returning the next line without
            its newline and raising EOFError at end of input. Defaults to
            the built-in ``input``.

    Returns:
        The collected lines; an empty list when no content was entered.

    Raises:
        KeyboardInterrupt: Propagated untouched so the caller decides how
            to exit.
    """
    lines = []
    pending_blank = False  # a blank line was seen but not yet stored
    has_content = False    # at least one non-empty line was stored
    try:
        while True:
            line = read_line()
            if line == "":
                if pending_blank and has_content:
                    break  # second consecutive blank after content: done
                pending_blank = True
                continue
            if pending_blank:
                # The blank turned out to be a separator, not a terminator.
                lines.append("")
                pending_blank = False
            lines.append(line)
            has_content = True
    except EOFError:
        # End of input: process whatever was collected so far.
        pass
    return lines


def highlight_non_ascii(text):
    """Return *text* with each non-ASCII character replaced by a red marker.

    The marker is a space on a red background (ANSI escape codes), intended
    for display in a terminal only.
    """
    return "".join(
        char if ord(char) <= 127 else NON_ASCII_MARKER for char in text
    )


def to_ascii(text, mapping=None):
    """Convert *text* to pure ASCII.

    Characters found in *mapping* are replaced by their ASCII equivalents;
    any non-ASCII character left after that is deleted.

    Args:
        text: The text to clean.
        mapping: Dict of non-ASCII string -> ASCII replacement. Defaults to
            the module-level ``replacements`` table (looked up at call time).

    Returns:
        A ``(cleaned_text, removed_count)`` tuple, where ``removed_count`` is
        the number of characters that had no mapping and were deleted.
    """
    if mapping is None:
        mapping = replacements
    normalized = text
    for typographic, ascii_equivalent in mapping.items():
        normalized = normalized.replace(typographic, ascii_equivalent)
    # subn does the deletion and the counting in a single pass.
    return NON_ASCII_RE.subn('', normalized)


def main():
    """Run the interactive loop: read text, show results, copy to clipboard."""
    while True:
        # get text from user input
        print("\nPaste your text below (can be multiple lines):")
        print("Press Enter twice on empty lines when done, or Ctrl+D (Unix/Mac) / Ctrl+Z (Windows) to finish:")
        print("-" * 60)

        try:
            lines = read_block()
        except KeyboardInterrupt:
            print("\n\nInterrupted. Exiting...")
            break
        if not lines:
            break
        text = '\n'.join(lines)

        print("\n" + "=" * 60)
        print("PROCESSING RESULTS")
        print("=" * 60 + "\n")

        highlighted = highlight_non_ascii(text)
        cleaned, removed_count = to_ascii(text)

        # display original with highlighting
        print("Original text with non-ASCII highlighted:")
        print(highlighted)

        # display cleaned text
        print("\nCleaned text (ASCII only):")
        print(cleaned)
        print(f"\n{removed_count} non-ASCII characters removed")

        # copy cleaned text to clipboard; a missing clipboard backend (e.g. a
        # headless session) must not discard the result already printed above
        try:
            pyperclip.copy(cleaned)
        except pyperclip.PyperclipException as exc:
            print(f"Could not copy to clipboard: {exc}")
        else:
            print("✓ Cleaned text copied to clipboard")

        # ask if user wants to continue
        print("\n" + "-" * 60)
        try:
            continue_choice = input("Do you want to process another text? (yes/no): ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            # stdin is exhausted (piped input) or the user pressed Ctrl+C:
            # treat it as "no" instead of crashing with a traceback.
            break
        if continue_choice not in ('yes', 'y'):
            break

    print("\nThank you for using the text normalizer!")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment