Last active
November 4, 2025 13:44
-
-
Save basperheim/ec49a4e8d1af13eb9eda4590536a70f3 to your computer and use it in GitHub Desktop.
Removes special LLM-injected characters from markdown files in order to "normalize" the markdown content.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| import argparse | |
| import os | |
| import sys | |
| import re | |
# Directory names the crawler never descends into.
EXCLUDE_DIRS = {
    'node_modules',
    'target',
    'bin',
    '__pycache__',
    'build',
    'dist',
}

# Markdown links `[text](url)` and images `![alt](url)`;
# group 1 is the bracketed label, group 2 the target.
MARKDOWN_LINK = re.compile(r'(!?\[.*?\])\((.*?)\)')

# Bare http/https URL, ending at whitespace, ')' or a backtick.
RAW_URL = re.compile(r'https?://[^\s)`]+')

# Three backticks at the start of the (already stripped) line: a code fence.
CODE_FENCE = re.compile(r'^```')

# Inline code span delimited by single backticks.
INLINE_CODE = re.compile(r'`[^`]+`')

# A line consisting only of three or more hyphens plus optional whitespace.
HR_LINE = re.compile(r'^\s*-{3,}\s*$')

# A double hyphen or an en dash that is not directly preceded by a backtick
# or by another hyphen.
DASH_FIX = re.compile(r'(?<!`)(?<!-)(--|–)')

# Broken HTML comment opener: "<!" followed by an en dash (U+2013) or an
# em dash (U+2014) instead of "--".
BROKEN_HTML_COMMENT = re.compile(r'<![–—]')

# Typographic characters (and ChatGPT tracking parameters) mapped to their
# plain-ASCII substitutes. Keys are written as escapes so the table can be
# audited without an editor that reveals look-alike or invisible characters.
# Entries are applied in insertion order.
REPLACEMENTS = {
    "\u2018": "'",          # left single quotation mark
    "\u2019": "'",          # right single quotation mark
    "\u201C": '"',          # left double quotation mark
    "\u201D": '"',          # right double quotation mark
    "\u201A": ",",          # single low-9 quotation mark
    "\u2026": "...",        # horizontal ellipsis
    "\u00A0": " ",          # no-break space
    "\u2022": "-",          # bullet
    "\u00D7": "x",          # multiplication sign
    "\u00F7": "/",          # division sign
    "\u00B0": " degrees ",  # degree sign
    "\u2122": "(TM)",       # trade mark sign
    "\u00AE": "(R)",        # registered sign
    "\u20AC": "EUR",        # euro sign
    "\u00A3": "GBP",        # pound sign
    "\u00A5": "YEN",        # yen sign
    "?utm_source=chatgpt.com": "",
    "&utm_source=chatgpt.com": "",
}
def protect_markdown_links(line: str, transform: callable, *, patterns: tuple = ()) -> str:
    """Apply `transform` to the parts of `line` that are not links or URLs.

    Every span matched by one of `patterns` is copied to the output
    unchanged; the text between (and around) those spans is passed through
    `transform` exactly once, left to right.

    Protection is computed in a single pass over the merged spans. A
    two-pass approach (URLs first, then Markdown links) is not equivalent:
    the second pass would run `transform` over raw URLs that are not inside
    a Markdown link, and would transform ordinary text twice.

    Args:
        line: Text to process.
        transform: Callable taking a `str` segment and returning its
            replacement.
        patterns: Compiled regexes whose matches must be left untouched.
            When empty, the module-level `MARKDOWN_LINK` and `RAW_URL`
            patterns are used.

    Returns:
        The line with `transform` applied outside the protected spans.
    """
    if not patterns:
        # Resolved at call time so the module-level defaults are only needed
        # when the caller does not supply its own patterns.
        patterns = (MARKDOWN_LINK, RAW_URL)

    # Sorting by (start, end) lets overlapping spans be merged in one sweep,
    # e.g. the URL inside `[text](https://...)` is covered by the link span.
    protected = sorted(
        found.span()
        for pattern in patterns
        for found in pattern.finditer(line)
    )

    pieces = []
    cursor = 0
    for start, end in protected:
        if end <= cursor:
            continue  # entirely inside a span that was already emitted
        if start >= cursor:
            pieces.append(transform(line[cursor:start]))
            pieces.append(line[start:end])
        else:
            # Partial overlap: emit only the not-yet-copied remainder, still
            # untouched because it belongs to a protected span.
            pieces.append(line[cursor:end])
        cursor = end
    pieces.append(transform(line[cursor:]))
    return ''.join(pieces)
def is_markdown_table_line(line: str) -> bool:
    """Return True when the first non-whitespace character of `line` is `|`."""
    # A line that starts with a pipe necessarily contains one, so a single
    # prefix check is sufficient.
    return line.lstrip().startswith('|')
# An en dash (U+2013) or em dash (U+2014) standing where a CLI flag's leading
# "--" should be: at the start of the text or after whitespace, and directly
# followed by a flag name. Group 1 is the dash, group 2 the flag name.
CLI_FLAG_BAD = re.compile(r'(?<!\S)([\u2013\u2014])([A-Za-z][A-Za-z0-9-]*)')


def normalize_cli_flags(line: str) -> str:
    """Fix em/en dashes used incorrectly in CLI flags.

    A dash that replaced the `--` prefix of a flag is turned back into `--`.
    Dashes inside a word or surrounded by spaces are left alone.

    Args:
        line: Text that may contain mangled flags.

    Returns:
        The text with each mangled flag prefix restored to `--`.
    """
    def repl(m: re.Match) -> str:
        # Drop the typographic dash (group 1) and keep the flag name.
        return '--' + m.group(2)

    return CLI_FLAG_BAD.sub(repl, line)
def _replace_dashes_outside_code(line: str) -> str:
    """Apply DASH_FIX to `line`, skipping inline code spans, links and URLs."""
    def to_em_dash(segment: str) -> str:
        return DASH_FIX.sub('\u2014', segment)

    pieces = []
    cursor = 0
    for code_span in INLINE_CODE.finditer(line):
        # Text before the code span: dashes are rewritten, links are protected.
        pieces.append(protect_markdown_links(line[cursor:code_span.start()], to_em_dash))
        # The code span itself is copied verbatim.
        pieces.append(code_span.group(0))
        cursor = code_span.end()
    pieces.append(protect_markdown_links(line[cursor:], to_em_dash))
    return ''.join(pieces)


def normalize_lines(lines: list[str]) -> list[str]:
    """Normalize a Markdown document given as a list of lines.

    Per line, in this order:

    1. Broken HTML comment openers (BROKEN_HTML_COMMENT) are repaired
       everywhere, including inside code.
    2. A code-fence line toggles the "inside fenced code block" state and is
       emitted without further changes.
    3. Every REPLACEMENTS entry is applied, inside and outside code blocks.
    4. Lines inside a fenced block, horizontal-rule lines and table lines are
       emitted as they are at this point. Steps 1 and 3 already covered the
       whole line, so these lines need no further per-segment pass.
    5. All other lines get DASH_FIX matches replaced by an em dash outside
       inline code spans, Markdown links and raw URLs.

    NOTE(review): the other steps map typographic characters to ASCII, while
    step 5 turns `--` and en dashes into an em dash - confirm this is the
    intended direction.

    Args:
        lines: The document's lines, as returned by `readlines()`.

    Returns:
        A new list with one normalized line per input line.
    """
    in_code_block = False
    result = []
    for line in lines:
        is_fence = CODE_FENCE.match(line.strip()) is not None

        line = BROKEN_HTML_COMMENT.sub('<!--', line)

        if is_fence:
            in_code_block = not in_code_block
            result.append(line)
            continue

        for src, dst in REPLACEMENTS.items():
            line = line.replace(src, dst)

        # Code blocks, horizontal rules / front-matter delimiters and table
        # rows never receive the dash rewrite.
        if in_code_block or HR_LINE.match(line) or is_markdown_table_line(line):
            result.append(line)
            continue

        result.append(_replace_dashes_outside_code(line))
    return result
def process_file(path: str) -> None:
    """Normalize one file in place and report it when it changed.

    Empty files and files that are not valid UTF-8 are skipped. The file is
    rewritten only when normalization altered at least one line, and
    "Fixed: <path>" is printed in that case.

    Args:
        path: Path of the file to normalize.
    """
    if os.path.getsize(path) == 0:
        return
    try:
        # newline='' disables newline translation, so each line keeps its
        # original terminator (LF, CRLF or CR) instead of being read as "\n".
        with open(path, 'r', encoding='utf-8', newline='') as f:
            lines = f.readlines()
    except UnicodeDecodeError:
        return  # skip non-text or non-utf8

    new_lines = normalize_lines(lines)
    if new_lines == lines:
        return

    # Write the terminators back untranslated so a CRLF file stays CRLF and
    # the rewrite does not touch every line of the file.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        f.writelines(new_lines)
    print(f"Fixed: {path}")
def crawl_and_fix(root: str) -> None:
    """Walk `root` and run `process_file` on every `.md` file found.

    Directories named in EXCLUDE_DIRS are skipped entirely. The `.md`
    extension is matched case-insensitively.

    Args:
        root: Directory at which the walk starts.
    """
    for current_dir, subdirs, files in os.walk(root):
        # Assign through the slice so the list os.walk handed out is edited
        # in place and the excluded directories are skipped.
        kept = [name for name in subdirs if name not in EXCLUDE_DIRS]
        subdirs[:] = kept

        markdown_names = (name for name in files if name.lower().endswith('.md'))
        for name in markdown_names:
            process_file(os.path.join(current_dir, name))
def main():
    """Parse the command line and normalize Markdown files under the root.

    Accepts `-d/--dir` (default: the current directory). When the resolved
    path is not a directory, an error is written to stderr and the process
    exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description="Recursively normalize “smart” Unicode chars in Markdown → ASCII"
    )
    cli.add_argument(
        '-d', '--dir',
        default='.',
        help="Root directory to crawl (defaults to cwd)"
    )
    target = os.path.abspath(cli.parse_args().dir)

    if os.path.isdir(target):
        crawl_and_fix(target)
        return

    print(f"Error: '{target}' is not a directory", file=sys.stderr)
    sys.exit(1)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment