Skip to content

Instantly share code, notes, and snippets.

@basperheim
Last active November 4, 2025 13:44
Show Gist options
  • Select an option

  • Save basperheim/ec49a4e8d1af13eb9eda4590536a70f3 to your computer and use it in GitHub Desktop.

Select an option

Save basperheim/ec49a4e8d1af13eb9eda4590536a70f3 to your computer and use it in GitHub Desktop.
Removes special LLM-injected characters from markdown files in order to "normalize" the markdown content.
#!/usr/bin/env python3
import argparse
import os
import sys
import re
# Directory names the crawler skips entirely (never descended into).
EXCLUDE_DIRS = {'node_modules', 'target', 'bin', '__pycache__', 'build', 'dist'}

# Spans that must be copied through unchanged by the dash-fixing pass:
MARKDOWN_LINK = re.compile(r'(!?\[.*?\])\((.*?)\)') # matches [text](url) and ![alt](url)
RAW_URL = re.compile(r'https?://[^\s)`]+') # matches http/https raw URLs

CODE_FENCE = re.compile(r'^```') # Match start/end of fenced code blocks
INLINE_CODE = re.compile(r'`[^`]+`') # Match inline code spans
# Horizontal rule / front-matter delimiter: a line of three or more hyphens.
HR_LINE = re.compile(r'^\s*-{3,}\s*$')

# Replace em-dash variants and double hyphen outside code.
DASH_FIX = re.compile(r'(?<!`)(?<!-)(--|–)') # only outside backticks, not already part of inline `--`

# Replace broken HTML open comment tag (e.g., <!— instead of <!--).
BROKEN_HTML_COMMENT = re.compile(r'<![–—]') # en-dash or em-dash (Unicode U+2013/U+2014)

# Mapping of "fancy" chars -> plain ASCII, applied globally (even inside
# fenced code blocks).  NOTE: the last two entries are not single characters —
# they strip the `utm_source=chatgpt.com` tracking parameter from URLs.
REPLACEMENTS = {
    "‘": "'", "’": "'",
    "“": '"', "”": '"',
    "‚": ",",
    "…": "...",
    "\u00A0": " ",
    "•": "-",
    "×": "x", "÷": "/", "°": " degrees ",
    "™": "(TM)", "®": "(R)",
    "€": "EUR", "£": "GBP", "¥": "YEN",
    "?utm_source=chatgpt.com": "",
    "&utm_source=chatgpt.com": ""
}
def protect_markdown_links(line: str, transform: callable) -> str:
    """Apply `transform` to the segments of `line` outside links and raw URLs.

    Markdown links/images (``[text](url)``, ``![alt](url)``) and bare
    http(s) URLs are copied through unchanged; every other segment passes
    through `transform` exactly once.

    Bug fixed: the original ran two sequential passes.  The second
    (markdown-link) pass re-applied `transform` across the whole line,
    including the raw URLs the first pass had tried to protect (so e.g.
    ``--`` inside a bare URL was still mangled), and the first pass
    transformed link text before the link pass could protect it.
    This version computes all protected spans up front and transforms
    only the gaps between them.

    :param line: one line of Markdown text.
    :param transform: str -> str function applied to unprotected segments.
    :return: the transformed line.
    """
    # Protected spans: markdown links first, then raw URLs that do not
    # start inside an already-protected link (e.g. the URL part of a link).
    protected = [(m.start(), m.end()) for m in MARKDOWN_LINK.finditer(line)]
    for m in RAW_URL.finditer(line):
        if not any(start <= m.start() < end for start, end in protected):
            protected.append((m.start(), m.end()))
    protected.sort()

    out = []
    last_end = 0
    for start, end in protected:
        if start < last_end:
            # Span overlaps one already emitted; skip it.
            continue
        out.append(transform(line[last_end:start]))
        out.append(line[start:end])  # protected span copied verbatim
        last_end = end
    out.append(transform(line[last_end:]))
    return ''.join(out)
def is_markdown_table_line(line: str) -> bool:
    """Return True when `line` looks like a Markdown table row (starts with '|')."""
    text = line.strip()
    return text.startswith('|') and '|' in line
# CLI flags written with a single en/em dash instead of "--", e.g. "—help".
# Group 1 is the bad dash, group 2 the flag name.
CLI_FLAG_BAD = re.compile(r'([\u2013\u2014])([A-Za-z0-9][\w-]*)')


def normalize_cli_flags(line: str) -> str:
    """Fix em/en dashes used incorrectly in CLI flags (e.g. "—help" -> "--help").

    Bug fixed: the original referenced ``CLI_FLAG_BAD`` without ever
    defining it, so any call raised NameError.  The pattern is defined
    above, shaped to match the existing ``repl`` (which emits group 2).
    """
    def repl(m: re.Match) -> str:
        return '--' + m.group(2)
    return CLI_FLAG_BAD.sub(repl, line)
def _fix_comment_openers_around_inline_code(line: str) -> str:
    """Repair broken ``<!—`` comment openers in code and non-code spans alike.

    Walks inline-code spans explicitly (mirroring the non-table path's
    segmentation) and applies the same substitution to every segment.
    """
    parts = []
    last_end = 0
    for match in INLINE_CODE.finditer(line):
        parts.append(BROKEN_HTML_COMMENT.sub('<!--', line[last_end:match.start()]))
        parts.append(BROKEN_HTML_COMMENT.sub('<!--', match.group(0)))
        last_end = match.end()
    parts.append(BROKEN_HTML_COMMENT.sub('<!--', line[last_end:]))
    return ''.join(parts)


def normalize_lines(lines: list[str]) -> list[str]:
    """Normalize a list of Markdown lines and return the result.

    Rules, in order:
      * broken ``<!—`` comment openers are repaired everywhere;
      * fence lines (```` ``` ````) toggle code-block mode and pass through;
      * REPLACEMENTS are applied everywhere, including inside code blocks;
      * inside fenced code blocks nothing else is touched;
      * horizontal rules / front-matter delimiters pass through;
      * table rows skip dash replacement;
      * elsewhere, DASH_FIX runs on text outside inline code, links and
        raw URLs, while inline code only gets comment-opener repair.

    Cleanup vs. the original: the table branch re-applied the full
    REPLACEMENTS pass that had already run a few lines earlier (a
    provable no-op, since no replacement value contains a source key)
    and duplicated the inline-code walk inline; both are now gone /
    factored into `_fix_comment_openers_around_inline_code`.
    """
    in_code_block = False
    result = []
    for line in lines:
        stripped = line.strip()
        # Fix broken HTML comment openers globally (code included).
        line = BROKEN_HTML_COMMENT.sub('<!--', line)
        # Fence line: toggle code-block mode and keep the line as-is.
        if CODE_FENCE.match(stripped):
            in_code_block = not in_code_block
            result.append(line)
            continue
        # Global character replacements apply everywhere.
        for src, dst in REPLACEMENTS.items():
            line = line.replace(src, dst)
        if in_code_block:
            result.append(line)
            continue
        # Leave horizontal rules / front-matter delimiters untouched.
        if HR_LINE.match(line):
            result.append(line)
            continue
        # Table rows: skip dash replacement; only repair comment openers.
        if is_markdown_table_line(line):
            result.append(_fix_comment_openers_around_inline_code(line))
            continue
        # Everything else: dashes fixed outside inline code (links and raw
        # URLs protected); inline code only gets comment-opener repair.
        parts = []
        last_end = 0
        for match in INLINE_CODE.finditer(line):
            non_code = line[last_end:match.start()]
            parts.append(protect_markdown_links(non_code, lambda s: DASH_FIX.sub('—', s)))
            parts.append(BROKEN_HTML_COMMENT.sub('<!--', match.group(0)))
            last_end = match.end()
        parts.append(protect_markdown_links(line[last_end:], lambda s: DASH_FIX.sub('—', s)))
        result.append(''.join(parts))
    return result
def process_file(path: str) -> None:
    """Normalize one Markdown file in place, printing its path if it changed.

    Empty files and files that are not valid UTF-8 are left untouched.
    """
    if os.path.getsize(path) == 0:
        return
    try:
        with open(path, 'r', encoding='utf-8') as fh:
            original = fh.readlines()
    except UnicodeDecodeError:
        # Not UTF-8 text (binary or odd encoding) — skip it.
        return
    normalized = normalize_lines(original)
    if normalized == original:
        return
    with open(path, 'w', encoding='utf-8') as fh:
        fh.writelines(normalized)
    print(f"Fixed: {path}")
def crawl_and_fix(root: str) -> None:
    """Walk `root` recursively and normalize every ``.md`` file found."""
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune excluded directories in place so os.walk never enters them.
        dirnames[:] = [name for name in dirnames if name not in EXCLUDE_DIRS]
        for filename in filenames:
            if not filename.lower().endswith('.md'):
                continue
            process_file(os.path.join(dirpath, filename))
def main():
    """CLI entry point: parse the target directory and start the crawl."""
    parser = argparse.ArgumentParser(
        description="Recursively normalize “smart” Unicode chars in Markdown → ASCII"
    )
    parser.add_argument(
        '-d', '--dir',
        default='.',
        help="Root directory to crawl (defaults to cwd)"
    )
    args = parser.parse_args()
    root = os.path.abspath(args.dir)
    if not os.path.isdir(root):
        # Bail out with a non-zero status on a bad path.
        print(f"Error: '{root}' is not a directory", file=sys.stderr)
        sys.exit(1)
    crawl_and_fix(root)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment