basperheim/replace_special_char_markdown.py

## replace_special_char_markdown.py
#!/usr/bin/env python3
import argparse
import os
import sys
import re

# Directory names the crawler never descends into.
EXCLUDE_DIRS = {'node_modules', 'target', 'bin', '__pycache__', 'build', 'dist'}

# [text](url) and ![alt](url); group 1 is the bracketed label, group 2 the target.
MARKDOWN_LINK = re.compile(r'(!?\[.*?\])\((.*?)\)')
# Bare http/https URL, ending at whitespace, ')' or a backtick.
RAW_URL = re.compile(r'https?://[^\s)`]+')

# Three backticks at the start of a (stripped) line open or close a fenced code block.
CODE_FENCE = re.compile(r'^```')
# Inline code span delimited by single backticks.
INLINE_CODE = re.compile(r'`[^`]+`')
# A line made only of three or more hyphens (front-matter delimiter or <hr>).
HR_LINE = re.compile(r'^\s*-{3,}\s*$')

# Double hyphen or en dash that should become an em dash.
# Deliberately NOT matched:
#   - anything directly after a backtick or another hyphen,
#   - the "--" of an HTML comment opener "<!--" or closer "-->"; converting
#     those would recreate the broken tags BROKEN_HTML_COMMENT repairs,
#   - the first two hyphens of a longer run such as "---", which would
#     otherwise be left as a lopsided em dash plus hyphen.
DASH_FIX = re.compile(r'(?<!`)(?<!-)((?<!<!)--(?![->])|–)')

# "<!" followed by an en dash (U+2013) or em dash (U+2014): a mangled "<!--".
BROKEN_HTML_COMMENT = re.compile(r'<![–—]')

# mapping of “fancy” chars → plain ASCII (applied globally); the last two
# entries strip the tracking parameter appended to copied links.
REPLACEMENTS = {
    "‘": "'", "’": "'",
    "“": '"', "”": '"',
    "‚": ",",
    "…": "...",
    "\u00A0": " ",
    "•": "-",
    "×": "x", "÷": "/", "°": " degrees ",
    "™": "(TM)", "®": "(R)",
    "€": "EUR", "£": "GBP", "¥": "YEN",
    "?utm_source=chatgpt.com": "",
    "&utm_source=chatgpt.com": ""
}

# One alternation covering both protected shapes: a Markdown link or image
# ("[text](target)" / "![alt](target)") and a bare http/https URL.  The link
# alternative comes first so a URL inside a link is consumed with its link.
_PROTECTED_SPAN = re.compile(r'!?\[.*?\]\(.*?\)|https?://[^\s)`]+')

def protect_markdown_links(line: str, transform: callable) -> str:
    """Apply `transform` to every part of `line` that is not a link.

    Markdown links/images and bare http(s) URLs are copied to the output
    untouched; each stretch of text between them is passed through
    `transform` exactly once.

    Args:
        line: Text to process (a whole line or a fragment of one).
        transform: Function taking a string and returning its replacement.

    Returns:
        The rebuilt text with protected spans unchanged.
    """
    # A single scan is required: running one pass per pattern lets the second
    # pass rewrite what the first one protected, and transforms gaps twice.
    pieces = []
    cursor = 0
    for span in _PROTECTED_SPAN.finditer(line):
        pieces.append(transform(line[cursor:span.start()]))
        pieces.append(span.group(0))  # protected span stays as written
        cursor = span.end()
    pieces.append(transform(line[cursor:]))
    return ''.join(pieces)

def is_markdown_table_line(line: str) -> bool:
    """Return True when the first non-whitespace character of `line` is a pipe."""
    # A line that begins with '|' after stripping necessarily contains '|',
    # so one prefix check expresses the whole table-row test.
    return line.lstrip().startswith('|')

# A CLI flag whose leading "--" was typeset as a single en/em dash, e.g.
# "—verbose".  Group 1 is the stray dash, group 2 the flag name.  The dash
# must start a token (no non-space character before it) and be followed by a
# letter, so dashes used as punctuation ("word — word", "word—word") are left
# alone.  NOTE(review): a sentence that opens with a dash glued to a word
# ("—Yes") also matches; confirm that is acceptable for the target documents.
CLI_FLAG_BAD = re.compile(r'(?<!\S)([–—])([A-Za-z][\w-]*)')

def normalize_cli_flags(line: str) -> str:
    """Fix em/en dashes used incorrectly in CLI flags.

    Args:
        line: Text that may contain flags such as "—verbose" or "–dry-run".

    Returns:
        `line` with each such flag rewritten to start with "--".
    """
    return CLI_FLAG_BAD.sub(lambda m: '--' + m.group(2), line)

def _dashes_to_em_outside_code(line: str) -> str:
    """Run the dash conversion on `line` everywhere except inside inline code spans."""
    def convert(text: str) -> str:
        # Links and bare URLs are shielded by protect_markdown_links.
        return protect_markdown_links(text, lambda s: DASH_FIX.sub('—', s))

    pieces = []
    cursor = 0
    for span in INLINE_CODE.finditer(line):
        pieces.append(convert(line[cursor:span.start()]))
        pieces.append(span.group(0))  # inline code is copied verbatim
        cursor = span.end()
    pieces.append(convert(line[cursor:]))
    return ''.join(pieces)

def normalize_lines(lines: list[str]) -> list[str]:
    """Return a normalized copy of `lines`; the input list is not modified.

    Per line, in this order:
      1. A mangled HTML comment opener ("<!" + en/em dash) is restored to
         "<!--". This runs on every line, code included.
      2. A line starting with three backticks toggles fenced-code mode and
         is emitted with no further changes.
      3. Every REPLACEMENTS entry is applied (also inside fenced code).
      4. Lines inside a fence, hyphen-only lines (front matter / <hr>) and
         table rows are emitted as they are at this point.
      5. Any other line gets its dashes converted outside inline code spans
         and outside links.

    Args:
        lines: Lines of one Markdown document, typically with line endings.

    Returns:
        A new list with one output line per input line.
    """
    in_code_block = False
    result = []

    for line in lines:
        stripped = line.strip()

        # Step 1 covers the whole line, so the later branches do not need to
        # repeat this substitution on sub-spans.
        line = BROKEN_HTML_COMMENT.sub('<!--', line)

        if CODE_FENCE.match(stripped):
            in_code_block = not in_code_block
            result.append(line)
            continue

        for src, dst in REPLACEMENTS.items():
            line = line.replace(src, dst)

        # Dash conversion is skipped for code, rules and table rows (where
        # "--" is structural, e.g. the "|---|---|" separator row).
        if in_code_block or HR_LINE.match(line) or is_markdown_table_line(line):
            result.append(line)
            continue

        result.append(_dashes_to_em_outside_code(line))

    return result

def process_file(path: str) -> None:
    """Normalize one file in place, rewriting it only when its content changes.

    Files that are not valid UTF-8 are skipped silently. Files that cannot be
    read or written are reported on stderr and skipped, so one bad entry does
    not abort a whole crawl. Prints "Fixed: <path>" for each rewritten file.

    Args:
        path: Path of the file to process.
    """
    try:
        # newline='' disables newline translation, so every line keeps its own
        # terminator and a CRLF file is not silently converted when rewritten.
        with open(path, 'r', encoding='utf-8', newline='') as f:
            lines = f.readlines()
    except UnicodeDecodeError:
        return  # skip non-text or non-utf8
    except OSError as err:
        print(f"Skipped: '{path}' could not be read ({err})", file=sys.stderr)
        return

    new_lines = normalize_lines(lines)
    if new_lines == lines:
        return  # nothing to fix; this also covers an empty file

    try:
        with open(path, 'w', encoding='utf-8', newline='') as f:
            f.writelines(new_lines)
    except OSError as err:
        print(f"Skipped: '{path}' could not be written ({err})", file=sys.stderr)
        return
    print(f"Fixed: {path}")

def crawl_and_fix(root: str) -> None:
    """Walk the tree under `root` and process every file whose name ends in .md.

    The extension test is case-insensitive. Directories named in EXCLUDE_DIRS
    are not entered.
    """
    for current_dir, subdirs, files in os.walk(root):
        # os.walk only honours pruning done on this very list object,
        # hence the slice assignment instead of a rebind.
        subdirs[:] = [name for name in subdirs if name not in EXCLUDE_DIRS]

        markdown_names = (name for name in files if name.lower().endswith('.md'))
        for name in markdown_names:
            process_file(os.path.join(current_dir, name))
def main():
    """Command-line entry point: read -d/--dir and normalize the tree below it.

    Exits with status 1 and a message on stderr when the given path is not a
    directory.
    """
    cli = argparse.ArgumentParser(
        description="Recursively normalize “smart” Unicode chars in Markdown → ASCII"
    )
    cli.add_argument('-d', '--dir', default='.', help="Root directory to crawl (defaults to cwd)")
    root = os.path.abspath(cli.parse_args().dir)

    if os.path.isdir(root):
        crawl_and_fix(root)
        return

    print(f"Error: '{root}' is not a directory", file=sys.stderr)
    sys.exit(1)

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
	#!/usr/bin/env python3
	import argparse
	import os
	import sys
	import re

	# Directory names the crawler never descends into.
	EXCLUDE_DIRS = {'node_modules', 'target', 'bin', '__pycache__', 'build', 'dist'}

	# [text](url) and ![alt](url); group 1 is the bracketed label, group 2 the target.
	MARKDOWN_LINK = re.compile(r'(!?\[.*?\])\((.*?)\)')
	# Bare http/https URL, ending at whitespace, ')' or a backtick.
	RAW_URL = re.compile(r'https?://[^\s)`]+')

	# Three backticks at the start of a (stripped) line open or close a fenced code block.
	CODE_FENCE = re.compile(r'^```')
	# Inline code span delimited by single backticks.
	INLINE_CODE = re.compile(r'`[^`]+`')
	# A line made only of three or more hyphens (front-matter delimiter or <hr>).
	HR_LINE = re.compile(r'^\s*-{3,}\s*$')

	# Double hyphen or en dash that should become an em dash.
	# Deliberately NOT matched:
	#   - anything directly after a backtick or another hyphen,
	#   - the "--" of an HTML comment opener "<!--" or closer "-->"; converting
	#     those would recreate the broken tags BROKEN_HTML_COMMENT repairs,
	#   - the first two hyphens of a longer run such as "---".
	DASH_FIX = re.compile(r'(?<!`)(?<!-)((?<!<!)--(?![->])|–)')

	# "<!" followed by an en dash (U+2013) or em dash (U+2014): a mangled "<!--".
	BROKEN_HTML_COMMENT = re.compile(r'<![–—]')

	# mapping of “fancy” chars → plain ASCII (applied globally); the last two
	# entries strip the tracking parameter appended to copied links.
	REPLACEMENTS = {
	    "‘": "'", "’": "'",
	    "“": '"', "”": '"',
	    "‚": ",",
	    "…": "...",
	    "\u00A0": " ",
	    "•": "-",
	    "×": "x", "÷": "/", "°": " degrees ",
	    "™": "(TM)", "®": "(R)",
	    "€": "EUR", "£": "GBP", "¥": "YEN",
	    "?utm_source=chatgpt.com": "",
	    "&utm_source=chatgpt.com": ""
	}

	# One alternation covering both protected shapes: a Markdown link or image
	# ("[text](target)" / "![alt](target)") and a bare http/https URL.  The link
	# alternative comes first so a URL inside a link is consumed with its link.
	_PROTECTED_SPAN = re.compile(r'!?\[.*?\]\(.*?\)|https?://[^\s)`]+')

	def protect_markdown_links(line: str, transform: callable) -> str:
	    """Apply `transform` to every part of `line` that is not a link.

	    Markdown links/images and bare http(s) URLs are copied to the output
	    untouched; each stretch of text between them is passed through
	    `transform` exactly once.

	    Args:
	        line: Text to process (a whole line or a fragment of one).
	        transform: Function taking a string and returning its replacement.

	    Returns:
	        The rebuilt text with protected spans unchanged.
	    """
	    # A single scan is required: one pass per pattern would let the second
	    # pass rewrite what the first protected, and transform gaps twice.
	    pieces = []
	    cursor = 0
	    for span in _PROTECTED_SPAN.finditer(line):
	        pieces.append(transform(line[cursor:span.start()]))
	        pieces.append(span.group(0))  # protected span stays as written
	        cursor = span.end()
	    pieces.append(transform(line[cursor:]))
	    return ''.join(pieces)

	def is_markdown_table_line(line: str) -> bool:
	    """Return True when the first non-whitespace character of `line` is a pipe."""
	    # The pipe is a plain character in a Python string; a backslash before
	    # it would make the prefix two characters long and never match a row.
	    return line.lstrip().startswith('|')

	# A CLI flag whose leading "--" was typeset as a single en/em dash, e.g.
	# "—verbose".  Group 1 is the stray dash, group 2 the flag name.  The dash
	# must start a token and be followed by a letter, so dashes used as
	# punctuation ("word — word", "word—word") are left alone.
	# NOTE(review): a sentence opening with a dash glued to a word ("—Yes")
	# also matches; confirm that is acceptable for the target documents.
	CLI_FLAG_BAD = re.compile(r'(?<!\S)([–—])([A-Za-z][\w-]*)')

	def normalize_cli_flags(line: str) -> str:
	    """Fix em/en dashes used incorrectly in CLI flags.

	    Args:
	        line: Text that may contain flags such as "—verbose" or "–dry-run".

	    Returns:
	        `line` with each such flag rewritten to start with "--".
	    """
	    return CLI_FLAG_BAD.sub(lambda m: '--' + m.group(2), line)

	def _dashes_to_em_outside_code(line: str) -> str:
	    """Run the dash conversion on `line` everywhere except inside inline code spans."""
	    def convert(text: str) -> str:
	        # Links and bare URLs are shielded by protect_markdown_links.
	        return protect_markdown_links(text, lambda s: DASH_FIX.sub('—', s))

	    pieces = []
	    cursor = 0
	    for span in INLINE_CODE.finditer(line):
	        pieces.append(convert(line[cursor:span.start()]))
	        pieces.append(span.group(0))  # inline code is copied verbatim
	        cursor = span.end()
	    pieces.append(convert(line[cursor:]))
	    return ''.join(pieces)

	def normalize_lines(lines: list[str]) -> list[str]:
	    """Return a normalized copy of `lines`; the input list is not modified.

	    Per line, in this order:
	      1. A mangled HTML comment opener ("<!" + en/em dash) is restored to
	         "<!--". This runs on every line, code included.
	      2. A line starting with three backticks toggles fenced-code mode and
	         is emitted with no further changes.
	      3. Every REPLACEMENTS entry is applied (also inside fenced code).
	      4. Lines inside a fence, hyphen-only lines (front matter / <hr>) and
	         table rows are emitted as they are at this point.
	      5. Any other line gets its dashes converted outside inline code spans
	         and outside links.

	    Args:
	        lines: Lines of one Markdown document, typically with line endings.

	    Returns:
	        A new list with one output line per input line.
	    """
	    in_code_block = False
	    result = []

	    for line in lines:
	        stripped = line.strip()

	        # Step 1 covers the whole line, so later branches need not repeat it.
	        line = BROKEN_HTML_COMMENT.sub('<!--', line)

	        if CODE_FENCE.match(stripped):
	            in_code_block = not in_code_block
	            result.append(line)
	            continue

	        for src, dst in REPLACEMENTS.items():
	            line = line.replace(src, dst)

	        # Dash conversion is skipped for code, rules and table rows (where
	        # "--" is structural, e.g. the "|---|---|" separator row).
	        if in_code_block or HR_LINE.match(line) or is_markdown_table_line(line):
	            result.append(line)
	            continue

	        result.append(_dashes_to_em_outside_code(line))

	    return result

	def process_file(path: str) -> None:
	    """Normalize one file in place, rewriting it only when its content changes.

	    Files that are not valid UTF-8 are skipped silently. Files that cannot be
	    read or written are reported on stderr and skipped, so one bad entry does
	    not abort a whole crawl. Prints "Fixed: <path>" for each rewritten file.

	    Args:
	        path: Path of the file to process.
	    """
	    try:
	        # newline='' disables newline translation, so every line keeps its
	        # own terminator and a CRLF file is not converted when rewritten.
	        with open(path, 'r', encoding='utf-8', newline='') as f:
	            lines = f.readlines()
	    except UnicodeDecodeError:
	        return  # skip non-text or non-utf8
	    except OSError as err:
	        print(f"Skipped: '{path}' could not be read ({err})", file=sys.stderr)
	        return

	    new_lines = normalize_lines(lines)
	    if new_lines == lines:
	        return  # nothing to fix; this also covers an empty file

	    try:
	        with open(path, 'w', encoding='utf-8', newline='') as f:
	            f.writelines(new_lines)
	    except OSError as err:
	        print(f"Skipped: '{path}' could not be written ({err})", file=sys.stderr)
	        return
	    print(f"Fixed: {path}")

	def crawl_and_fix(root: str) -> None:
	    """Walk the tree under `root` and process every file whose name ends in .md.

	    The extension test is case-insensitive. Directories named in
	    EXCLUDE_DIRS are not entered.
	    """
	    for dirpath, dirnames, filenames in os.walk(root):
	        # os.walk only honours pruning done on this very list object,
	        # hence the slice assignment instead of a rebind.
	        dirnames[:] = [d for d in dirnames if d not in EXCLUDE_DIRS]

	        for fn in filenames:
	            if fn.lower().endswith('.md'):
	                process_file(os.path.join(dirpath, fn))

	def main():
	    """Command-line entry point: read -d/--dir and normalize the tree below it.

	    Exits with status 1 and a message on stderr when the given path is not
	    a directory.
	    """
	    parser = argparse.ArgumentParser(
	        description="Recursively normalize “smart” Unicode chars in Markdown → ASCII"
	    )
	    parser.add_argument(
	        '-d', '--dir',
	        default='.',
	        help="Root directory to crawl (defaults to cwd)"
	    )
	    args = parser.parse_args()

	    root = os.path.abspath(args.dir)
	    if not os.path.isdir(root):
	        print(f"Error: '{root}' is not a directory", file=sys.stderr)
	        sys.exit(1)

	    crawl_and_fix(root)

	# Run the CLI only when this file is executed as a script, not when imported.
	if __name__ == '__main__':
	    main()
No results found