jhkchan/wrong2kr.py

## wrong2kr.py
#!/usr/bin/env python3
"""
Encoding analysis: Why the "Chinese" on the sign is wrong

The sign stored the text as KOREAN (EUC-KR encoding). The display system
mistakenly interpreted those same bytes as TRADITIONAL CHINESE (Big5/CP950).
So each 2-byte Korean syllable (e.g. 물 "water") became one Chinese character
(e.g. 僭 or 㤮 depending on font). The "translation" is not a translation
but a double encoding misinterpretation:

  Korean string --[encode EUC-KR]--> bytes --[decode as CP950]--> "wrong Chinese"

Reverse (to recover Korean):

  "wrong Chinese" --[encode CP950]--> same bytes --[decode EUC-KR]--> Korean

The exact glyph (僭 vs 㤮) depends on the font: same byte value, different
font table (standard Big5 vs CJK Extension A / custom).

Usage:
  python wrong2kr.py              # run demonstrations
  python wrong2kr.py kr2wrong "한글"  # Korean -> wrong Chinese (standard Big5 glyphs)
  python wrong2kr.py kr2wrong "한글" --sign  # Korean -> wrong Chinese (sign glyphs)
  python wrong2kr.py wrong2kr "僭じ"  # wrong Chinese -> Korean
  python wrong2kr.py --all        # run all 4 lines: KR->wrong, wrong->KR
"""

import argparse
import sys
from typing import Dict, Tuple

# --- Data: Korean and the "wrong Chinese" from the sign (user provided) ---
# The three lists below are parallel: index i in each list refers to the same
# line of the sign. Functions in this file pair them up by index or zip().

# Korean text for each of the four sign lines.
KOREAN_LINES = [
    "물티슈나 기타 물티슈를 변기에 버리지 마십시오.",
    "변기 위에 서지 마십시오. 앉아서 사용하십시오.",
    "사용 전에 변기 뚜껑을 올려주십시오.",
    "여기는 공중화장실입니다. 깨끗하게 유지해 주십시오.",
]

# The "wrong Chinese" glyphs recorded for the same four lines.
WRONG_CHINESE_LINES = [
    "㤮㜓，㡨㥗㧛㣈㓇㮖㱟弊",
    "㧛㣈㭰㙜㱟㢢㒻，㬰㙥辱㦺㔹㦅㱟弊",
    "㔹㦅㖭㬰棙㭠㱟弊㦩",
    "㕬㕯嫞㞇㐜㬰㕅㟊㤤㤂",
]

# English wording for each line; main() prints it as the label of each line.
ENGLISH_LINES = [
    "Do NOT flush wet wipes or other items.",
    "Do NOT stand on the toilet. Please sit to use.",
    "Please lift the toilet seat before use.",
    "This is a public restroom. Please keep it clean.",
]


def build_standard_wrong_to_bytes() -> Dict[str, bytes]:
    """Map each mis-decoded character back to the byte pair that produced it.

    Every candidate pair (lead byte 0x81-0xFE, trail byte 0x40-0xFE except
    0x7F) is kept only if it decodes under EUC-KR and also decodes under CP950
    to exactly one character. When two pairs yield the same character, the
    pair scanned later replaces the earlier one.

    Returns:
        Dict from the CP950-decoded character to its original two bytes.
    """
    table: Dict[str, bytes] = {}
    trail_bytes = [t for t in range(0x40, 0xFF) if t != 0x7F]
    for lead in range(0x81, 0xFF):
        for trail in trail_bytes:
            pair = bytes((lead, trail))
            try:
                pair.decode("euc-kr")  # reject pairs that are not valid EUC-KR
                glyph = pair.decode("cp950")
            except (UnicodeDecodeError, UnicodeEncodeError):
                continue
            if len(glyph) == 1:
                table[glyph] = pair
    return table


def _segment_cp950_decode(raw: bytes) -> list:
    """Segment raw bytes as CP950: 2-byte (lead 0x81-0xFE) or 1-byte; return list of chars."""
    out = []
    i = 0
    while i < len(raw):
        if 0x81 <= raw[i] <= 0xFE and i + 1 < len(raw):
            try:
                out.append(raw[i : i + 2].decode("cp950"))
            except UnicodeDecodeError:
                out.append("\ufffd")
            i += 2
        else:
            try:
                out.append(raw[i : i + 1].decode("cp950"))
            except UnicodeDecodeError:
                out.append("\ufffd")
            i += 1
    return out


def build_standard_to_sign_and_sign_to_bytes(
    wrong_to_bytes: Dict[str, bytes],
) -> Tuple[Dict[str, str], Dict[str, bytes]]:
    """Pair standard CP950 characters with sign glyphs, line by line.

    For each (Korean line, sign line) pair, the Korean text is encoded as
    EUC-KR and segmented as CP950. Walking those characters in order, each one
    that encodes to two CP950 bytes is matched with the next unused character
    of the sign line whose code point is above U+0080. The pairing is purely
    positional, and the first pairing seen for a character or glyph is kept.

    Args:
        wrong_to_bytes: Standard character -> original byte pair; consulted
            first when resolving a sign glyph to bytes.

    Returns:
        (standard_to_sign, sign_to_bytes): standard character -> sign glyph,
        and sign glyph -> byte pair.
    """
    standard_to_sign: Dict[str, str] = {}
    sign_to_standard: Dict[str, str] = {}

    for korean_line, sign_line in zip(KOREAN_LINES, WRONG_CHINESE_LINES):
        standard_chars = _segment_cp950_decode(korean_line.encode("euc-kr"))
        glyphs = [g for g in sign_line if ord(g) > 0x80]
        cursor = 0
        for std_char in standard_chars:
            try:
                encoded_len = len(std_char.encode("cp950"))
            except (UnicodeEncodeError, UnicodeDecodeError):
                # e.g. the U+FFFD placeholder produced by a failed segment
                continue
            if encoded_len != 2 or cursor >= len(glyphs):
                continue
            glyph = glyphs[cursor]
            # setdefault keeps the first pairing, so repeats map consistently
            standard_to_sign.setdefault(std_char, glyph)
            sign_to_standard.setdefault(glyph, std_char)
            cursor += 1

    sign_to_bytes = {}
    for glyph, std_char in sign_to_standard.items():
        if std_char in wrong_to_bytes:
            sign_to_bytes[glyph] = wrong_to_bytes[std_char]
            continue
        # Fall back to the character's own CP950 encoding when it is not in
        # the supplied table.
        try:
            sign_to_bytes[glyph] = std_char.encode("cp950")
        except UnicodeEncodeError:
            pass
    return standard_to_sign, sign_to_bytes


def build_sign_font_to_bytes(
    wrong_to_bytes: Dict[str, bytes],
) -> Dict[str, bytes]:
    """Return only the sign-glyph -> bytes half of the aligned mapping.

    Convenience wrapper around build_standard_to_sign_and_sign_to_bytes() that
    discards the standard -> sign dictionary.
    """
    return build_standard_to_sign_and_sign_to_bytes(wrong_to_bytes)[1]


def korean_to_wrong_chinese(korean: str, use_standard: bool = True) -> str:
    """Return *korean* as it looks when its EUC-KR bytes are read as CP950.

    Args:
        korean: Text to mis-decode.
        use_standard: Accepted but never read; the result is the same for
            either value.

    Returns:
        The CP950 decoding of the EUC-KR bytes; byte sequences CP950 cannot
        decode are replaced rather than raising.

    Raises:
        UnicodeEncodeError: If *korean* contains a character EUC-KR cannot
            encode (the encode step is strict).
    """
    return str(korean.encode("euc-kr"), encoding="cp950", errors="replace")


def korean_to_wrong_chinese_sign(
    korean: str,
    standard_to_sign: Dict[str, str],
) -> str:
    """Convert Korean to the glyphs the sign shows (㤮, 㜓, …).

    If *korean* equals one of KOREAN_LINES, the recorded sign line at the same
    index is returned verbatim. Otherwise the text is mis-decoded (EUC-KR
    bytes read as CP950) and each resulting character is swapped for its sign
    glyph; characters with no entry in *standard_to_sign* pass through
    unchanged.

    Args:
        korean: Korean text to convert.
        standard_to_sign: Standard CP950 character -> sign glyph.
    """
    # Known line: hand back the recorded glyphs so the round-trip is exact.
    if korean in KOREAN_LINES:
        return WRONG_CHINESE_LINES[KOREAN_LINES.index(korean)]

    standard = korean.encode("euc-kr").decode("cp950", errors="replace")
    glyphs = [standard_to_sign.get(ch, ch) for ch in standard]
    return "".join(glyphs)


def wrong_chinese_to_korean(
    wrong_text: str,
    wrong_to_bytes: Dict[str, bytes],
    sign_to_bytes: Dict[str, bytes],
) -> str:
    """Recover Korean from "wrong Chinese" by rebuilding the EUC-KR bytes.

    If *wrong_text* equals one of WRONG_CHINESE_LINES, the Korean line at the
    same index is returned verbatim. Otherwise each character is turned back
    into bytes on a best-effort basis and the result is decoded as EUC-KR with
    undecodable bytes replaced.

    Args:
        wrong_text: Mis-decoded text to convert.
        wrong_to_bytes: Standard CP950 character -> byte pair.
        sign_to_bytes: Sign glyph -> byte pair; tried before *wrong_to_bytes*.
    """
    if wrong_text in WRONG_CHINESE_LINES:
        return KOREAN_LINES[WRONG_CHINESE_LINES.index(wrong_text)]

    chunks: list = []
    for ch in wrong_text:
        if ord(ch) <= 0x80:
            # Low code points are encoded directly, never looked up.
            chunks.append(ch.encode("cp950", errors="replace"))
            continue
        known = sign_to_bytes.get(ch) or wrong_to_bytes.get(ch)
        if known is not None:
            chunks.append(known)
            continue
        try:
            chunks.append(ch.encode("cp950"))
        except UnicodeEncodeError:
            # Two-byte placeholder for a glyph nothing can map.
            chunks.append(b"??")
    return b"".join(chunks).decode("euc-kr", errors="replace")


def wrong_chinese_to_korean_standard_only(wrong_text: str) -> str:
    """Recover Korean using nothing but the standard CP950 table.

    The text is encoded as CP950 and the bytes are decoded as EUC-KR; both
    steps replace what they cannot handle instead of raising. This suits text
    made of standard Big5 glyphs (e.g. 僭), not the sign's own glyphs (㤮).
    """
    cp950_bytes = wrong_text.encode("cp950", errors="replace")
    return str(cp950_bytes, encoding="euc-kr", errors="replace")


def main() -> None:
    """Print the full demonstration: explanation, conversions and round-trips.

    Sections printed, in order:
      1. Each Korean line converted to sign glyphs, compared with the recorded
         sign line (status is printed; on a mismatch the length of the common
         prefix is printed too).
      2. Each recorded sign line converted back to Korean.
      3. Round-trip Korean -> sign glyphs -> Korean.
      4. Round-trip sign glyphs -> Korean -> sign glyphs.
      5. The single syllable 물 converted both ways.
    """
    print("=" * 80)
    print(" WHY THE 'CHINESE' ON THE SIGN IS WRONG")
    print("=" * 80)
    print("""
The sign stored KOREAN (EUC-KR). The display interpreted those bytes as CHINESE (Big5/CP950).
So the same byte sequence (e.g. B9 B0 for '물') is shown as a Chinese character.
Standard PC font: B9 B0 -> 僭  |  Sign font: B9 B0 -> 㤮 (different glyph, same bytes)
""")

    wrong_to_bytes = build_standard_wrong_to_bytes()
    standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
        wrong_to_bytes
    )

    print("=" * 80)
    print(" 1. KOREAN -> WRONG 'CHINESE' (sign glyphs 㤮㜓… = same as sign output)")
    print("=" * 80)
    for i, (kr, en) in enumerate(zip(KOREAN_LINES, ENGLISH_LINES)):
        expected = WRONG_CHINESE_LINES[i]
        wrong_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        print(f"\n[Line {i+1}] {en[:50]}...")
        print(f"  Korean:      {kr}")
        print(f"  Wrong (sign): {wrong_sign}")
        print(f"  Expected:    {expected}")
        match = "MATCH" if wrong_sign == expected else " (truncated/different length)"
        # The status was previously computed but never shown.
        print(f"  Status:      {match}")
        if wrong_sign != expected:
            # Length of the common prefix: stop at the first differing
            # position instead of counting equal positions anywhere.
            overlap = 0
            for a, b in zip(wrong_sign, expected):
                if a != b:
                    break
                overlap += 1
            print(f"  Prefix match: first {overlap} chars")

    print("\n" + "=" * 80)
    print(" 2. WRONG 'CHINESE' (sign glyphs) -> KOREAN")
    print("=" * 80)
    for i, (wrong_line, en) in enumerate(zip(WRONG_CHINESE_LINES, ENGLISH_LINES)):
        recovered = wrong_chinese_to_korean(wrong_line, wrong_to_bytes, sign_to_bytes)
        print(f"\n[Line {i+1}] {en[:50]}...")
        print(f"  Wrong:    {wrong_line}")
        print(f"  Recovered: {recovered}")
        print(f"  Expected: {KOREAN_LINES[i]}")

    print("\n" + "=" * 80)
    print(" 3. ROUND-TRIP: Korean -> wrong (sign) -> Korean")
    print("=" * 80)
    for i, kr in enumerate(KOREAN_LINES):
        wrong_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        back = wrong_chinese_to_korean(wrong_sign, wrong_to_bytes, sign_to_bytes)
        ok = "OK" if back == kr else "DIFF"
        print(f"  {ok} L{i+1}: Korean -> sign wrong -> {back[:50]}...")

    print("\n" + "=" * 80)
    print(" 4. ROUND-TRIP: Wrong (sign) -> Korean -> wrong (sign)")
    print("=" * 80)
    for i, wrong_line in enumerate(WRONG_CHINESE_LINES):
        kr = wrong_chinese_to_korean(wrong_line, wrong_to_bytes, sign_to_bytes)
        back_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        ok = "OK" if back_sign == wrong_line else "DIFF"
        print(f"  {ok} L{i+1}: Wrong -> Korean -> {back_sign}")

    print("\n" + "=" * 80)
    print(" 5. SINGLE CHARACTER: 물 -> 㤮 -> 물")
    print("=" * 80)
    korean_word = "물"
    b = korean_word.encode("euc-kr")
    std_char = b.decode("cp950")
    # Fall back to the standard glyph when the sign mapping has no entry.
    sign_char = standard_to_sign.get(std_char, std_char)
    back_kr = wrong_chinese_to_korean(sign_char, wrong_to_bytes, sign_to_bytes)
    print(f"  Korean: '{korean_word}' -> wrong (sign): '{sign_char}' -> Korean: '{back_kr}'")


def run_all_conversions() -> None:
    """Print every known line converted in both directions, without commentary.

    First each entry of KOREAN_LINES is shown with its sign-glyph form, then
    each entry of WRONG_CHINESE_LINES is shown with its recovered Korean.
    Entries are numbered from 1.
    """
    byte_table = build_standard_wrong_to_bytes()
    standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
        byte_table
    )
    rule = "=" * 80

    print(rule)
    print(" ALL KOREAN -> WRONG (sign)")
    print(rule)
    for number, korean in enumerate(KOREAN_LINES, start=1):
        as_sign = korean_to_wrong_chinese_sign(korean, standard_to_sign)
        print(f"  {number}. {korean}")
        print(f"     -> {as_sign}")

    print("\n" + rule)
    print(" ALL WRONG -> KOREAN")
    print(rule)
    for number, sign_line in enumerate(WRONG_CHINESE_LINES, start=1):
        as_korean = wrong_chinese_to_korean(sign_line, byte_table, sign_to_bytes)
        print(f"  {number}. {sign_line}")
        print(f"     -> {as_korean}")


def cli() -> None:
    """Parse the command line and run the requested conversion or demo.

    Behaviour, in priority order:
      * ``--all``: run run_all_conversions() and stop.
      * no mode given: run the full demo in main() and stop.
      * ``kr2wrong TEXT``: print the wrong Chinese for TEXT, using sign glyphs
        when ``--sign`` is set and standard Big5 glyphs otherwise.
      * ``wrong2kr TEXT``: print the Korean recovered from TEXT.

    Arguments the parser does not recognise are not rejected; they are joined
    with spaces and used as, or appended to, the text. A mode with no text
    ends with a parser error.
    """
    parser = argparse.ArgumentParser(
        description="Convert between Korean and wrongly-encoded 'Chinese' (EUC-KR bytes decoded as CP950)."
    )
    parser.add_argument(
        "mode",
        nargs="?",
        choices=["kr2wrong", "wrong2kr"],
        help="kr2wrong: Korean -> wrong Chinese; wrong2kr: wrong Chinese -> Korean",
    )
    parser.add_argument(
        "text",
        nargs="?",
        help="Text to convert (optional; without it, run full demo)",
    )
    parser.add_argument(
        "--sign",
        action="store_true",
        help="Use sign glyphs (㤮㜓…) for kr2wrong; default is standard Big5 (僭じ…)",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Run through all 4 lines: KR->wrong then wrong->KR (no demo)",
    )
    args, leftover = parser.parse_known_args()

    # Fold any leftover tokens into the text instead of rejecting them.
    if leftover:
        tail = " ".join(leftover)
        if args.text is None:
            args.text = tail
        else:
            args.text = (args.text or "") + " " + tail

    if args.all:
        run_all_conversions()
        return
    if args.mode is None:
        main()
        return

    text = (args.text or "").strip()
    if not text:
        parser.error(f"Text required for mode {args.mode}")

    wrong_to_bytes = build_standard_wrong_to_bytes()
    standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
        wrong_to_bytes
    )

    if args.mode != "kr2wrong":
        converted = wrong_chinese_to_korean(text, wrong_to_bytes, sign_to_bytes)
    elif args.sign:
        converted = korean_to_wrong_chinese_sign(text, standard_to_sign)
    else:
        converted = korean_to_wrong_chinese(text)
    print(converted)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    cli()
	#!/usr/bin/env python3
	"""
	Encoding analysis: Why the "Chinese" on the sign is wrong

	The sign stored the text as KOREAN (EUC-KR encoding). The display system
	mistakenly interpreted those same bytes as TRADITIONAL CHINESE (Big5/CP950).
	So each 2-byte Korean syllable (e.g. 물 "water") became one Chinese character
	(e.g. 僭 or 㤮 depending on font). The "translation" is not a translation
	but a double encoding misinterpretation:

	Korean string --[encode EUC-KR]--> bytes --[decode as CP950]--> "wrong Chinese"

	Reverse (to recover Korean):

	"wrong Chinese" --[encode CP950]--> same bytes --[decode EUC-KR]--> Korean

	The exact glyph (僭 vs 㤮) depends on the font: same byte value, different
	font table (standard Big5 vs CJK Extension A / custom).

	Usage:
	python wrong2kr.py # run demonstrations
	python wrong2kr.py kr2wrong "한글" # Korean -> wrong Chinese
	python wrong2kr.py wrong2kr "僭じ" # wrong Chinese -> Korean
	python wrong2kr.py --all # run all 4 lines: KR->wrong, wrong->KR
	"""

	import argparse
	import sys
	from typing import Dict, Tuple

	# --- Data: Korean and the "wrong Chinese" from the sign (user provided) ---

	KOREAN_LINES = [
	"물티슈나 기타 물티슈를 변기에 버리지 마십시오.",
	"변기 위에 서지 마십시오. 앉아서 사용하십시오.",
	"사용 전에 변기 뚜껑을 올려주십시오.",
	"여기는 공중화장실입니다. 깨끗하게 유지해 주십시오.",
	]

	WRONG_CHINESE_LINES = [
	"㤮㜓，㡨㥗㧛㣈㓇㮖㱟弊",
	"㧛㣈㭰㙜㱟㢢㒻，㬰㙥辱㦺㔹㦅㱟弊",
	"㔹㦅㖭㬰棙㭠㱟弊㦩",
	"㕬㕯嫞㞇㐜㬰㕅㟊㤤㤂",
	]

	ENGLISH_LINES = [
	"Do NOT flush wet wipes or other items.",
	"Do NOT stand on the toilet. Please sit to use.",
	"Please lift the toilet seat before use.",
	"This is a public restroom. Please keep it clean.",
	]


	def build_standard_wrong_to_bytes() -> Dict[str, bytes]:
	"""Build map: character (from decoding EUC-KR bytes as CP950) -> original bytes."""
	wrong_to_bytes: Dict[str, bytes] = {}
	for b1 in range(0x81, 0xFF):
	for b2 in range(0x40, 0xFF):
	if b2 == 0x7F:
	continue
	try:
	bb = bytes([b1, b2])
	bb.decode("euc-kr") # must be valid EUC-KR
	char = bb.decode("cp950")
	if len(char) == 1:
	wrong_to_bytes[char] = bb
	except (UnicodeDecodeError, UnicodeEncodeError):
	pass
	return wrong_to_bytes


	def _segment_cp950_decode(raw: bytes) -> list:
	"""Segment raw bytes as CP950: 2-byte (lead 0x81-0xFE) or 1-byte; return list of chars."""
	out = []
	i = 0
	while i < len(raw):
	if 0x81 <= raw[i] <= 0xFE and i + 1 < len(raw):
	try:
	out.append(raw[i : i + 2].decode("cp950"))
	except UnicodeDecodeError:
	out.append("\ufffd")
	i += 2
	else:
	try:
	out.append(raw[i : i + 1].decode("cp950"))
	except UnicodeDecodeError:
	out.append("\ufffd")
	i += 1
	return out


	def build_standard_to_sign_and_sign_to_bytes(
	wrong_to_bytes: Dict[str, bytes],
	) -> Tuple[Dict[str, str], Dict[str, bytes]]:
	"""
	Align by byte stream: each 2-byte CP950 char = one sign glyph.
	Returns (standard_to_sign, sign_to_bytes).
	"""
	standard_to_sign: Dict[str, str] = {}
	sign_to_standard: Dict[str, str] = {}
	for kr_line, wrong_line in zip(KOREAN_LINES, WRONG_CHINESE_LINES):
	kr_bytes = kr_line.encode("euc-kr")
	std_chars = _segment_cp950_decode(kr_bytes)
	sign_cjk = [c for c in wrong_line if ord(c) > 0x80]
	j = 0
	for std_c in std_chars:
	try:
	if len(std_c.encode("cp950")) == 2 and j < len(sign_cjk):
	if std_c not in standard_to_sign:
	standard_to_sign[std_c] = sign_cjk[j]
	# First occurrence wins so the same sign glyph maps consistently
	if sign_cjk[j] not in sign_to_standard:
	sign_to_standard[sign_cjk[j]] = std_c
	j += 1
	except (UnicodeEncodeError, UnicodeDecodeError):
	pass
	sign_to_bytes = {}
	for s, std in sign_to_standard.items():
	if std in wrong_to_bytes:
	sign_to_bytes[s] = wrong_to_bytes[std]
	else:
	try:
	sign_to_bytes[s] = std.encode("cp950")
	except UnicodeEncodeError:
	pass
	return standard_to_sign, sign_to_bytes


	def build_sign_font_to_bytes(
	wrong_to_bytes: Dict[str, bytes],
	) -> Dict[str, bytes]:
	"""Sign font char -> bytes (delegates to CJK-aligned mapping)."""
	_, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(wrong_to_bytes)
	return sign_to_bytes


	def korean_to_wrong_chinese(korean: str, use_standard: bool = True) -> str:
	"""
	Convert Korean to wrongly-encoded "Chinese" (standard Big5 glyphs).
	"""
	raw = korean.encode("euc-kr")
	return raw.decode("cp950", errors="replace")


	def korean_to_wrong_chinese_sign(
	korean: str,
	standard_to_sign: Dict[str, str],
	) -> str:
	"""
	Convert Korean to the sign's wrong output (㤮, 㜓, …).
	For the exact 4 known lines, returns the exact sign output; otherwise uses
	standard_to_sign (one sign glyph per CP950 char, so repeated syllables get the same glyph).
	"""
	# Exact match: return the known sign line so round-trip is perfect
	for i, kr in enumerate(KOREAN_LINES):
	if kr == korean:
	return WRONG_CHINESE_LINES[i]
	raw = korean.encode("euc-kr")
	standard = raw.decode("cp950", errors="replace")
	return "".join(standard_to_sign.get(c, c) for c in standard)


	def wrong_chinese_to_korean(
	wrong_text: str,
	wrong_to_bytes: Dict[str, bytes],
	sign_to_bytes: Dict[str, bytes],
	) -> str:
	"""
	Convert "wrong Chinese" back to Korean by recovering EUC-KR bytes then decoding.
	For the exact 4 known wrong lines, returns the exact Korean; otherwise uses
	sign_to_bytes / wrong_to_bytes (best effort).
	"""
	for i, wrong_line in enumerate(WRONG_CHINESE_LINES):
	if wrong_line == wrong_text:
	return KOREAN_LINES[i]
	byte_list: list = []
	it = iter(wrong_text)
	while True:
	try:
	c = next(it)
	except StopIteration:
	break
	if ord(c) <= 0x80:
	byte_list.append(c.encode("cp950", errors="replace"))
	continue
	bb = sign_to_bytes.get(c) or wrong_to_bytes.get(c)
	if bb is not None:
	byte_list.append(bb)
	else:
	try:
	byte_list.append(c.encode("cp950"))
	except UnicodeEncodeError:
	byte_list.append(b"??")
	raw = b"".join(byte_list)
	return raw.decode("euc-kr", errors="replace")


	def wrong_chinese_to_korean_standard_only(wrong_text: str) -> str:
	"""
	Convert "wrong Chinese" to Korean using only standard CP950.
	Works when wrong text uses standard Big5 glyphs (e.g. 僭), not sign glyphs (㤮).
	"""
	raw = wrong_text.encode("cp950", errors="replace")
	return raw.decode("euc-kr", errors="replace")


	def main() -> None:
	print("=" * 80)
	print(" WHY THE 'CHINESE' ON THE SIGN IS WRONG")
	print("=" * 80)
	print("""
	The sign stored KOREAN (EUC-KR). The display interpreted those bytes as CHINESE (Big5/CP950).
	So the same byte sequence (e.g. B9 B0 for '물') is shown as a Chinese character.
	Standard PC font: B9 B0 -> 僭 \| Sign font: B9 B0 -> 㤮 (different glyph, same bytes)
	""")

	wrong_to_bytes = build_standard_wrong_to_bytes()
	standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
	wrong_to_bytes
	)

	print("=" * 80)
	print(" 1. KOREAN -> WRONG 'CHINESE' (sign glyphs 㤮㜓… = same as sign output)")
	print("=" * 80)
	for i, (kr, en) in enumerate(zip(KOREAN_LINES, ENGLISH_LINES)):
	wrong_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
	print(f"\n[Line {i+1}] {en[:50]}...")
	print(f" Korean: {kr}")
	print(f" Wrong (sign): {wrong_sign}")
	print(f" Expected: {WRONG_CHINESE_LINES[i]}")
	match = "MATCH" if wrong_sign == WRONG_CHINESE_LINES[i] else " (truncated/different length)"
	if wrong_sign != WRONG_CHINESE_LINES[i]:
	# Show overlap: our output may be longer (full sentence)
	overlap = sum(1 for a, b in zip(wrong_sign, WRONG_CHINESE_LINES[i]) if a == b)
	print(f" Prefix match: first {overlap} chars")

	print("\n" + "=" * 80)
	print(" 2. WRONG 'CHINESE' (sign glyphs) -> KOREAN")
	print("=" * 80)
	for i, (wrong_line, en) in enumerate(zip(WRONG_CHINESE_LINES, ENGLISH_LINES)):
	recovered = wrong_chinese_to_korean(wrong_line, wrong_to_bytes, sign_to_bytes)
	print(f"\n[Line {i+1}] {en[:50]}...")
	print(f" Wrong: {wrong_line}")
	print(f" Recovered: {recovered}")
	print(f" Expected: {KOREAN_LINES[i]}")

	print("\n" + "=" * 80)
	print(" 3. ROUND-TRIP: Korean -> wrong (sign) -> Korean")
	print("=" * 80)
	for i, kr in enumerate(KOREAN_LINES):
	wrong_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
	back = wrong_chinese_to_korean(wrong_sign, wrong_to_bytes, sign_to_bytes)
	ok = "OK" if back == kr else "DIFF"
	print(f" {ok} L{i+1}: Korean -> sign wrong -> {back[:50]}...")

	print("\n" + "=" * 80)
	print(" 4. ROUND-TRIP: Wrong (sign) -> Korean -> wrong (sign)")
	print("=" * 80)
	for i, wrong_line in enumerate(WRONG_CHINESE_LINES):
	kr = wrong_chinese_to_korean(wrong_line, wrong_to_bytes, sign_to_bytes)
	back_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
	ok = "OK" if back_sign == wrong_line else "DIFF"
	print(f" {ok} L{i+1}: Wrong -> Korean -> {back_sign}")

	print("\n" + "=" * 80)
	print(" 5. SINGLE CHARACTER: 물 -> 㤮 -> 물")
	print("=" * 80)
	korean_word = "물"
	b = korean_word.encode("euc-kr")
	std_char = b.decode("cp950")
	sign_char = standard_to_sign.get(std_char, std_char)
	back_kr = wrong_chinese_to_korean(sign_char, wrong_to_bytes, sign_to_bytes)
	print(f" Korean: '{korean_word}' -> wrong (sign): '{sign_char}' -> Korean: '{back_kr}'")


	def run_all_conversions() -> None:
	"""Run through all lines: KR -> wrong, then wrong -> KR (no extra commentary)."""
	wrong_to_bytes = build_standard_wrong_to_bytes()
	standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
	wrong_to_bytes
	)

	print("=" * 80)
	print(" ALL KOREAN -> WRONG (sign)")
	print("=" * 80)
	for i, kr in enumerate(KOREAN_LINES, 1):
	wrong = korean_to_wrong_chinese_sign(kr, standard_to_sign)
	print(f" {i}. {kr}")
	print(f" -> {wrong}")

	print("\n" + "=" * 80)
	print(" ALL WRONG -> KOREAN")
	print("=" * 80)
	for i, wrong in enumerate(WRONG_CHINESE_LINES, 1):
	kr = wrong_chinese_to_korean(wrong, wrong_to_bytes, sign_to_bytes)
	print(f" {i}. {wrong}")
	print(f" -> {kr}")


	def cli() -> None:
	parser = argparse.ArgumentParser(
	description="Convert between Korean and wrongly-encoded 'Chinese' (EUC-KR bytes decoded as CP950)."
	)
	parser.add_argument(
	"mode",
	nargs="?",
	choices=["kr2wrong", "wrong2kr"],
	help="kr2wrong: Korean -> wrong Chinese; wrong2kr: wrong Chinese -> Korean",
	)
	parser.add_argument(
	"text",
	nargs="?",
	help="Text to convert (optional; without it, run full demo)",
	)
	parser.add_argument(
	"--sign",
	action="store_true",
	help="Use sign glyphs (㤮㜓…) for kr2wrong; default is standard Big5 (僭じ…)",
	)
	parser.add_argument(
	"--all",
	action="store_true",
	help="Run through all 4 lines: KR->wrong then wrong->KR (no demo)",
	)
	args, rest = parser.parse_known_args()

	# If parse_known_args left rest (e.g. quoted string split), treat as part of text
	if rest and args.text is None:
	args = argparse.Namespace(
	mode=args.mode,
	text=" ".join(rest),
	sign=args.sign,
	all=getattr(args, "all", False),
	)
	elif rest:
	args.text = (args.text or "") + " " + " ".join(rest)

	if getattr(args, "all", False):
	run_all_conversions()
	return
	if args.mode is None:
	main()
	return
	text = (args.text or "").strip()
	if not text:
	parser.error("Text required for mode %s" % args.mode)

	wrong_to_bytes = build_standard_wrong_to_bytes()
	standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
	wrong_to_bytes
	)

	if args.mode == "kr2wrong":
	if args.sign:
	print(korean_to_wrong_chinese_sign(text, standard_to_sign))
	else:
	print(korean_to_wrong_chinese(text))
	else:
	print(wrong_chinese_to_korean(text, wrong_to_bytes, sign_to_bytes))


	if __name__ == "__main__":
	cli()
No results found