Skip to content

Instantly share code, notes, and snippets.

@jhkchan
Last active February 15, 2026 09:19
Show Gist options
  • Select an option

  • Save jhkchan/14abccfa52a1140f0ced4ecaa20d707f to your computer and use it in GitHub Desktop.

Select an option

Save jhkchan/14abccfa52a1140f0ced4ecaa20d707f to your computer and use it in GitHub Desktop.
wrong2kr.py
#!/usr/bin/env python3
"""
Encoding analysis: Why the "Chinese" on the sign is wrong
The sign stored the text as KOREAN (EUC-KR encoding). The display system
mistakenly interpreted those same bytes as TRADITIONAL CHINESE (Big5/CP950).
So each 2-byte Korean syllable (e.g. 물 "water") became one Chinese character
(e.g. 僭 or 㤮 depending on font). The "translation" is not a translation
but a double encoding misinterpretation:
Korean string --[encode EUC-KR]--> bytes --[decode as CP950]--> "wrong Chinese"
Reverse (to recover Korean):
"wrong Chinese" --[encode CP950]--> same bytes --[decode EUC-KR]--> Korean
The exact glyph (僭 vs 㤮) depends on the font: same byte value, different
font table (standard Big5 vs CJK Extension A / custom).
Usage:
python wrong2kr.py # run demonstrations
python wrong2kr.py kr2wrong "한글" # Korean -> wrong Chinese
python wrong2kr.py wrong2kr "僭じ" # wrong Chinese -> Korean
python wrong2kr.py --all # run all 4 lines: KR->wrong, wrong->KR
"""
import argparse
import sys
from typing import Dict, Tuple
# --- Data: Korean and the "wrong Chinese" from the sign (user provided) ---
# The three lists below are parallel: entry i of each list describes the same
# line of the sign. Code in this file pairs them with zip() or by index.

# The Korean text of each sign line. These are encoded with "euc-kr" elsewhere
# in this file to obtain the byte stream that gets misread.
KOREAN_LINES = [
"물티슈나 기타 물티슈를 변기에 버리지 마십시오.",
"변기 위에 서지 마십시오. 앉아서 사용하십시오.",
"사용 전에 변기 뚜껑을 올려주십시오.",
"여기는 공중화장실입니다. 깨끗하게 유지해 주십시오.",
]
# The glyphs as recorded from the sign, one string per line. Characters with
# ord(c) > 0x80 are treated as sign glyphs by the alignment code; the ASCII
# commas are skipped there.
# NOTE(review): these strings are shorter than the Korean lines' 2-byte
# character count -- presumably the sign truncated or dropped characters;
# confirm against the original photo.
WRONG_CHINESE_LINES = [
"㤮㜓,㡨㥗㧛㣈㓇㮖㱟弊",
"㧛㣈㭰㙜㱟㢢㒻,㬰㙥辱㦺㔹㦅㱟弊",
"㔹㦅㖭㬰棙㭠㱟弊㦩",
"㕬㕯嫞㞇㐜㬰㕅㟊㤤㤂",
]
# English text for each line; only used as a (50-character-truncated) label in
# the demo output. Presumably the English wording from the same sign.
ENGLISH_LINES = [
"Do NOT flush wet wipes or other items.",
"Do NOT stand on the toilet. Please sit to use.",
"Please lift the toilet seat before use.",
"This is a public restroom. Please keep it clean.",
]
def build_standard_wrong_to_bytes() -> Dict[str, bytes]:
    """Build map: character (from decoding EUC-KR bytes as CP950) -> original bytes.

    Every two-byte sequence with lead byte 0x81-0xFE and trail byte 0x40-0xFE
    (0x7F excluded) is tried. A pair is kept only when it decodes under both
    the "euc-kr" and the "cp950" codec; the key is the CP950 character and the
    value is the two raw bytes.

    If two different byte pairs decode to the same CP950 character, the pair
    visited later (higher byte values) overwrites the earlier one.

    Returns:
        Dict mapping one CP950-decoded character to its 2-byte source sequence.
    """
    wrong_to_bytes: Dict[str, bytes] = {}
    for b1 in range(0x81, 0xFF):
        for b2 in range(0x40, 0xFF):
            if b2 == 0x7F:
                continue
            bb = bytes([b1, b2])
            try:
                bb.decode("euc-kr")  # must be valid EUC-KR
                char = bb.decode("cp950")
            except UnicodeDecodeError:
                # Not decodable under both code pages: not a candidate pair.
                continue
            if len(char) == 1:
                wrong_to_bytes[char] = bb
    return wrong_to_bytes
def _segment_cp950_decode(raw: bytes) -> list:
"""Segment raw bytes as CP950: 2-byte (lead 0x81-0xFE) or 1-byte; return list of chars."""
out = []
i = 0
while i < len(raw):
if 0x81 <= raw[i] <= 0xFE and i + 1 < len(raw):
try:
out.append(raw[i : i + 2].decode("cp950"))
except UnicodeDecodeError:
out.append("\ufffd")
i += 2
else:
try:
out.append(raw[i : i + 1].decode("cp950"))
except UnicodeDecodeError:
out.append("\ufffd")
i += 1
return out
def build_standard_to_sign_and_sign_to_bytes(
    wrong_to_bytes: Dict[str, bytes],
) -> Tuple[Dict[str, str], Dict[str, bytes]]:
    """
    Align by byte stream: each 2-byte CP950 char = one sign glyph.

    For every (Korean line, sign line) pair from KOREAN_LINES and
    WRONG_CHINESE_LINES, the Korean text is encoded as EUC-KR and segmented as
    CP950. The sign line's non-ASCII characters (ord > 0x80) are then handed
    out, in order, to the segments that encode to exactly two CP950 bytes.
    Segments that cannot be encoded as CP950, or that are not 2 bytes wide, are
    skipped without consuming a sign glyph; pairing stops for a line once its
    sign glyphs run out.

    NOTE(review): this positional pairing assumes the sign shows one glyph per
    2-byte character with nothing dropped in between -- confirm against the
    sign data.

    Args:
        wrong_to_bytes: Map of standard CP950 character -> 2-byte source, as
            produced by build_standard_wrong_to_bytes().

    Returns (standard_to_sign, sign_to_bytes).
        standard_to_sign: standard CP950 character -> sign glyph.
        sign_to_bytes: sign glyph -> bytes, taken from ``wrong_to_bytes`` when
            the paired standard character is present there, otherwise the
            character's own CP950 encoding.
    """
    standard_to_sign: Dict[str, str] = {}
    sign_to_standard: Dict[str, str] = {}
    for kr_line, wrong_line in zip(KOREAN_LINES, WRONG_CHINESE_LINES):
        std_chars = _segment_cp950_decode(kr_line.encode("euc-kr"))
        sign_cjk = [c for c in wrong_line if ord(c) > 0x80]
        j = 0
        for std_c in std_chars:
            try:
                is_double_byte = len(std_c.encode("cp950")) == 2
            except UnicodeEncodeError:
                # e.g. the "\ufffd" placeholder: no CP950 form, nothing to pair.
                continue
            if not is_double_byte or j >= len(sign_cjk):
                continue
            sign_c = sign_cjk[j]
            j += 1
            # First occurrence wins so the same sign glyph maps consistently
            standard_to_sign.setdefault(std_c, sign_c)
            sign_to_standard.setdefault(sign_c, std_c)

    sign_to_bytes: Dict[str, bytes] = {}
    for sign_c, std_c in sign_to_standard.items():
        if std_c in wrong_to_bytes:
            sign_to_bytes[sign_c] = wrong_to_bytes[std_c]
            continue
        try:
            sign_to_bytes[sign_c] = std_c.encode("cp950")
        except UnicodeEncodeError:
            # Best effort: a glyph with no recoverable bytes is left unmapped.
            pass
    return standard_to_sign, sign_to_bytes
def build_sign_font_to_bytes(
    wrong_to_bytes: Dict[str, bytes],
) -> Dict[str, bytes]:
    """Sign font char -> bytes (delegates to CJK-aligned mapping).

    Args:
        wrong_to_bytes: Map of standard CP950 character -> 2-byte source.

    Returns:
        The second element of build_standard_to_sign_and_sign_to_bytes(); the
        standard-to-sign map it also computes is discarded.
    """
    _, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(wrong_to_bytes)
    return sign_to_bytes
def korean_to_wrong_chinese(korean: str, use_standard: bool = True) -> str:
    """
    Convert Korean to wrongly-encoded "Chinese" (standard Big5 glyphs).

    The text is encoded with "euc-kr" and the resulting bytes are decoded with
    "cp950". Byte sequences that CP950 cannot decode come out as U+FFFD because
    the decode uses ``errors="replace"``.

    Args:
        korean: Text to convert.
        use_standard: Accepted for backward compatibility; this function does
            not read it.

    Returns:
        The mis-decoded string.

    Raises:
        UnicodeEncodeError: If ``korean`` contains a character the "euc-kr"
            codec cannot encode (the encode step is strict).
    """
    raw = korean.encode("euc-kr")
    return raw.decode("cp950", errors="replace")
def korean_to_wrong_chinese_sign(
    korean: str,
    standard_to_sign: Dict[str, str],
) -> str:
    """
    Convert Korean to the sign's wrong output (㤮, 㜓, …).

    For the exact 4 known lines, returns the exact sign output; otherwise uses
    standard_to_sign (one sign glyph per CP950 char, so repeated syllables get the same glyph).

    Args:
        korean: Text to convert.
        standard_to_sign: Map of standard CP950 character -> sign glyph.
            Characters missing from the map are passed through unchanged.

    Returns:
        The sign-style string.
    """
    # Exact match: return the known sign line so round-trip is perfect
    for known_kr, known_wrong in zip(KOREAN_LINES, WRONG_CHINESE_LINES):
        if known_kr == korean:
            return known_wrong
    standard = korean.encode("euc-kr").decode("cp950", errors="replace")
    return "".join(standard_to_sign.get(c, c) for c in standard)
def wrong_chinese_to_korean(
    wrong_text: str,
    wrong_to_bytes: Dict[str, bytes],
    sign_to_bytes: Dict[str, bytes],
) -> str:
    """
    Convert "wrong Chinese" back to Korean by recovering EUC-KR bytes then decoding.

    For the exact 4 known wrong lines, returns the exact Korean; otherwise uses
    sign_to_bytes / wrong_to_bytes (best effort).

    Per-character byte recovery, in order of preference:
      1. characters with ord <= 0x80 are CP950-encoded directly
         (``errors="replace"``);
      2. ``sign_to_bytes[c]``, then ``wrong_to_bytes[c]``;
      3. the character's own CP950 encoding;
      4. the placeholder ``b"??"`` when none of the above applies.

    Args:
        wrong_text: The mis-decoded text.
        wrong_to_bytes: Map of standard CP950 character -> 2-byte source.
        sign_to_bytes: Map of sign glyph -> bytes.

    Returns:
        The recovered bytes decoded as "euc-kr" with ``errors="replace"``.
    """
    for known_wrong, known_kr in zip(WRONG_CHINESE_LINES, KOREAN_LINES):
        if known_wrong == wrong_text:
            return known_kr

    byte_list: list = []
    for c in wrong_text:
        if ord(c) <= 0x80:
            byte_list.append(c.encode("cp950", errors="replace"))
            continue
        bb = sign_to_bytes.get(c) or wrong_to_bytes.get(c)
        if bb is None:
            try:
                bb = c.encode("cp950")
            except UnicodeEncodeError:
                # Two placeholder bytes keep the stream 2-byte aligned.
                bb = b"??"
        byte_list.append(bb)
    raw = b"".join(byte_list)
    return raw.decode("euc-kr", errors="replace")
def wrong_chinese_to_korean_standard_only(wrong_text: str) -> str:
    """
    Convert "wrong Chinese" to Korean using only standard CP950.

    Works when wrong text uses standard Big5 glyphs (e.g. 僭), not sign glyphs (㤮).

    Both steps use ``errors="replace"``, so the call never raises on
    unmappable input: a character CP950 cannot encode becomes the codec's
    replacement byte, and bytes EUC-KR cannot decode become U+FFFD.

    Args:
        wrong_text: The mis-decoded text.

    Returns:
        ``wrong_text`` encoded as "cp950" and decoded as "euc-kr".
    """
    raw = wrong_text.encode("cp950", errors="replace")
    return raw.decode("euc-kr", errors="replace")
def main() -> None:
    """Print the full demonstration to stdout.

    Builds the lookup tables once, then prints five sections: Korean -> sign
    glyphs, sign glyphs -> Korean, both round-trips, and a single-character
    example using '물'.
    """
    print("=" * 80)
    print(" WHY THE 'CHINESE' ON THE SIGN IS WRONG")
    print("=" * 80)
    print("""
The sign stored KOREAN (EUC-KR). The display interpreted those bytes as CHINESE (Big5/CP950).
So the same byte sequence (e.g. B9 B0 for '물') is shown as a Chinese character.
Standard PC font: B9 B0 -> 僭 | Sign font: B9 B0 -> 㤮 (different glyph, same bytes)
""")
    wrong_to_bytes = build_standard_wrong_to_bytes()
    standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
        wrong_to_bytes
    )

    print("=" * 80)
    print(" 1. KOREAN -> WRONG 'CHINESE' (sign glyphs 㤮㜓… = same as sign output)")
    print("=" * 80)
    for i, (kr, en) in enumerate(zip(KOREAN_LINES, ENGLISH_LINES)):
        expected = WRONG_CHINESE_LINES[i]
        wrong_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        print(f"\n[Line {i+1}] {en[:50]}...")
        print(f" Korean: {kr}")
        print(f" Wrong (sign): {wrong_sign}")
        print(f" Expected: {expected}")
        if wrong_sign != expected:
            # Show overlap: our output may be longer (full sentence).
            # Count only the leading run of equal characters so the number
            # matches the "Prefix match" label.
            overlap = 0
            for a, b in zip(wrong_sign, expected):
                if a != b:
                    break
                overlap += 1
            print(f" Prefix match: first {overlap} chars")

    print("\n" + "=" * 80)
    print(" 2. WRONG 'CHINESE' (sign glyphs) -> KOREAN")
    print("=" * 80)
    for i, (wrong_line, en) in enumerate(zip(WRONG_CHINESE_LINES, ENGLISH_LINES)):
        recovered = wrong_chinese_to_korean(wrong_line, wrong_to_bytes, sign_to_bytes)
        print(f"\n[Line {i+1}] {en[:50]}...")
        print(f" Wrong: {wrong_line}")
        print(f" Recovered: {recovered}")
        print(f" Expected: {KOREAN_LINES[i]}")

    print("\n" + "=" * 80)
    print(" 3. ROUND-TRIP: Korean -> wrong (sign) -> Korean")
    print("=" * 80)
    for i, kr in enumerate(KOREAN_LINES):
        wrong_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        back = wrong_chinese_to_korean(wrong_sign, wrong_to_bytes, sign_to_bytes)
        ok = "OK" if back == kr else "DIFF"
        print(f" {ok} L{i+1}: Korean -> sign wrong -> {back[:50]}...")

    print("\n" + "=" * 80)
    print(" 4. ROUND-TRIP: Wrong (sign) -> Korean -> wrong (sign)")
    print("=" * 80)
    for i, wrong_line in enumerate(WRONG_CHINESE_LINES):
        kr = wrong_chinese_to_korean(wrong_line, wrong_to_bytes, sign_to_bytes)
        back_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        ok = "OK" if back_sign == wrong_line else "DIFF"
        print(f" {ok} L{i+1}: Wrong -> Korean -> {back_sign}")

    print("\n" + "=" * 80)
    print(" 5. SINGLE CHARACTER: 물 -> 㤮 -> 물")
    print("=" * 80)
    korean_word = "물"
    b = korean_word.encode("euc-kr")
    std_char = b.decode("cp950")
    # Fall back to the standard glyph when the sign map has no entry for it.
    sign_char = standard_to_sign.get(std_char, std_char)
    back_kr = wrong_chinese_to_korean(sign_char, wrong_to_bytes, sign_to_bytes)
    print(f" Korean: '{korean_word}' -> wrong (sign): '{sign_char}' -> Korean: '{back_kr}'")
def run_all_conversions() -> None:
    """Run through all lines: KR -> wrong, then wrong -> KR (no extra commentary).

    Builds the lookup tables, then prints each entry of KOREAN_LINES with its
    sign-glyph conversion, followed by each entry of WRONG_CHINESE_LINES with
    its recovered Korean. Entries are numbered from 1.
    """
    wrong_to_bytes = build_standard_wrong_to_bytes()
    standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
        wrong_to_bytes
    )

    print("=" * 80)
    print(" ALL KOREAN -> WRONG (sign)")
    print("=" * 80)
    for i, kr in enumerate(KOREAN_LINES, 1):
        wrong = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        print(f" {i}. {kr}")
        print(f" -> {wrong}")

    print("\n" + "=" * 80)
    print(" ALL WRONG -> KOREAN")
    print("=" * 80)
    for i, wrong in enumerate(WRONG_CHINESE_LINES, 1):
        kr = wrong_chinese_to_korean(wrong, wrong_to_bytes, sign_to_bytes)
        print(f" {i}. {wrong}")
        print(f" -> {kr}")
def cli() -> None:
    """Parse the command line and dispatch.

    Dispatch order:
      * ``--all``           -> run_all_conversions()
      * no ``mode`` given   -> main() (full demo)
      * ``kr2wrong TEXT``   -> print the standard-glyph conversion, or the
                               sign-glyph conversion when ``--sign`` is set
      * ``wrong2kr TEXT``   -> print the recovered Korean

    Exits through ``parser.error`` when a mode is given without any text.
    """
    parser = argparse.ArgumentParser(
        description="Convert between Korean and wrongly-encoded 'Chinese' (EUC-KR bytes decoded as CP950)."
    )
    parser.add_argument(
        "mode",
        nargs="?",
        choices=["kr2wrong", "wrong2kr"],
        help="kr2wrong: Korean -> wrong Chinese; wrong2kr: wrong Chinese -> Korean",
    )
    parser.add_argument(
        "text",
        nargs="?",
        help="Text to convert (optional; without it, run full demo)",
    )
    parser.add_argument(
        "--sign",
        action="store_true",
        help="Use sign glyphs (㤮㜓…) for kr2wrong; default is standard Big5 (僭じ…)",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Run through all 4 lines: KR->wrong then wrong->KR (no demo)",
    )
    args, rest = parser.parse_known_args()
    # If parse_known_args left rest (e.g. quoted string split), treat as part of text.
    # Leftover tokens are space-joined onto the parsed text (or become the
    # text when none was parsed). Unrecognised option-like tokens also end up
    # here, since parse_known_args does not reject them.
    if rest:
        parts = rest if args.text is None else [args.text, *rest]
        args.text = " ".join(parts)

    if args.all:
        run_all_conversions()
        return
    if args.mode is None:
        main()
        return

    text = (args.text or "").strip()
    if not text:
        parser.error(f"Text required for mode {args.mode}")

    wrong_to_bytes = build_standard_wrong_to_bytes()
    standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
        wrong_to_bytes
    )
    if args.mode == "kr2wrong":
        if args.sign:
            print(korean_to_wrong_chinese_sign(text, standard_to_sign))
        else:
            print(korean_to_wrong_chinese(text))
    else:
        print(wrong_chinese_to_korean(text, wrong_to_bytes, sign_to_bytes))
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    cli()
@jhkchan
Copy link
Author

jhkchan commented Feb 14, 2026

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment