Last active
February 15, 2026 09:19
-
-
Save jhkchan/14abccfa52a1140f0ced4ecaa20d707f to your computer and use it in GitHub Desktop.
wrong2kr.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Encoding analysis: Why the "Chinese" on the sign is wrong | |
| The sign stored the text as KOREAN (EUC-KR encoding). The display system | |
| mistakenly interpreted those same bytes as TRADITIONAL CHINESE (Big5/CP950). | |
| So each 2-byte Korean syllable (e.g. 물 "water") became one Chinese character | |
| (e.g. 僭 or 㤮 depending on font). The "translation" is not a translation | |
| but an encoding mix-up (mojibake) - text written in one encoding, read back in another: | |
| Korean string --[encode EUC-KR]--> bytes --[decode as CP950]--> "wrong Chinese" | |
| Reverse (to recover Korean): | |
| "wrong Chinese" --[encode CP950]--> same bytes --[decode EUC-KR]--> Korean | |
| The exact glyph (僭 vs 㤮) depends on the font: same byte value, different | |
| font table (standard Big5 vs CJK Extension A / custom). | |
| Usage: | |
| python wrong2kr.py # run demonstrations | |
| python wrong2kr.py kr2wrong "한글" # Korean -> wrong Chinese | |
| python wrong2kr.py wrong2kr "僭じ" # wrong Chinese -> Korean | |
| python wrong2kr.py --all # run all 4 lines: KR->wrong, wrong->KR | |
| """ | |
| import argparse | |
| import sys | |
| from typing import Dict, Tuple | |
# --- Data: Korean and the "wrong Chinese" from the sign (user provided) ---
# The three lists are parallel: index i of each list describes the same line
# of the sign.

# Korean source text of each line.
KOREAN_LINES = [
    "물티슈나 기타 물티슈를 변기에 버리지 마십시오.",
    "변기 위에 서지 마십시오. 앉아서 사용하십시오.",
    "사용 전에 변기 뚜껑을 올려주십시오.",
    "여기는 공중화장실입니다. 깨끗하게 유지해 주십시오.",
]

# Glyphs transcribed from the sign. NOTE: every line here has fewer glyphs
# than the matching Korean line has syllables, so any per-character alignment
# between the two lists is best-effort only.
WRONG_CHINESE_LINES = [
    "㤮㜓,㡨㥗㧛㣈㓇㮖㱟弊",
    "㧛㣈㭰㙜㱟㢢㒻,㬰㙥辱㦺㔹㦅㱟弊",
    "㔹㦅㖭㬰棙㭠㱟弊㦩",
    "㕬㕯嫞㞇㐜㬰㕅㟊㤤㤂",
]

# English rendering of each line (used only as a label in the demo output).
ENGLISH_LINES = [
    "Do NOT flush wet wipes or other items.",
    "Do NOT stand on the toilet. Please sit to use.",
    "Please lift the toilet seat before use.",
    "This is a public restroom. Please keep it clean.",
]
def build_standard_wrong_to_bytes() -> Dict[str, bytes]:
    """Build the reverse table: CP950-decoded character -> original 2 bytes.

    Every byte pair (lead 0x81-0xFE, trail 0x40-0xFE except 0x7F) is tried.
    A pair is kept only if it decodes under *both* EUC-KR and CP950, i.e. it
    is a byte sequence that could have come from Korean text and that the
    display could have shown as a Big5 character.

    Returns:
        Mapping from the single character produced by the CP950 decode to the
        two-byte sequence it came from. If two pairs decode to the same
        character, the pair visited last wins.
    """
    wrong_to_bytes: Dict[str, bytes] = {}
    for b1 in range(0x81, 0xFF):
        for b2 in range(0x40, 0xFF):
            if b2 == 0x7F:
                continue
            bb = bytes([b1, b2])
            # Only the two decodes can fail, so keep the try this narrow.
            try:
                bb.decode("euc-kr")  # must be valid EUC-KR
                char = bb.decode("cp950")
            except UnicodeDecodeError:
                continue
            if len(char) == 1:
                wrong_to_bytes[char] = bb
    return wrong_to_bytes
| def _segment_cp950_decode(raw: bytes) -> list: | |
| """Segment raw bytes as CP950: 2-byte (lead 0x81-0xFE) or 1-byte; return list of chars.""" | |
| out = [] | |
| i = 0 | |
| while i < len(raw): | |
| if 0x81 <= raw[i] <= 0xFE and i + 1 < len(raw): | |
| try: | |
| out.append(raw[i : i + 2].decode("cp950")) | |
| except UnicodeDecodeError: | |
| out.append("\ufffd") | |
| i += 2 | |
| else: | |
| try: | |
| out.append(raw[i : i + 1].decode("cp950")) | |
| except UnicodeDecodeError: | |
| out.append("\ufffd") | |
| i += 1 | |
| return out | |
def build_standard_to_sign_and_sign_to_bytes(
    wrong_to_bytes: Dict[str, bytes],
) -> Tuple[Dict[str, str], Dict[str, bytes]]:
    """Learn the sign-font glyph mapping from the known line pairs.

    For each (Korean line, sign line) pair, the Korean text is encoded as
    EUC-KR and segmented as CP950 to get the "standard" wrong characters.
    Those that occupy two CP950 bytes are paired, in order, with the
    non-ASCII glyphs of the sign line. The pairing is positional, so it is
    only as good as the transcription of the sign.

    Args:
        wrong_to_bytes: Standard CP950 character -> original bytes, as built
            by ``build_standard_wrong_to_bytes``.

    Returns:
        ``(standard_to_sign, sign_to_bytes)`` where ``standard_to_sign`` maps
        a standard CP950 character to the sign glyph and ``sign_to_bytes``
        maps a sign glyph to the underlying two bytes.
    """
    standard_to_sign: Dict[str, str] = {}
    sign_to_standard: Dict[str, str] = {}

    for kr_line, wrong_line in zip(KOREAN_LINES, WRONG_CHINESE_LINES):
        std_chars = _segment_cp950_decode(kr_line.encode("euc-kr"))
        sign_cjk = [c for c in wrong_line if ord(c) > 0x80]
        j = 0
        for std_c in std_chars:
            if j >= len(sign_cjk):
                # No sign glyphs left to pair; nothing further can be learned.
                break
            try:
                is_double_byte = len(std_c.encode("cp950")) == 2
            except UnicodeEncodeError:
                # e.g. U+FFFD from a failed segment: skip without consuming
                # a sign glyph.
                continue
            if not is_double_byte:
                continue
            sign_c = sign_cjk[j]
            # First occurrence wins so the same glyph maps consistently.
            standard_to_sign.setdefault(std_c, sign_c)
            sign_to_standard.setdefault(sign_c, std_c)
            j += 1

    sign_to_bytes: Dict[str, bytes] = {}
    for sign_c, std_c in sign_to_standard.items():
        if std_c in wrong_to_bytes:
            sign_to_bytes[sign_c] = wrong_to_bytes[std_c]
        else:
            # Not in the reverse table: fall back to the character's own
            # CP950 bytes.
            try:
                sign_to_bytes[sign_c] = std_c.encode("cp950")
            except UnicodeEncodeError:
                pass
    return standard_to_sign, sign_to_bytes
def build_sign_font_to_bytes(
    wrong_to_bytes: Dict[str, bytes],
) -> Dict[str, bytes]:
    """Return only the sign-glyph -> bytes map.

    Convenience wrapper around ``build_standard_to_sign_and_sign_to_bytes``
    that discards the ``standard_to_sign`` half of its result.

    Args:
        wrong_to_bytes: Standard CP950 character -> original bytes.

    Returns:
        Mapping from sign glyph to its underlying bytes.
    """
    _, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(wrong_to_bytes)
    return sign_to_bytes
def korean_to_wrong_chinese(korean: str, use_standard: bool = True) -> str:
    """Convert Korean to wrongly-decoded "Chinese" (standard Big5 glyphs).

    The text is encoded as EUC-KR and the resulting bytes are decoded as
    CP950. Both steps are best-effort: a character EUC-KR cannot encode
    becomes ``?`` and a byte sequence CP950 cannot decode becomes U+FFFD,
    instead of raising.

    Args:
        korean: Text to convert.
        use_standard: Unused; kept so existing callers keep working.

    Returns:
        The mis-decoded string.
    """
    raw = korean.encode("euc-kr", errors="replace")
    return raw.decode("cp950", errors="replace")
def korean_to_wrong_chinese_sign(
    korean: str,
    standard_to_sign: Dict[str, str],
) -> str:
    """Convert Korean to the sign's wrong output (㤮, 㜓, …).

    If *korean* is exactly one of ``KOREAN_LINES``, the matching entry of
    ``WRONG_CHINESE_LINES`` is returned verbatim. Otherwise the text is
    encoded as EUC-KR, decoded as CP950 (both best-effort, with replacement
    on failure), and each resulting character is swapped for its sign glyph
    when ``standard_to_sign`` has one.

    Args:
        korean: Text to convert.
        standard_to_sign: Standard CP950 character -> sign glyph.

    Returns:
        The text as it would appear on the sign; characters with no known
        sign glyph are left as their standard CP950 form.
    """
    # Exact match: return the known sign line so round-trip is perfect.
    for known_kr, known_wrong in zip(KOREAN_LINES, WRONG_CHINESE_LINES):
        if known_kr == korean:
            return known_wrong

    raw = korean.encode("euc-kr", errors="replace")
    standard = raw.decode("cp950", errors="replace")
    return "".join(standard_to_sign.get(c, c) for c in standard)
def wrong_chinese_to_korean(
    wrong_text: str,
    wrong_to_bytes: Dict[str, bytes],
    sign_to_bytes: Dict[str, bytes],
) -> str:
    """Convert "wrong Chinese" back to Korean via the underlying bytes.

    If *wrong_text* is exactly one of ``WRONG_CHINESE_LINES``, the matching
    entry of ``KOREAN_LINES`` is returned verbatim. Otherwise each character
    is turned back into bytes and the byte stream is decoded as EUC-KR
    (best effort, undecodable bytes are replaced).

    Per-character byte lookup order:
      1. code point <= 0x80: its own CP950 encoding;
      2. ``sign_to_bytes``;
      3. ``wrong_to_bytes``;
      4. the character's own CP950 encoding;
      5. ``b"??"`` when nothing else applies.

    Args:
        wrong_text: Mis-decoded text (sign glyphs and/or standard glyphs).
        wrong_to_bytes: Standard CP950 character -> original bytes.
        sign_to_bytes: Sign glyph -> original bytes.

    Returns:
        The recovered Korean text.
    """
    for known_wrong, known_kr in zip(WRONG_CHINESE_LINES, KOREAN_LINES):
        if known_wrong == wrong_text:
            return known_kr

    chunks = []
    for c in wrong_text:
        if ord(c) <= 0x80:
            chunks.append(c.encode("cp950", errors="replace"))
            continue
        bb = sign_to_bytes.get(c) or wrong_to_bytes.get(c)
        if bb is None:
            try:
                bb = c.encode("cp950")
            except UnicodeEncodeError:
                # Unknown glyph: emit a two-byte placeholder.
                bb = b"??"
        chunks.append(bb)

    return b"".join(chunks).decode("euc-kr", errors="replace")
def wrong_chinese_to_korean_standard_only(wrong_text: str) -> str:
    """Convert "wrong Chinese" to Korean using only the standard codecs.

    The text is encoded as CP950 and the bytes are decoded as EUC-KR, both
    with replacement on failure. This works when the text uses standard Big5
    glyphs (e.g. 僭); sign-font glyphs (e.g. 㤮) need the lookup tables used
    by ``wrong_chinese_to_korean`` instead.

    Args:
        wrong_text: Mis-decoded text made of standard CP950 characters.

    Returns:
        The recovered Korean text.
    """
    raw = wrong_text.encode("cp950", errors="replace")
    return raw.decode("euc-kr", errors="replace")
def main() -> None:
    """Run the full demonstration and print it to stdout.

    Sections printed, in order:
      1. each Korean line converted to sign glyphs, next to the expected line;
      2. each sign line converted back to Korean, next to the expected line;
      3. round-trip Korean -> sign -> Korean;
      4. round-trip sign -> Korean -> sign;
      5. the single syllable 물 taken through both directions.
    """
    rule = "=" * 80

    print(rule)
    print(" WHY THE 'CHINESE' ON THE SIGN IS WRONG")
    print(rule)
    print("""
The sign stored KOREAN (EUC-KR). The display interpreted those bytes as CHINESE (Big5/CP950).
So the same byte sequence (e.g. B9 B0 for '물') is shown as a Chinese character.
Standard PC font: B9 B0 -> 僭 | Sign font: B9 B0 -> 㤮 (different glyph, same bytes)
""")

    wrong_to_bytes = build_standard_wrong_to_bytes()
    standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
        wrong_to_bytes
    )

    print(rule)
    print(" 1. KOREAN -> WRONG 'CHINESE' (sign glyphs 㤮㜓… = same as sign output)")
    print(rule)
    for i, (kr, expected, en) in enumerate(
        zip(KOREAN_LINES, WRONG_CHINESE_LINES, ENGLISH_LINES), 1
    ):
        wrong_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        print(f"\n[Line {i}] {en[:50]}...")
        print(f" Korean: {kr}")
        print(f" Wrong (sign): {wrong_sign}")
        print(f" Expected: {expected}")
        if wrong_sign != expected:
            # Our output may be longer (full sentence); report how many
            # leading characters agree before the first difference.
            prefix_len = 0
            for a, b in zip(wrong_sign, expected):
                if a != b:
                    break
                prefix_len += 1
            print(f" Prefix match: first {prefix_len} chars")

    print("\n" + rule)
    print(" 2. WRONG 'CHINESE' (sign glyphs) -> KOREAN")
    print(rule)
    for i, (wrong_line, expected, en) in enumerate(
        zip(WRONG_CHINESE_LINES, KOREAN_LINES, ENGLISH_LINES), 1
    ):
        recovered = wrong_chinese_to_korean(wrong_line, wrong_to_bytes, sign_to_bytes)
        print(f"\n[Line {i}] {en[:50]}...")
        print(f" Wrong: {wrong_line}")
        print(f" Recovered: {recovered}")
        print(f" Expected: {expected}")

    print("\n" + rule)
    print(" 3. ROUND-TRIP: Korean -> wrong (sign) -> Korean")
    print(rule)
    for i, kr in enumerate(KOREAN_LINES, 1):
        wrong_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        back = wrong_chinese_to_korean(wrong_sign, wrong_to_bytes, sign_to_bytes)
        ok = "OK" if back == kr else "DIFF"
        print(f" {ok} L{i}: Korean -> sign wrong -> {back[:50]}...")

    print("\n" + rule)
    print(" 4. ROUND-TRIP: Wrong (sign) -> Korean -> wrong (sign)")
    print(rule)
    for i, wrong_line in enumerate(WRONG_CHINESE_LINES, 1):
        kr = wrong_chinese_to_korean(wrong_line, wrong_to_bytes, sign_to_bytes)
        back_sign = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        ok = "OK" if back_sign == wrong_line else "DIFF"
        print(f" {ok} L{i}: Wrong -> Korean -> {back_sign}")

    print("\n" + rule)
    print(" 5. SINGLE CHARACTER: 물 -> 㤮 -> 물")
    print(rule)
    korean_word = "물"
    std_char = korean_word.encode("euc-kr").decode("cp950")
    sign_char = standard_to_sign.get(std_char, std_char)
    back_kr = wrong_chinese_to_korean(sign_char, wrong_to_bytes, sign_to_bytes)
    print(f" Korean: '{korean_word}' -> wrong (sign): '{sign_char}' -> Korean: '{back_kr}'")
def run_all_conversions() -> None:
    """Print every known line converted in both directions, without commentary.

    First each entry of ``KOREAN_LINES`` is shown with its sign-glyph form,
    then each entry of ``WRONG_CHINESE_LINES`` is shown with its recovered
    Korean.
    """
    rule = "=" * 80
    wrong_to_bytes = build_standard_wrong_to_bytes()
    standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
        wrong_to_bytes
    )

    print(rule)
    print(" ALL KOREAN -> WRONG (sign)")
    print(rule)
    for i, kr in enumerate(KOREAN_LINES, 1):
        wrong = korean_to_wrong_chinese_sign(kr, standard_to_sign)
        print(f" {i}. {kr}")
        print(f" -> {wrong}")

    print("\n" + rule)
    print(" ALL WRONG -> KOREAN")
    print(rule)
    for i, wrong in enumerate(WRONG_CHINESE_LINES, 1):
        kr = wrong_chinese_to_korean(wrong, wrong_to_bytes, sign_to_bytes)
        print(f" {i}. {wrong}")
        print(f" -> {kr}")
def cli() -> None:
    """Command-line entry point.

    Behaviour by arguments:
      * ``--all``: run ``run_all_conversions`` and exit;
      * no mode: run the full demo (``main``);
      * ``kr2wrong TEXT``: print the wrongly-decoded form of TEXT, using sign
        glyphs when ``--sign`` is given and standard Big5 glyphs otherwise;
      * ``wrong2kr TEXT``: print the Korean recovered from TEXT.

    Arguments argparse does not recognise are appended to TEXT, separated by
    spaces, so an unquoted multi-word text still works.
    """
    parser = argparse.ArgumentParser(
        description="Convert between Korean and wrongly-encoded 'Chinese' (EUC-KR bytes decoded as CP950)."
    )
    parser.add_argument(
        "mode",
        nargs="?",
        choices=["kr2wrong", "wrong2kr"],
        help="kr2wrong: Korean -> wrong Chinese; wrong2kr: wrong Chinese -> Korean",
    )
    parser.add_argument(
        "text",
        nargs="?",
        help="Text to convert (optional; without it, run full demo)",
    )
    parser.add_argument(
        "--sign",
        action="store_true",
        help="Use sign glyphs (㤮㜓…) for kr2wrong; default is standard Big5 (僭じ…)",
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Run through all 4 lines: KR->wrong then wrong->KR (no demo)",
    )
    args, rest = parser.parse_known_args()

    # Leftover tokens (e.g. a text that was split on spaces) belong to the text.
    if rest:
        pieces = ([args.text] if args.text else []) + rest
        args.text = " ".join(pieces)

    if args.all:
        run_all_conversions()
        return
    if args.mode is None:
        main()
        return

    text = (args.text or "").strip()
    if not text:
        parser.error(f"Text required for mode {args.mode}")

    if args.mode == "kr2wrong" and not args.sign:
        # The plain conversion needs no lookup tables; skip building them.
        print(korean_to_wrong_chinese(text))
        return

    wrong_to_bytes = build_standard_wrong_to_bytes()
    standard_to_sign, sign_to_bytes = build_standard_to_sign_and_sign_to_bytes(
        wrong_to_bytes
    )
    if args.mode == "kr2wrong":
        print(korean_to_wrong_chinese_sign(text, standard_to_sign))
    else:
        print(wrong_chinese_to_korean(text, wrong_to_bytes, sign_to_bytes))


if __name__ == "__main__":
    cli()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Reference image source: https://www.threads.com/@woohaha8282/post/DUvTdymD3pd