Last active
November 14, 2025 03:47
-
-
Save annibal/9f8de9055dd9b42dc124acffe1f8ac0a to your computer and use it in GitHub Desktop.
Unicode Combination and Width Tests
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Example usage: print the technique report for "Hello World", then the combined report for the built-in sample strings. Both functions are function declarations, so they are hoisted and may be called before their definitions below. | |
| console.log(testWidthReductionTechniques("Hello World")); | |
| console.log(runWidthTests()); | |
/**
 * Applies several experimental "width reduction" transformations
 * (bidirectional controls, joiners and combining marks) to a string and
 * returns a plain-text report with each transformed string and its code points.
 *
 * The text is processed per Unicode code point rather than per UTF-16 code
 * unit, so characters outside the BMP (e.g. emoji) are never split between
 * their surrogate halves by an inserted control character.
 *
 * The "(N chars)" figures in the report are `String.prototype.length` values,
 * i.e. UTF-16 code units.
 *
 * @param {string} inputText - Text to transform.
 * @returns {string} Human-readable report, one section per technique.
 */
function testWidthReductionTechniques(inputText) {
  // Directional control characters
  const RLM = "\u200F"; // Right-to-Left Mark
  const RLO = "\u202E"; // Right-to-Left Override
  const PDF = "\u202C"; // Pop Directional Formatting
  const RLI = "\u2067"; // Right-to-Left Isolate
  const LRI = "\u2066"; // Left-to-Right Isolate
  const PDI = "\u2069"; // Pop Directional Isolate

  // Joiners and combining marks
  const CGJ = "\u034F"; // Combining Grapheme Joiner
  const ZWJ = "\u200D"; // Zero Width Joiner
  const COMB_BREVE = "\u0306"; // Combining Breve
  const COMB_DBL_BREVE = "\u035C"; // Combining Double Breve Below
  const COMB_MACRON = "\u0304"; // Combining Macron

  // Spreading a string iterates by code point, keeping surrogate pairs intact.
  const chars = [...inputText];

  // Space-separated lowercase hex code points, zero-padded to 4 digits.
  const toCodePoints = (text) =>
    [...text].map((c) => c.codePointAt(0).toString(16).padStart(4, "0")).join(" ");

  // Each transform receives the array of code points and returns a string.
  const techniques = [
    {
      name: "RTL Mark + Combining Joiner",
      transform: (cps) => cps.map((c) => c + RLM + CGJ).join(""),
    },
    {
      name: "RTL Override Pairs",
      transform: (cps) => {
        let result = "";
        for (let i = 0; i < cps.length; i += 2) {
          // Pair characters with an RTL override; a trailing odd one is left as is.
          result += i + 1 < cps.length ? cps[i] + RLO + cps[i + 1] + PDF : cps[i];
        }
        return result;
      },
    },
    {
      name: "Combining Marks Compression",
      transform: (cps) => cps.map((c) => c + COMB_BREVE).join(""),
    },
    {
      name: "RTL Isolate with Combining",
      transform: (cps) => {
        let result = "";
        for (let i = 0; i < cps.length; i += 3) {
          if (i + 2 < cps.length) {
            // Full group of three inside an RTL isolate, followed by a macron.
            result += RLI + cps[i] + cps[i + 1] + cps[i + 2] + PDI + COMB_MACRON;
          } else if (i + 1 < cps.length) {
            result += RLI + cps[i] + cps[i + 1] + PDI;
          } else {
            result += cps[i];
          }
        }
        return result;
      },
    },
    {
      name: "ZWJ Character Fusion",
      transform: (cps) => cps.join(ZWJ),
    },
    {
      name: "Bidirectional Layered Approach",
      transform: (cps) => {
        // Outer RTL isolate wrapping alternating LTR/RTL isolates per character.
        let result = RLI;
        cps.forEach((c, i) => {
          const isOdd = i % 2 === 1;
          result += (isOdd ? RLI : LRI) + c + PDI;
          // Odd positions additionally receive a combining mark.
          if (isOdd) {
            result += COMB_DBL_BREVE;
          }
        });
        return result + PDI;
      },
    },
  ];

  const originalCodePoints = toCodePoints(inputText);

  let output = "=== Width Reduction Techniques Test ===\n\n";
  output += `Original text: "${inputText}" (${inputText.length} chars)\n\n`;
  for (const { name, transform } of techniques) {
    const transformed = transform(chars);
    output += `Technique: ${name}\n`;
    output += `Transformed: "${transformed}" (${transformed.length} chars)\n`;
    output += `Original code points: ${originalCodePoints}\n`;
    output += `Transformed code points: ${toCodePoints(transformed)}\n\n`;
  }
  return output;
}
/**
 * Runs testWidthReductionTechniques over a fixed list of sample strings and
 * concatenates the reports, each followed by a dashed separator.
 *
 * @returns {string} The combined report for all sample strings.
 */
function runWidthTests() {
  const samples = ["Hello", "MMMMMM", "iiiiii", "Mixed width chars", "👨👩👧👦 Family emoji"];
  const separator = "\n----------------------------\n\n";
  return samples.map((sample) => testWidthReductionTechniques(sample) + separator).join("");
}
/**
 * Measures the rendered width of a string with the Canvas 2D API
 * (browser only).
 *
 * Outside a browser (no global `document`), or when no 2D canvas context can
 * be obtained, nothing is measured and the returned object carries a `note`
 * explaining why instead of `width` / `pixelsPerChar`.
 *
 * @param {string} text - Text to measure.
 * @param {string} [fontStyle="16px Arial"] - CSS font shorthand assigned to the context.
 * @returns {{text: string, fontStyle: string, width?: number, pixelsPerChar?: number, note?: string}}
 *   Measurement result. `pixelsPerChar` is the width divided by the number of
 *   Unicode code points in `text`, and 0 for empty text.
 */
function measureVisualWidth(text, fontStyle = "16px Arial") {
  // Check if we're in a browser environment
  if (typeof document === "undefined") {
    return { text, fontStyle, note: "Cannot measure width (not in browser environment)" };
  }

  // getContext() may return null when the 2D context is unavailable.
  const context = document.createElement("canvas").getContext("2d");
  if (!context) {
    return { text, fontStyle, note: "Cannot measure width (2D canvas context unavailable)" };
  }
  context.font = fontStyle;

  const { width } = context.measureText(text);
  // Count code points so an astral character (surrogate pair) counts once,
  // and avoid 0 / 0 (NaN) for empty text.
  const charCount = [...String(text)].length;
  return {
    text,
    fontStyle,
    width,
    pixelsPerChar: charCount === 0 ? 0 : width / charCount,
  };
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Converts a numeric Unicode code point into the corresponding string.
 *
 * Despite the name, the character itself is returned, not a "\uXXXX"
 * escape sequence.
 *
 * @param {number} charNum - Unicode code point (0 to 0x10FFFF).
 * @returns {string} The character for that code point.
 * @throws {RangeError} If `charNum` is not a valid code point
 *   (thrown by String.fromCodePoint).
 */
function intToUnicodeHexEscape(charNum) {
  return String.fromCodePoint(charNum);
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// depends on unicodeCombinationCharactersRange.json
/**
 * Builds a text table that shows every code point of every range listed in
 * `unicodeCombinationCharactersRange.uCRanges` applied to two sample base
 * characters.
 *
 * The table is produced three times: plain, with a Zero Width Joiner
 * appended after each tested character, and with a Zero Width Non-Joiner
 * appended after each tested character.
 *
 * @param {string} [baseChar=" "] - First base character the marks are applied to.
 * @param {string} [secChar=" "] - Second base character the marks are applied to.
 * @returns {string} The formatted table.
 */
function printAllCombiningCharacters(baseChar = " ", secChar = " ") {
  const ROW_SIZE = 8; // cells per output row
  // Format characters that are invisible on their own: shown inside brackets.
  const bracketed = new Set([0x200b, 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, 0x034f]);
  // Joiners / double marks that act between two characters: shown between both bases.
  const betweenBases = new Set([0x200d, 0x200c, 0x035c, 0x0361]);
  const rowLabel = (codePoint) => `| U+${codePoint.toString(16).toUpperCase().padStart(4, "0")}\n`;

  const variants = [
    { title: "Regular", add: "" },
    { title: "With ZWJ", add: "\u200D" },
    { title: "With ZWNJ", add: "\u200C" },
  ];

  let result = "";
  for (const { title, add } of variants) {
    result += "█".repeat(24) + "\n████ All Combinable Chars, " + title + "\n";
    for (const { name, start, end } of unicodeCombinationCharactersRange.uCRanges) {
      result += `\n\n=== ${name} (U+${start.toString(16).toUpperCase()}-U+${end
        .toString(16)
        .toUpperCase()}) ===\n\n`;

      let cellsInRow = 0;
      for (let idx = start; idx <= end; idx++) {
        // The tested character, followed by the variant's joiner (if any).
        const char = String.fromCodePoint(idx) + add;
        result += " ";
        if (bracketed.has(idx)) {
          result += `|[${baseChar}${char}]|[${secChar}${char}]`;
        } else if (betweenBases.has(idx)) {
          result += `| ${baseChar}${char}${secChar}| ${secChar}${char}${baseChar}`;
        } else {
          result += `| ${baseChar}${char} | ${secChar}${char} `;
        }

        // Close the row after ROW_SIZE cells, labelled with its last code point.
        cellsInRow += 1;
        if (cellsInRow === ROW_SIZE) {
          result += rowLabel(idx);
          cellsInRow = 0;
        }
      }
      // Close a trailing partial row so every range ends on a labelled line.
      if (cellsInRow > 0) {
        result += rowLabel(end);
      }
    }
  }
  return result;
}
| // Example usage: | |
| // console.log(printAllCombiningCharacters()); | |
/**
 * Lists every code point from `start` to `end` (inclusive), each applied to
 * `baseLetter` and to the fixed sample letters "o" and "i".
 *
 * @param {number} start - First code point of the range.
 * @param {number} end - Last code point of the range (inclusive).
 * @param {string} [baseLetter="a"] - Base letter shown next to each code point label.
 * @returns {string} Header line followed by the entries, three per output line.
 */
function printCombiningCharactersRange(start, end, baseLetter = "a") {
  const upperHex = (value) => value.toString(16).toUpperCase();
  const pieces = [`Combining characters from U+${upperHex(start)} to U+${upperHex(end)}:\n\n`];

  for (let codePoint = start; codePoint <= end; codePoint++) {
    const mark = String.fromCodePoint(codePoint);
    pieces.push(`U+${upperHex(codePoint).padStart(4, "0")}: ${baseLetter}${mark} `);
    // Same mark on two other base letters for comparison.
    pieces.push(`| o${mark} | i${mark} `);
    // Break the line after every third entry.
    const entriesSoFar = codePoint - start + 1;
    if (entriesSoFar % 3 === 0) {
      pieces.push("\n");
    }
  }

  return pieces.join("");
}
/**
 * Builds a report that appends each candidate "width reducing" character to
 * every character of a set of sample strings, so the results can be compared
 * visually.
 *
 * @returns {string} One section per candidate character, one line per sample string.
 */
function testWidthReducingCharacters() {
  // Characters that might affect width in some fonts/contexts
  const candidates = [
    { code: 0x200d, name: "Zero Width Joiner (ZWJ)" },
    { code: 0x200c, name: "Zero Width Non-Joiner (ZWNJ)" },
    { code: 0x034f, name: "Combining Grapheme Joiner (CGJ)" },
    { code: 0x035c, name: "Combining Double Breve Below" },
    { code: 0x0361, name: "Combining Double Inverted Breve" },
    { code: 0x0311, name: "Combining Inverted Breve" },
    { code: 0x0306, name: "Combining Breve" },
    { code: 0x0310, name: "Combining Candrabindu" },
    { code: 0x0344, name: "Combining Greek Dialytika Tonos" },
    { code: 0x0323, name: "Combining Dot Below" },
    { code: 0x033e, name: "Combining Vertical Tilde" },
    { code: 0x0355, name: "Combining Right Arrowhead Below" },
    { code: 0x035d, name: "Combining Double Breve" },
    { code: 0x0360, name: "Combining Double Tilde" },
  ];
  // Sample strings of various widths
  const samples = ["m", "i", "w", "l", "nnnn", "iiii", "wwww", "mmmm", "....."];

  const sections = candidates.map(({ code, name }) => {
    const mark = String.fromCodePoint(code);
    const heading = `U+${code.toString(16).toUpperCase().padStart(4, "0")} ${name}:\n`;
    const lines = samples.map((sample) => {
      // Follow every character of the sample with the candidate mark.
      const marked = sample
        .split("")
        .map((unit) => unit + mark)
        .join("");
      return ` Base: "${sample}" → Modified: "${marked}"\n`;
    });
    return `${heading}${lines.join("")}\n`;
  });

  return `=== Testing Potentially Width-Reducing Characters ===\n\n${sections.join("")}`;
}
| // Examples: | |
| // console.log(printCombiningCharactersRange(0x0300, 0x0310)); | |
| // console.log(testWidthReducingCharacters()); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// dependencies:
// - unicodeCombinationCharactersRange.json
// - combineRanges and the RegExp
//
// Intended usage of uCombine, as described by the original notes. These are
// kept as comments: as executable statements they called uCombine before
// isCombineCharRegExp was initialised, which throws when the file loads.
// NOTE(review): the expected values below are the author's illustrations and
// have not been verified against the implementation.
//
// unicode combine: for each addition, combine it with the base character.
//   uCombine("a", ["`"]) == "à";
//   uCombine("o", ["^"]) == "ô";
// apply all combination characters once to every character in base.
//   uCombine("b", ["A̪","B̫","C̬"]) == "b̪̫̬";
// extract the combining characters from each addition, ignore the base character.
//   uCombine("bob", ["A̪","B̫","C̬"]) == "b̪̫̬o̪̫̬b̪̫̬";
// otherwise lots of garbage characters like spaces would fall through.
//   uCombine("e", [" ̄", " ̄̄", " ̳̳", "x̳̳"]) == "ē̳̳̳̄̄";
// each addition contributes a single character: the first valid combining
// character found when reading from the end; the rest is ignored.
// whitespace is trimmed in case the addition string was copied quickly.
//   uCombine("x", ["tá 10 g̊ "]) == "x̊"

// One "\u{start}-\u{end}" character-class range per entry of the JSON table.
// Code point escapes are used instead of raw characters so no range endpoint
// can be misread as regular-expression syntax.
const combineRanges = unicodeCombinationCharactersRange.uCRanges.map(({ start, end }) => {
  const toEscape = (codePoint) => `\\u{${codePoint.toString(16)}}`;
  return `${toEscape(start)}-${toEscape(end)}`;
});
// The ranges must be wrapped in [...] to form a character class; the "u" flag
// enables the \u{...} escapes. Matches any single character in the table.
const isCombineCharRegExp = new RegExp(`[${combineRanges.join("")}]`, "u");
/**
 * Appends combining characters to every character of `base`.
 *
 * From each entry of `additions` (after trimming whitespace) a single
 * character is taken: the first one, scanning from the end, that matches
 * `isCombineCharRegExp`. Entries without such a character, empty entries and
 * non-string entries contribute nothing. The collected characters are then
 * appended, in `additions` order, after each code point of `base`.
 *
 * `base` is split by Unicode code point, so characters outside the BMP are
 * not broken apart.
 *
 * @param {string} base - Text whose characters receive the combining characters.
 * @param {string[]} additions - Strings to extract one combining character from each.
 * @returns {string} The combined text; "" for an empty/missing `base`, and
 *   `base` unchanged when `additions` is missing, not an array, or empty.
 */
function uCombine(base, additions) {
  if (!base || base.length === 0) return "";
  if (!Array.isArray(additions) || additions.length === 0) return base;

  // The marks do not depend on the base character, so collect them once.
  const marks = [];
  for (const rawAddition of additions) {
    if (typeof rawAddition !== "string") continue;
    const candidates = [...rawAddition.trim()];
    // Read from the end and keep only the first combining character found.
    for (let k = candidates.length - 1; k >= 0; k--) {
      if (isCombineCharRegExp.test(candidates[k])) {
        marks.push(candidates[k]);
        break;
      }
    }
  }

  const suffix = marks.join("");
  return [...base].map((baseChar) => baseChar + suffix).join("");
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "meta": { | |
| "filename": "unicodeCombinationCharactersRange.json", | |
| "usage": "const char = intToUnicodeHexEscape(uCRanges[14].start)", | |
| "-":"--", | |
| "all unicode latest characters data": "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt", | |
| "top most ranges": "Default chars like à and ô", | |
| "from Hebrew to Arabic": "Additional combining character blocks", | |
| "bottom most ranges": "Special format characters that affect text rendering", | |
| "o": "k" | |
| }, | |
| "uCRanges": [ | |
| { "start": 768, "end": 879, "name": "Comb. Diacrt. Marks" }, | |
| { "start": 6832, "end": 6911, "name": "Comb. Diacrt. Marks Extended" }, | |
| { "start": 7616, "end": 7679, "name": "Comb. Diacrt. Marks Supplement" }, | |
| { "start": 8400, "end": 8447, "name": "Comb. Diacrt. Marks for Symbols" }, | |
| { "start": 65056, "end": 65071, "name": "Comb. Half Marks" }, | |
| { "start": 1425, "end": 1469, "name": "Hebrew Combining Marks" }, | |
| { "start": 1471, "end": 1471, "name": "Hebrew Point Rafe" }, | |
| { "start": 1473, "end": 1474, "name": "Hebrew Points" }, | |
| { "start": 1476, "end": 1477, "name": "Hebrew Mark" }, | |
| { "start": 1479, "end": 1479, "name": "Hebrew Point Qamats Qatan" }, | |
| { "start": 1552, "end": 1562, "name": "Arabic Combining Marks" }, | |
| { "start": 1611, "end": 1631, "name": "Arabic Fathatan...Sukun" }, | |
| { "start": 1648, "end": 1648, "name": "Arabic Letter Superscript Alef" }, | |
| { "start": 1750, "end": 1756, "name": "Arabic Small Fatha...Small Waw" }, | |
| { "start": 1759, "end": 1764, "name": "Arabic Small High..." }, | |
| { "start": 1767, "end": 1768, "name": "Arabic Small High Yeh/Noon" }, | |
| { "start": 1770, "end": 1773, "name": "Arabic Empty Centre..." }, | |
| { "start": 1809, "end": 1809, "name": "Syriac Abbreviation Mark" }, | |
| { "start": 1840, "end": 1866, "name": "Syriac Pthaha...Barrekh" }, | |
| { "start": 1958, "end": 1968, "name": "Thaana Combining Marks" }, | |
| { "start": 2027, "end": 2035, "name": "NKo Combining Marks" }, | |
| { "start": 2070, "end": 2073, "name": "Samaritan Marks" }, | |
| { "start": 2075, "end": 2083, "name": "Samaritan Marks" }, | |
| { "start": 2085, "end": 2087, "name": "Samaritan Marks" }, | |
| { "start": 2089, "end": 2093, "name": "Samaritan Marks" }, | |
| { "start": 2137, "end": 2139, "name": "Mandaic Affrication/Gemination" }, | |
| { "start": 2260, "end": 2273, "name": "Arabic Tone/Extended Marks" }, | |
| { "start": 2275, "end": 2303, "name": "Arabic Extended Marks" }, | |
| { "start": 8203, "end": 8203, "name": "Zero Width Space (ZWSP)" }, | |
| { "start": 8204, "end": 8204, "name": "Zero Width Non-Joiner (ZWNJ)" }, | |
| { "start": 8205, "end": 8205, "name": "Zero Width Joiner (ZWJ)" }, | |
| { "start": 8288, "end": 8288, "name": "Word Joiner (WJ)" }, | |
| { "start": 8289, "end": 8292, "name": "Function Appl...Invisible Plus" }, | |
| { "start": 8298, "end": 8303, "name": "Inhibit Symmetric...Nominal Digit" }, | |
| { "start": 847, "end": 847, "name": "Combining Grapheme Joiner (CGJ)" } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment