Skip to content

Instantly share code, notes, and snippets.

@annibal
Last active November 14, 2025 03:47
Show Gist options
  • Select an option

  • Save annibal/9f8de9055dd9b42dc124acffe1f8ac0a to your computer and use it in GitHub Desktop.

Select an option

Save annibal/9f8de9055dd9b42dc124acffe1f8ac0a to your computer and use it in GitHub Desktop.
Unicode Combination and Width Tests
// Demo run: print the report for one sample string, then the batch report.
// Both functions are declared further down; function declarations are hoisted,
// so calling them here works.
{
  const singleReport = testWidthReductionTechniques("Hello World");
  console.log(singleReport);

  const batchReport = runWidthTests();
  console.log(batchReport);
}
/**
 * Applies several experimental "width reduction" transformations to a string
 * (bidi controls, joiners and combining marks interleaved with the text) and
 * returns a plain-text report showing each transformed string together with
 * its code points.
 *
 * The transformations operate on whole code points, so characters outside the
 * BMP (e.g. emoji, stored as surrogate pairs) are never split in half.
 *
 * @param {string} inputText - Text to transform.
 * @returns {string} Multi-line report, one section per technique. The
 *   "(N chars)" figures are `String.prototype.length` values, i.e. UTF-16
 *   code units.
 */
function testWidthReductionTechniques(inputText) {
  // Directional control characters
  const RLM = "\u200F"; // Right-to-Left Mark
  const RLO = "\u202E"; // Right-to-Left Override
  const PDF = "\u202C"; // Pop Directional Formatting
  const RLI = "\u2067"; // Right-to-Left Isolate
  const LRI = "\u2066"; // Left-to-Right Isolate
  const PDI = "\u2069"; // Pop Directional Isolate
  // Joiners and combining marks
  const CGJ = "\u034F"; // Combining Grapheme Joiner
  const ZWJ = "\u200D"; // Zero Width Joiner
  const COMB_BREVE = "\u0306"; // Combining Breve
  const COMB_DBL_BREVE = "\u035C"; // Combining Double Breve Below
  const COMB_MACRON = "\u0304"; // Combining Macron

  // Space-separated lowercase hex code points, each padded to 4 digits.
  const describeCodePoints = (text) =>
    [...text].map((c) => c.codePointAt(0).toString(16).padStart(4, "0")).join(" ");

  // Every transform receives the text already split into code points.
  const techniques = [
    {
      name: "RTL Mark + Combining Joiner",
      transform: (chars) => chars.map((c) => c + RLM + CGJ).join(""),
    },
    {
      name: "RTL Override Pairs",
      transform: (chars) => {
        let result = "";
        for (let i = 0; i < chars.length; i += 2) {
          if (i + 1 < chars.length) {
            // Pair characters with RTL override
            result += chars[i] + RLO + chars[i + 1] + PDF;
          } else {
            // Odd trailing character is emitted unchanged
            result += chars[i];
          }
        }
        return result;
      },
    },
    {
      name: "Combining Marks Compression",
      transform: (chars) => chars.map((c) => c + COMB_BREVE).join(""),
    },
    {
      name: "RTL Isolate with Combining",
      transform: (chars) => {
        let result = "";
        for (let i = 0; i < chars.length; i += 3) {
          if (i + 2 < chars.length) {
            // Full group of three inside an RTL isolate, followed by a macron
            result += RLI + chars[i] + chars[i + 1] + chars[i + 2] + PDI + COMB_MACRON;
          } else if (i + 1 < chars.length) {
            // Two leftover characters: isolate without the macron
            result += RLI + chars[i] + chars[i + 1] + PDI;
          } else {
            // Single leftover character is emitted unchanged
            result += chars[i];
          }
        }
        return result;
      },
    },
    {
      name: "ZWJ Character Fusion",
      // A ZWJ goes between neighbours, never before the first character
      transform: (chars) => chars.join(ZWJ),
    },
    {
      name: "Bidirectional Layered Approach",
      transform: (chars) => {
        // Outer RTL isolate wrapping per-character isolates of alternating direction
        let result = RLI;
        chars.forEach((c, i) => {
          const isOddPosition = i % 2 === 1;
          result += (isOddPosition ? RLI : LRI) + c + PDI;
          // Add combining mark every other character
          if (isOddPosition) {
            result += COMB_DBL_BREVE;
          }
        });
        return result + PDI;
      },
    },
  ];

  const codePoints = [...inputText];
  const originalCodePoints = describeCodePoints(inputText);

  // Format results for display
  let output = "=== Width Reduction Techniques Test ===\n\n";
  output += `Original text: "${inputText}" (${inputText.length} chars)\n\n`;
  for (const technique of techniques) {
    const transformed = technique.transform(codePoints);
    output += `Technique: ${technique.name}\n`;
    output += `Transformed: "${transformed}" (${transformed.length} chars)\n`;
    output += `Original code points: ${originalCodePoints}\n`;
    output += `Transformed code points: ${describeCodePoints(transformed)}\n\n`;
  }
  return output;
}
/**
 * Runs testWidthReductionTechniques over a fixed set of sample strings and
 * concatenates the reports, each followed by a dashed separator.
 *
 * @returns {string} The combined report text.
 */
function runWidthTests() {
  const separator = "\n----------------------------\n\n";
  const samples = ["Hello", "MMMMMM", "iiiiii", "Mixed width chars", "👨‍👩‍👧‍👦 Family emoji"];
  return samples.map((sample) => testWidthReductionTechniques(sample) + separator).join("");
}
/**
 * Measures the rendered width of a string with a 2D canvas context.
 *
 * When no global `document` exists the function does not measure anything and
 * returns an object carrying a `note` instead of `width` / `pixelsPerChar`.
 *
 * @param {string} text - Text to measure.
 * @param {string} [fontStyle="16px Arial"] - Value assigned to the context's `font`.
 * @returns {{text: string, fontStyle: string, note: string} |
 *           {text: string, fontStyle: string, width: number, pixelsPerChar: number}}
 *   `pixelsPerChar` is `width / text.length` (UTF-16 code units), or 0 for
 *   empty text so that callers never receive NaN.
 */
function measureVisualWidth(text, fontStyle = "16px Arial") {
  // Check if we're in a browser environment
  if (typeof document === "undefined") {
    return { text, fontStyle, note: "Cannot measure width (not in browser environment)" };
  }

  // Create canvas for measurement
  const context = document.createElement("canvas").getContext("2d");
  context.font = fontStyle;

  // Measure text
  const { width } = context.measureText(text);
  return {
    text,
    fontStyle,
    width,
    // Guard the division: empty text would otherwise yield 0 / 0 = NaN.
    pixelsPerChar: text.length === 0 ? 0 : width / text.length,
  };
}
/**
 * Converts a numeric Unicode code point into the character it denotes.
 *
 * Despite the name, the return value is the literal character, not a
 * "\uXXXX"-style escape sequence.
 *
 * @param {number} charNum - Code point, e.g. 0x0300.
 * @returns {string} The character for that code point.
 * @throws {RangeError} If `charNum` is not a valid code point
 *   (raised by String.fromCodePoint).
 */
function intToUnicodeHexEscape(charNum) {
  return String.fromCodePoint(charNum);
}
// depends on unicodeCombinationCharactersRange.json
/**
 * Builds a text table that shows every code point of every range in
 * `unicodeCombinationCharactersRange.uCRanges` attached to two sample base
 * characters. The table is produced three times: plain, with a Zero Width
 * Joiner and with a Zero Width Non-Joiner.
 *
 * @param {string} [baseChar=" "] - First base character the marks are attached to.
 * @param {string} [secChar=" "] - Second base character the marks are attached to.
 * @returns {string} The formatted table.
 */
function printAllCombiningCharacters(baseChar = " ", secChar = " ") {
  const ranges = unicodeCombinationCharactersRange.uCRanges;
  const toHex = (codePoint) => codePoint.toString(16).toUpperCase().padStart(4, "0");

  // Format characters that are invisible on their own: shown inside brackets.
  const bracketed = new Set([0x200b, 0x2060, 0x2061, 0x2062, 0x2063, 0x2064, 0x034f]);
  // Joiners and double-width marks: shown between the two base characters.
  const sandwiched = new Set([0x200d, 0x200c, 0x035c, 0x0361]);

  // NOTE(review): the original declared these joiners (as unused "0x200D"
  // strings) but never applied them, so all three sections were identical.
  // Appending the joiner right after each mark is an assumption about the
  // intended output - confirm.
  const variants = [
    { title: "Regular", add: "" },
    { title: "With ZWJ", add: "\u200D" },
    { title: "With ZWNJ", add: "\u200C" },
  ];

  let result = "";
  for (const variant of variants) {
    result += "█".repeat(24) + "\n████ All Combinable Chars, " + variant.title + "\n";
    for (const range of ranges) {
      result += `\n\n=== ${range.name} (U+${toHex(range.start)}-U+${toHex(range.end)}) ===\n\n`;
      for (let idx = range.start; idx <= range.end; idx++) {
        result += " ";
        const char = String.fromCodePoint(idx) + variant.add;
        if (bracketed.has(idx)) {
          result += `|[${baseChar}${char}]|[${secChar}${char}]`;
        } else if (sandwiched.has(idx)) {
          result += `| ${baseChar}${char}${secChar}| ${secChar}${char}${baseChar}`;
        } else {
          result += `| ${baseChar}${char} | ${secChar}${char} `;
        }
        // Close the row after every 8 code points, labelled with the last one.
        if ((idx - range.start + 1) % 8 === 0) {
          result += `| U+${toHex(idx)}\n`;
        }
      }
    }
  }
  return result;
}
// Example usage:
// console.log(printAllCombiningCharacters());
/**
 * Lists every code point from `start` to `end` (inclusive) attached to
 * `baseLetter` and additionally to the letters "o" and "i".
 *
 * @param {number} start - First code point of the range.
 * @param {number} end - Last code point of the range.
 * @param {string} [baseLetter="a"] - Letter shown next to the code point label.
 * @returns {string} The listing; a line break follows every third entry.
 */
function printCombiningCharactersRange(start, end, baseLetter = "a") {
  const upperHex = (value) => value.toString(16).toUpperCase();

  const pieces = [`Combining characters from U+${upperHex(start)} to U+${upperHex(end)}:\n\n`];
  let emitted = 0;
  for (let codePoint = start; codePoint <= end; codePoint++) {
    const mark = String.fromCodePoint(codePoint);
    // Labelled sample on the caller's base letter ...
    pieces.push(`U+${upperHex(codePoint).padStart(4, "0")}: ${baseLetter}${mark} `);
    // ... followed by the same mark on two fixed letters.
    pieces.push(`| o${mark} | i${mark} `);
    emitted++;
    // Three entries per line
    if (emitted % 3 === 0) {
      pieces.push("\n");
    }
  }
  return pieces.join("");
}
/**
 * Produces a report in which each candidate character (joiners and combining
 * marks) is appended after every character of a set of sample strings.
 *
 * @returns {string} One section per candidate, one line per sample string.
 */
function testWidthReducingCharacters() {
  // Candidates that might affect width in some fonts/contexts
  const candidates = [
    { code: 0x200d, name: "Zero Width Joiner (ZWJ)" },
    { code: 0x200c, name: "Zero Width Non-Joiner (ZWNJ)" },
    { code: 0x034f, name: "Combining Grapheme Joiner (CGJ)" },
    { code: 0x035c, name: "Combining Double Breve Below" },
    { code: 0x0361, name: "Combining Double Inverted Breve" },
    { code: 0x0311, name: "Combining Inverted Breve" },
    { code: 0x0306, name: "Combining Breve" },
    { code: 0x0310, name: "Combining Candrabindu" },
    { code: 0x0344, name: "Combining Greek Dialytika Tonos" },
    { code: 0x0323, name: "Combining Dot Below" },
    { code: 0x033e, name: "Combining Vertical Tilde" },
    { code: 0x0355, name: "Combining Right Arrowhead Below" },
    { code: 0x035d, name: "Combining Double Breve" },
    { code: 0x0360, name: "Combining Double Tilde" },
  ];
  // Sample strings of various widths
  const samples = ["m", "i", "w", "l", "nnnn", "iiii", "wwww", "mmmm", "....."];

  const sections = candidates.map(({ code, name }) => {
    const mark = String.fromCodePoint(code);
    const heading = `U+${code.toString(16).toUpperCase().padStart(4, "0")} ${name}:\n`;
    const rows = samples.map((sample) => {
      // Follow every character of the sample with the candidate mark
      const modified = sample
        .split("")
        .map((unit) => unit + mark)
        .join("");
      return ` Base: "${sample}" → Modified: "${modified}"\n`;
    });
    return heading + rows.join("") + "\n";
  });

  return "=== Testing Potentially Width-Reducing Characters ===\n\n" + sections.join("");
}
// Examples:
// console.log(printCombiningCharactersRange(0x0300, 0x0310));
// console.log(testWidthReducingCharacters());
// dependencies:
// - unicodeCombinationCharactersRange.json
// - combineRanges and the RegExp (defined below; the ranges are now turned
//   into \u{...} escapes directly, so intToUnicodeHexEscape.js is not needed here)
//
// The author's usage examples for uCombine follow, kept verbatim as comments.
// They used to be bare expression statements, which called uCombine before
// isCombineCharRegExp was initialised and therefore threw at load time.
// NOTE(review): the expected values are the author's and are not verified
// here. In particular "`" and "^" are ASCII characters outside every range in
// uCRanges, so as implemented the first two calls return the base unchanged.
//
// unicode combine: for each addition, combine it with the base character.
//   uCombine("a", ["`"]) == "à";
//   uCombine("o", ["^"]) == "ô";
// apply all combination characters once to every character in base.
//   uCombine("b", ["A̪","B̫","C̬"]) == "b̪̫̬";
// extract the combining characters from each addition, ignore the base character.
//   uCombine("bob", ["A̪","B̫","C̬"]) == "b̪̫̬o̪̫̬b̪̫̬";
// otherwise lots of garbage characters like spaces would fall through.
//   uCombine("e", [" ̄", " ̄̄", " ̳̳", "x̳̳"]) == "ē̳̳̳̄̄";
// each addition contributes a single character: the first valid combining
// character found when reading from the end; the rest is ignored.
// whitespace is trimmed in case the addition string was copied hastily.
//   uCombine("x", ["tá 10 g̊ "]) == "x̊"
//
// One "start-end" character-class range per entry of uCRanges, written as
// \u{hex} escapes so that no code point can be misread as regex syntax.
const combineRanges = unicodeCombinationCharactersRange.uCRanges.map((r) => {
  const s = `\\u{${r.start.toString(16)}}`;
  const e = `\\u{${r.end.toString(16)}}`;
  return `${s}-${e}`;
});
// Matches any single character that falls inside one of the ranges. The
// surrounding [...] is required: without it the source is one long literal
// sequence instead of a character class. The "u" flag enables \u{...}.
const isCombineCharRegExp = new RegExp(`[${combineRanges.join("")}]`, "u");
/**
 * Appends combining characters to every character of `base`.
 *
 * Each entry of `additions` contributes at most one character: after trimming
 * whitespace, the entry is scanned from its end and the first character that
 * matches `isCombineCharRegExp` is taken. Entries without such a character
 * contribute nothing. The collected marks are appended, in the order of
 * `additions`, after every character of `base`.
 *
 * `base` is processed per code point, so characters stored as surrogate pairs
 * are kept intact.
 *
 * @param {string} base - Text whose characters receive the marks.
 * @param {string[]} additions - Strings that each carry a combining character.
 * @returns {string} The decorated text; "" when `base` is empty or falsy, and
 *   `base` itself when `additions` is missing, not an array, or empty.
 */
function uCombine(base, additions) {
  if (!base || base.length === 0) return "";
  if (!additions || !Array.isArray(additions) || additions.length === 0) return base;

  // The marks do not depend on the base character, so collect them once.
  let marks = "";
  for (const rawAddition of additions) {
    const candidates = Array.from(rawAddition.trim());
    // Read from the end: the last combining character of the addition wins.
    for (let k = candidates.length - 1; k >= 0; k--) {
      if (isCombineCharRegExp.test(candidates[k])) {
        marks += candidates[k];
        break;
      }
    }
  }

  return Array.from(base, (baseChar) => baseChar + marks).join("");
}
{
"meta": {
"filename": "unicodeCombinationCharactersRange.json",
"usage": "const char = intToUnicodeHexEscape(uCRanges[14].start)",
"-":"--",
"all unicode latest characters data": "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt",
"top most ranges": "Default chars like à and ô",
"from Hebrew to Arabic": "Additional combining character blocks",
"bottom most ranges": "Special format characters that affect text rendering",
"o": "k"
},
"uCRanges": [
{ "start": 768, "end": 879, "name": "Comb. Diacrt. Marks" },
{ "start": 6832, "end": 6911, "name": "Comb. Diacrt. Marks Extended" },
{ "start": 7616, "end": 7679, "name": "Comb. Diacrt. Marks Supplement" },
{ "start": 8400, "end": 8447, "name": "Comb. Diacrt. Marks for Symbols" },
{ "start": 65056, "end": 65071, "name": "Comb. Half Marks" },
{ "start": 1425, "end": 1469, "name": "Hebrew Combining Marks" },
{ "start": 1471, "end": 1471, "name": "Hebrew Point Rafe" },
{ "start": 1473, "end": 1474, "name": "Hebrew Points" },
{ "start": 1476, "end": 1477, "name": "Hebrew Mark" },
{ "start": 1479, "end": 1479, "name": "Hebrew Point Qamats Qatan" },
{ "start": 1552, "end": 1562, "name": "Arabic Combining Marks" },
{ "start": 1611, "end": 1631, "name": "Arabic Fathatan...Sukun" },
{ "start": 1648, "end": 1648, "name": "Arabic Letter Superscript Alef" },
{ "start": 1750, "end": 1756, "name": "Arabic Small Fatha...Small Waw" },
{ "start": 1759, "end": 1764, "name": "Arabic Small High..." },
{ "start": 1767, "end": 1768, "name": "Arabic Small High Yeh/Noon" },
{ "start": 1770, "end": 1773, "name": "Arabic Empty Centre..." },
{ "start": 1809, "end": 1809, "name": "Syriac Abbreviation Mark" },
{ "start": 1840, "end": 1866, "name": "Syriac Pthaha...Barrekh" },
{ "start": 1958, "end": 1968, "name": "Thaana Combining Marks" },
{ "start": 2027, "end": 2035, "name": "NKo Combining Marks" },
{ "start": 2070, "end": 2073, "name": "Samaritan Marks" },
{ "start": 2075, "end": 2083, "name": "Samaritan Marks" },
{ "start": 2085, "end": 2087, "name": "Samaritan Marks" },
{ "start": 2089, "end": 2093, "name": "Samaritan Marks" },
{ "start": 2137, "end": 2139, "name": "Mandaic Affrication/Gemination" },
{ "start": 2260, "end": 2273, "name": "Arabic Tone/Extended Marks" },
{ "start": 2275, "end": 2303, "name": "Arabic Extended Marks" },
{ "start": 8203, "end": 8203, "name": "Zero Width Space (ZWSP)" },
{ "start": 8204, "end": 8204, "name": "Zero Width Non-Joiner (ZWNJ)" },
{ "start": 8205, "end": 8205, "name": "Zero Width Joiner (ZWJ)" },
{ "start": 8288, "end": 8288, "name": "Word Joiner (WJ)" },
{ "start": 8289, "end": 8292, "name": "Function Appl...Invisible Plus" },
{ "start": 8298, "end": 8303, "name": "Inhibit Symmetric...Nominal Digit" },
{ "start": 847, "end": 847, "name": "Combining Grapheme Joiner (CGJ)" }
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment