Last active
January 8, 2026 18:21
-
-
Save egorsmkv/a0d37cb780bd50930cd11ac21cdde7bc to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "utf8.h" | |
| #include <stdlib.h> | |
/**
 * Decode one UTF-8 encoded code point from the front of a byte range.
 *
 * Well-formed sequences of 1-4 bytes are decoded to their scalar value.
 * Anything else (stray continuation byte, truncated sequence, overlong
 * encoding, UTF-16 surrogate, value above U+10FFFF, bad lead byte) consumes
 * exactly one byte and yields the replacement character U+FFFD, so a caller
 * looping over a buffer always makes progress.
 *
 * @param bytes         Pointer to the first byte to decode.
 * @param len           Number of bytes readable from `bytes`.
 * @param out_codepoint Optional; receives the decoded scalar value, or
 *                      U+FFFD when the input is not well-formed.
 * @param out_invalid   Optional; receives true only when the input was not
 *                      well-formed. A correctly encoded U+FFFD
 *                      (EF BF BD) is reported as valid.
 * @return Number of bytes consumed (1-4), or 0 when `bytes` is NULL or
 *         `len` is 0. In the 0 case the output parameters are not written.
 */
size_t utf8_decode_advance(const char8_t *bytes, size_t len,
                           uint32_t *out_codepoint, bool *out_invalid) {
  if (!bytes || len == 0) {
    return 0;
  }

  const uint8_t b0 = bytes[0];
  uint32_t codepoint = 0xFFFD;
  size_t advance = 1;
  /* Tracked separately from the code point value: U+FFFD itself is a
   * legitimate scalar value and must not be mistaken for a decode error. */
  bool valid = false;

  if (b0 < 0x80) {
    /* Single-byte (ASCII) sequence. */
    codepoint = b0;
    valid = true;
  } else if ((b0 & 0xE0) == 0xC0 && len >= 2) {
    const uint8_t b1 = bytes[1];
    if ((b1 & 0xC0) == 0x80) {
      const uint32_t cp =
          ((uint32_t)(b0 & 0x1F) << 6) | (uint32_t)(b1 & 0x3F);
      /* Values below 0x80 would be overlong encodings (lead C0/C1). */
      if (cp >= 0x80) {
        codepoint = cp;
        advance = 2;
        valid = true;
      }
    }
  } else if ((b0 & 0xF0) == 0xE0 && len >= 3) {
    const uint8_t b1 = bytes[1];
    const uint8_t b2 = bytes[2];
    if ((b1 & 0xC0) == 0x80 && (b2 & 0xC0) == 0x80) {
      const uint32_t cp = ((uint32_t)(b0 & 0x0F) << 12) |
                          ((uint32_t)(b1 & 0x3F) << 6) |
                          (uint32_t)(b2 & 0x3F);
      /* Reject overlong forms and the UTF-16 surrogate range. */
      if (cp >= 0x800 && (cp < 0xD800 || cp > 0xDFFF)) {
        codepoint = cp;
        advance = 3;
        valid = true;
      }
    }
  } else if ((b0 & 0xF8) == 0xF0 && len >= 4) {
    const uint8_t b1 = bytes[1];
    const uint8_t b2 = bytes[2];
    const uint8_t b3 = bytes[3];
    if ((b1 & 0xC0) == 0x80 && (b2 & 0xC0) == 0x80 && (b3 & 0xC0) == 0x80) {
      const uint32_t cp = ((uint32_t)(b0 & 0x07) << 18) |
                          ((uint32_t)(b1 & 0x3F) << 12) |
                          ((uint32_t)(b2 & 0x3F) << 6) |
                          (uint32_t)(b3 & 0x3F);
      /* Reject overlong forms and values beyond the Unicode range. */
      if (cp >= 0x10000 && cp <= 0x10FFFF) {
        codepoint = cp;
        advance = 4;
        valid = true;
      }
    }
  }

  if (out_codepoint) {
    *out_codepoint = codepoint;
  }
  if (out_invalid) {
    *out_invalid = !valid;
  }
  return advance;
}
/**
 * Decode an entire UTF-8 buffer into a freshly malloc'ed array of code points.
 *
 * The input is walked with utf8_decode_advance(); every value it reports is
 * appended to the output, and every sequence it flags as invalid is counted.
 * The output array is sized for `len` elements, the most the input can
 * produce.
 *
 * @param input         UTF-8 bytes to decode; may be NULL.
 * @param len           Number of bytes in `input`.
 * @param out           Receives the allocated array (release with free()),
 *                      or NULL when nothing was decoded or on failure.
 * @param out_len       Receives the number of code points stored in `*out`.
 * @param invalid_count Receives the number of sequences flagged invalid.
 * @return false when an output pointer is NULL, the allocation size would
 *         overflow, or malloc fails; true otherwise (including for NULL or
 *         empty input, which produces an empty result).
 */
[[nodiscard]] bool utf8_decode_buffer(const char8_t *input, size_t len,
                                      uint32_t **out, size_t *out_len,
                                      size_t *invalid_count) {
  if (out == NULL || out_len == NULL || invalid_count == NULL) {
    return false;
  }

  /* Leave the outputs in a well-defined empty state on every early exit. */
  *out = NULL;
  *out_len = 0;
  *invalid_count = 0;

  if (input == NULL || len == 0) {
    return true;
  }

  /* Guard the size computation passed to malloc against wrap-around. */
  if (len > SIZE_MAX / sizeof(uint32_t)) {
    return false;
  }

  uint32_t *decoded = malloc(len * sizeof *decoded);
  if (decoded == NULL) {
    return false;
  }

  size_t produced = 0;
  size_t bad = 0;

  for (size_t pos = 0; pos < len;) {
    uint32_t cp = 0;
    bool is_bad = false;
    const size_t step =
        utf8_decode_advance(input + pos, len - pos, &cp, &is_bad);

    /* A zero step means no progress is possible; stop instead of spinning. */
    if (step == 0) {
      break;
    }

    bad += is_bad ? 1u : 0u;
    decoded[produced++] = cp;
    pos += step;
  }

  *out = decoded;
  *out_len = produced;
  *invalid_count = bad;
  return true;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#ifndef UTF8_UTIL_H
#define UTF8_UTIL_H

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <uchar.h>

/*
 * UTF8_UTIL_HAVE_CHAR8_T is an informational hint that the toolchain is
 * expected to supply char8_t itself. It is retained for code that tests it,
 * but it no longer controls whether the typedef below is emitted.
 */
#if defined(__clang__)
#if __has_feature(c_char8_t)
#define UTF8_UTIL_HAVE_CHAR8_T 1
#endif
#elif defined(__GNUC__)
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
#define UTF8_UTIL_HAVE_CHAR8_T 1
#endif
#endif

/*
 * Declared unconditionally: a compiler running in C23 mode does not guarantee
 * that the C library's <uchar.h> already provides char8_t. Since C11 a typedef
 * may be repeated as long as it names the same type, and C23 specifies char8_t
 * as unsigned char, so this is harmless when the library declares it too.
 */
typedef unsigned char char8_t;

/**
 * Decode one UTF-8 code point.
 *
 * @param bytes         Pointer to the first byte.
 * @param len           Number of available bytes.
 * @param out_codepoint Optional output for the decoded scalar value; receives
 *                      U+FFFD when the input is not a well-formed sequence.
 * @param out_invalid   Optional flag set when an invalid sequence is seen.
 * @return Number of bytes consumed: 1-4 for a well-formed sequence, 1 for an
 *         invalid or incomplete one (so callers always make progress), and
 *         0 only when `bytes` is NULL or `len` is 0.
 */
size_t utf8_decode_advance(const char8_t *bytes, size_t len,
                           uint32_t *out_codepoint, bool *out_invalid);

/**
 * Decode a UTF-8 buffer into a newly allocated UTF-32 array.
 *
 * Caller owns the returned buffer and releases it with free().
 *
 * @param input         UTF-8 bytes to decode; may be NULL.
 * @param len           Number of bytes in `input`.
 * @param out           Receives the allocated array, or NULL when the input
 *                      is NULL/empty or the call fails.
 * @param out_len       Receives the number of code points written.
 * @param invalid_count Receives the number of invalid sequences encountered.
 * @return true on success (including empty input); false when an output
 *         pointer is NULL or memory could not be allocated.
 */
[[nodiscard]] bool utf8_decode_buffer(const char8_t *input, size_t len,
                                      uint32_t **out, size_t *out_len,
                                      size_t *invalid_count);

#endif /* UTF8_UTIL_H */
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment