Skip to content

Instantly share code, notes, and snippets.

@egorsmkv
Last active January 8, 2026 18:21
Show Gist options
  • Select an option

  • Save egorsmkv/a0d37cb780bd50930cd11ac21cdde7bc to your computer and use it in GitHub Desktop.

Select an option

Save egorsmkv/a0d37cb780bd50930cd11ac21cdde7bc to your computer and use it in GitHub Desktop.
#include "utf8.h"
#include <stdlib.h>
/**
 * Decode a single UTF-8 encoded code point from the start of a byte range.
 *
 * A well-formed sequence yields its scalar value and consumes 1 to 4 bytes.
 * Anything else (stray continuation byte, unknown lead byte, truncated
 * sequence, overlong form, UTF-16 surrogate, value above U+10FFFF) yields
 * U+FFFD, consumes exactly one byte, and is reported through out_invalid so
 * the caller can resynchronise on the next byte.
 *
 * @param bytes         Pointer to the first byte to decode.
 * @param len           Number of bytes available at bytes.
 * @param out_codepoint Optional; receives the decoded value or U+FFFD.
 * @param out_invalid   Optional; receives true only when the input was
 *                      malformed. A well-formed encoding of U+FFFD itself
 *                      (EF BF BD) is reported as valid.
 * @return Number of bytes consumed (1 to 4), or 0 when bytes is NULL or len
 *         is 0; in that case the output parameters are not written.
 */
size_t utf8_decode_advance(const char8_t *bytes, size_t len,
                           uint32_t *out_codepoint, bool *out_invalid) {
  if (!bytes || len == 0) {
    return 0;
  }

  const uint8_t b0 = bytes[0];
  uint32_t codepoint = 0xFFFD; /* replacement character until proven valid */
  size_t advance = 1;
  /* Validity is tracked explicitly: comparing the result with 0xFFFD would
   * misreport a legitimately encoded U+FFFD as an error. */
  bool valid = false;

  if (b0 < 0x80) {
    /* Single byte (ASCII). */
    codepoint = b0;
    valid = true;
  } else if ((b0 & 0xE0) == 0xC0 && len >= 2) {
    /* Two-byte form: 110xxxxx 10xxxxxx. */
    const uint8_t b1 = bytes[1];
    if ((b1 & 0xC0) == 0x80) {
      const uint32_t cp =
          ((uint32_t)(b0 & 0x1F) << 6) | (uint32_t)(b1 & 0x3F);
      if (cp >= 0x80) { /* below 0x80 would be an overlong encoding */
        codepoint = cp;
        advance = 2;
        valid = true;
      }
    }
  } else if ((b0 & 0xF0) == 0xE0 && len >= 3) {
    /* Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx. */
    const uint8_t b1 = bytes[1];
    const uint8_t b2 = bytes[2];
    if ((b1 & 0xC0) == 0x80 && (b2 & 0xC0) == 0x80) {
      const uint32_t cp = ((uint32_t)(b0 & 0x0F) << 12) |
                          ((uint32_t)(b1 & 0x3F) << 6) |
                          (uint32_t)(b2 & 0x3F);
      /* Reject overlong forms and the surrogate range D800..DFFF. */
      if (cp >= 0x800 && (cp < 0xD800 || cp > 0xDFFF)) {
        codepoint = cp;
        advance = 3;
        valid = true;
      }
    }
  } else if ((b0 & 0xF8) == 0xF0 && len >= 4) {
    /* Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
    const uint8_t b1 = bytes[1];
    const uint8_t b2 = bytes[2];
    const uint8_t b3 = bytes[3];
    if ((b1 & 0xC0) == 0x80 && (b2 & 0xC0) == 0x80 && (b3 & 0xC0) == 0x80) {
      const uint32_t cp = ((uint32_t)(b0 & 0x07) << 18) |
                          ((uint32_t)(b1 & 0x3F) << 12) |
                          ((uint32_t)(b2 & 0x3F) << 6) |
                          (uint32_t)(b3 & 0x3F);
      /* Reject overlong forms and values past the Unicode maximum. */
      if (cp >= 0x10000 && cp <= 0x10FFFF) {
        codepoint = cp;
        advance = 4;
        valid = true;
      }
    }
  }

  if (out_codepoint) {
    *out_codepoint = codepoint;
  }
  if (out_invalid) {
    *out_invalid = !valid;
  }
  return advance;
}
/**
 * Decode a whole UTF-8 byte range into a freshly malloc'ed array of 32-bit
 * code points, one array element per utf8_decode_advance() step.
 *
 * All three output parameters are required and are reset to NULL / 0 / 0
 * before any decoding happens. A NULL or empty input is not an error: the
 * function returns true and leaves *out as NULL.
 *
 * @param input         Bytes to decode (may be NULL).
 * @param len           Number of bytes at input.
 * @param out           Receives the malloc'ed result array; caller owns it.
 * @param out_len       Receives the number of elements stored in *out.
 * @param invalid_count Receives how many steps were flagged invalid.
 * @return false when an output pointer is NULL, when len elements cannot be
 *         sized without overflow, or when allocation fails; true otherwise.
 */
[[nodiscard]] bool utf8_decode_buffer(const char8_t *input, size_t len,
                                      uint32_t **out, size_t *out_len,
                                      size_t *invalid_count) {
  if (out == NULL || out_len == NULL || invalid_count == NULL) {
    return false;
  }

  /* Put the outputs in a defined state before any early exit. */
  *out = NULL;
  *out_len = 0;
  *invalid_count = 0;

  if (input == NULL || len == 0) {
    return true;
  }
  /* Worst case is one code point per input byte; guard the size product. */
  if (len > SIZE_MAX / sizeof(uint32_t)) {
    return false;
  }

  uint32_t *decoded = malloc(len * sizeof(uint32_t));
  if (decoded == NULL) {
    return false;
  }

  const uint8_t *src = (const uint8_t *)input;
  size_t produced = 0;
  size_t bad_total = 0;

  for (size_t pos = 0; pos < len;) {
    uint32_t cp = 0;
    bool is_bad = false;
    const size_t step = utf8_decode_advance(src + pos, len - pos, &cp, &is_bad);
    if (step == 0) {
      break;
    }
    if (is_bad) {
      bad_total += 1;
    }
    decoded[produced] = cp;
    produced += 1;
    pos += step;
  }

  *out = decoded;
  *out_len = produced;
  *invalid_count = bad_total;
  return true;
}
/* UTF-8 decoding helpers: single code point and whole-buffer decoding. */
#ifndef UTF8_UTIL_H
#define UTF8_UTIL_H
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <uchar.h>
/*
 * Decide whether the toolchain is expected to supply char8_t:
 *   - clang: when __has_feature(c_char8_t) is true;
 *   - GCC:   when compiling as C23 or later (__STDC_VERSION__ >= 202311L).
 * Any other compiler leaves UTF8_UTIL_HAVE_CHAR8_T undefined.
 * NOTE(review): confirm that clang actually recognises the c_char8_t feature
 * name; if it does not, clang always takes the fallback typedef below.
 */
#if defined(__clang__)
#if __has_feature(c_char8_t)
#define UTF8_UTIL_HAVE_CHAR8_T 1
#endif
#elif defined(__GNUC__)
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L
#define UTF8_UTIL_HAVE_CHAR8_T 1
#endif
#endif
/* Fallback definition of char8_t when it was not detected above. */
#if !defined(UTF8_UTIL_HAVE_CHAR8_T)
typedef unsigned char char8_t;
#endif
/**
 * Decode one UTF-8 code point.
 *
 * @param bytes Pointer to the first byte.
 * @param len Number of available bytes.
 * @param out_codepoint Optional output for the decoded scalar value; receives
 *        U+FFFD when the sequence is rejected.
 * @param out_invalid Optional flag; receives true when an invalid sequence is
 *        seen, false otherwise.
 * @return Number of bytes consumed: 1 to 4 for a decoded sequence, 1 for an
 *         invalid or truncated sequence (which decodes as U+FFFD), and 0 only
 *         when bytes is NULL or len is 0, in which case the outputs are not
 *         written.
 */
size_t utf8_decode_advance(const char8_t *bytes, size_t len,
uint32_t *out_codepoint, bool *out_invalid);
/**
 * Decode a UTF-8 buffer into a newly allocated UTF-32 array.
 * Caller owns the returned buffer, which is allocated with malloc().
 *
 * The outputs are reset to NULL / 0 / 0 on entry. A NULL or empty input
 * returns true with *out left NULL.
 *
 * @param input Bytes to decode (may be NULL).
 * @param len Number of bytes at input.
 * @param out Required; receives the allocated array.
 * @param out_len Required; receives the number of elements written to *out.
 * @param invalid_count Required; receives the number of decoded elements that
 *        utf8_decode_advance() flagged as invalid.
 * @return false when out, out_len or invalid_count is NULL, when the
 *         allocation size would overflow, or when malloc() fails; true
 *         otherwise.
 *
 * NOTE(review): [[nodiscard]] is C23 attribute syntax and is used here
 * unconditionally, unlike char8_t which has a pre-C23 fallback above.
 */
[[nodiscard]] bool utf8_decode_buffer(const char8_t *input, size_t len,
uint32_t **out, size_t *out_len,
size_t *invalid_count);
#endif
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment