#include "utf.h"

/* ========================== *
 * utf8
 * ========================== */

/* Sequence-length lookup indexed by the top five bits of a byte (byte >> 3).
 *
 *   index  0..15 (0x00..0x7F) -> 1  single-byte character
 *   index 16..23 (0x80..0xBF) -> 0  continuation byte
 *   index 24..27 (0xC0..0xDF) -> 2  lead byte of a 2-byte sequence
 *   index 28..29 (0xE0..0xEF) -> 3  lead byte of a 3-byte sequence
 *   index 30     (0xF0..0xF7) -> 4  lead byte of a 4-byte sequence
 *   index 31     (0xF8..0xFF) -> 5  not handled by the decoder (treated as invalid)
 */
GLOBAL READONLY u8 g_utf8_lens[32] = {
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,2,2,2,2,3,3,4,5
};

/* Decodes the first character of `str`.
 *
 * Returns:
 *   - empty input:               codepoint = U32_MAX, advance8 = 0
 *   - invalid lead byte, bad continuation byte, or truncated sequence:
 *                                codepoint = U32_MAX, advance8 = 1
 *   - well-formed sequence:      decoded codepoint, advance8 = sequence length
 *
 * Only the structure of the sequence is validated (lead byte followed by the
 * right number of continuation bytes). Overlong forms, surrogate values and
 * results above 0x10FFFF are not rejected. */
struct utf8_decode_result utf8_decode(struct string str)
{
    u32 codepoint = U32_MAX;
    u32 advance = 0;

    if (str.len > 0) {
        u8 c0 = str.text[0];
        u8 utf8_len = g_utf8_lens[c0 >> 3];

        /* Always consume at least one byte so callers make progress on
         * malformed input. */
        advance = 1;

        switch (utf8_len) {
            case 1: {
                codepoint = c0;
            } break;

            case 2: {
                if (str.len >= 2) {
                    u8 c1 = str.text[1];
                    if (g_utf8_lens[c1 >> 3] == 0) {
                        codepoint  = (u32)(c1 & 0x3F) << 0;
                        codepoint |= (u32)(c0 & 0x1F) << 6;
                        advance = 2;
                    }
                }
            } break;

            case 3: {
                if (str.len >= 3) {
                    u8 c1 = str.text[1];
                    u8 c2 = str.text[2];
                    if (g_utf8_lens[c1 >> 3] == 0 &&
                        g_utf8_lens[c2 >> 3] == 0) {
                        codepoint  = (u32)(c2 & 0x3F) << 0;
                        codepoint |= (u32)(c1 & 0x3F) << 6;
                        codepoint |= (u32)(c0 & 0x0F) << 12;
                        advance = 3;
                    }
                }
            } break;

            case 4: {
                if (str.len >= 4) {
                    u8 c1 = str.text[1];
                    u8 c2 = str.text[2];
                    u8 c3 = str.text[3];
                    if (g_utf8_lens[c1 >> 3] == 0 &&
                        g_utf8_lens[c2 >> 3] == 0 &&
                        g_utf8_lens[c3 >> 3] == 0) {
                        codepoint  = (u32)(c3 & 0x3F) << 0;
                        codepoint |= (u32)(c2 & 0x3F) << 6;
                        codepoint |= (u32)(c1 & 0x3F) << 12;
                        codepoint |= (u32)(c0 & 0x07) << 16;
                        /* All four bytes were consumed. Advancing by only 3
                         * would make the caller re-read the last continuation
                         * byte as a new (invalid) character. */
                        advance = 4;
                    }
                }
            } break;

            /* Continuation byte in lead position (0) or 0xF8..0xFF (5). */
            default: break;
        }
    }

    return (struct utf8_decode_result) {
        .advance8 = advance,
        .codepoint = codepoint
    };
}

/* Encodes `codepoint` as 1-4 UTF-8 bytes in `chars8`, with the byte count in
 * `count8`. Codepoints above 0x10FFFF are replaced with a single '?'.
 * Surrogate values (0xD800..0xDFFF) are not rejected; they are written as a
 * 3-byte sequence. */
struct utf8_encode_result utf8_encode(u32 codepoint)
{
    struct utf8_encode_result res = { 0 };

    if (codepoint <= 0x7F) {
        res.count8 = 1;
        res.chars8[0] = codepoint;
    } else if (codepoint <= 0x7FF) {
        res.count8 = 2;
        res.chars8[1] = 0x80 | ((codepoint >> 0) & 0x3F);
        res.chars8[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
    } else if (codepoint <= 0xFFFF) {
        res.count8 = 3;
        res.chars8[2] = 0x80 | ((codepoint >> 0) & 0x3F);
        res.chars8[1] = 0x80 | ((codepoint >> 6) & 0x3F);
        res.chars8[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
    } else if (codepoint <= 0x10FFFF) {
        res.count8 = 4;
        res.chars8[3] = 0x80 | ((codepoint >> 0) & 0x3F);
        res.chars8[2] = 0x80 | ((codepoint >> 6) & 0x3F);
        res.chars8[1] = 0x80 | ((codepoint >> 12) & 0x3F);
        res.chars8[0] = 0xF0 | ((codepoint >> 18) & 0x07);
    } else {
        /* Invalid codepoint */
        res.count8 = 1;
        res.chars8[0] = '?';
    }

    return res;
}

/* ========================== *
 * utf16
 * ========================== */

/* Decodes the first character of `str`.
 *
 * Returns:
 *   - empty input:                    codepoint = U32_MAX, advance16 = 0
 *   - high surrogate + low surrogate: combined supplementary-plane codepoint
 *                                     (0x10000..0x10FFFF), advance16 = 2
 *   - anything else:                  the first code unit as-is, advance16 = 1
 *                                     (this includes unpaired surrogates) */
struct utf16_decode_result utf16_decode(struct string16 str)
{
    u32 codepoint = U32_MAX;
    u32 advance = 0;

    if (str.len >= 1) {
        u16 c0 = str.text[0];
        codepoint = c0;
        advance = 1;

        if (str.len >= 2) {
            u16 c1 = str.text[1];
            if ((0xD800 <= c0 && c0 < 0xDC00) &&
                (0xDC00 <= c1 && c1 < 0xE000)) {
                codepoint  = (u32)(c1 & 0x3FF) << 0;
                codepoint |= (u32)(c0 & 0x3FF) << 10;
                /* The 20 payload bits of a surrogate pair are an offset from
                 * the start of the supplementary planes. */
                codepoint += 0x10000;
                advance = 2;
            }
        }
    }

    return (struct utf16_decode_result) {
        .advance16 = advance,
        .codepoint = codepoint
    };
}

/* Encodes `codepoint` as 1-2 UTF-16 code units in `chars16`, with the unit
 * count in `count16`. Codepoints above 0x10FFFF are replaced with a single
 * '?'. Values in 0xD800..0xDFFF are not rejected; they are written as a
 * single code unit. */
struct utf16_encode_result utf16_encode(u32 codepoint)
{
    struct utf16_encode_result res = { 0 };

    if (codepoint <= 0xFFFF) {
        res.count16 = 1;
        res.chars16[0] = codepoint;
    } else if (codepoint <= 0x10FFFF) {
        /* Surrogate pairs carry (codepoint - 0x10000), a 20-bit value split
         * into two 10-bit halves. Without the subtraction the high half
         * overflows its 10 bits for codepoints >= 0x100000. */
        u32 offset = codepoint - 0x10000;
        res.count16 = 2;
        res.chars16[1] = 0xDC00 | ((offset >> 0) & 0x3FF);
        res.chars16[0] = 0xD800 | ((offset >> 10) & 0x3FF);
    } else {
        /* Invalid codepoint */
        res.count16 = 1;
        res.chars16[0] = '?';
    }

    return res;
}

/* Returns non-zero if `c` is in the high (leading) surrogate range
 * 0xD800..0xDBFF. */
b32 utf16_is_high_surrogate(u16 c)
{
    return 0xD800 <= c && c < 0xDC00;
}

/* Returns non-zero if `c` is in the low (trailing) surrogate range
 * 0xDC00..0xDFFF. */
b32 utf16_is_low_surrogate(u16 c)
{
    return 0xDC00 <= c && c < 0xE000;
}

/* ========================== *
 * utf32
 * ========================== */

/* Decodes the first character of `str`.
 *
 * Returns:
 *   - empty input:        codepoint = U32_MAX, advance32 = 0
 *   - value > 0x10FFFF:   codepoint = U32_MAX, advance32 = 1
 *   - otherwise:          the value itself,    advance32 = 1 */
struct utf32_decode_result utf32_decode(struct string32 str)
{
    u32 codepoint = U32_MAX;
    u32 advance = 0;

    if (str.len >= 1) {
        u32 c = str.text[0];
        advance = 1;
        if (c <= 0x10FFFF) {
            codepoint = c;
        }
    }

    return (struct utf32_decode_result) {
        .advance32 = advance,
        .codepoint = codepoint
    };
}

/* Encodes `codepoint` as a single UTF-32 unit in `chars32`. Codepoints above
 * 0x10FFFF are replaced with '?'. */
struct utf32_encode_result utf32_encode(u32 codepoint)
{
    struct utf32_encode_result res = { 0 };

    if (codepoint <= 0x10FFFF) {
        res.chars32 = codepoint;
    } else {
        /* Invalid codepoint */
        res.chars32 = '?';
    }

    return res;
}