diff --git a/src/common.h b/src/common.h
index d6a44593..a06518d2 100644
--- a/src/common.h
+++ b/src/common.h
@@ -330,6 +330,12 @@ struct string {
     u8 *text;
 };
 
+/* utf16 text; len counts u16 code units, not bytes */
+struct string16 {
+    u64 len;
+    u16 *text;
+};
+
 struct string32 {
     u64 len;
     u32 *text;
diff --git a/src/string.c b/src/string.c
index 42ad2a1a..e964f06f 100644
--- a/src/string.c
+++ b/src/string.c
@@ -3,6 +3,7 @@
 #include "memory.h"
 #include "scratch.h"
 #include "math.h"
+#include "utf.h"
 
 /*
  * NOTE: Strings should be considered ~IMMUTABLE~
@@ -548,20 +549,117 @@ struct string _string_format(struct arena *arena, struct string fmt, ...)
 }
 
 /* ========================== *
- * Unicode
+ * Unicode transformation
  * ========================== */
 
-/* Placeholder functions. Unicode not supported yet. */
-
-struct string32 string32_from_string(struct arena *arena, struct string str)
+/* utf8 <- utf16
+ * Re-encodes `str16` as utf8, one codepoint at a time. The result's `len` is
+ * in bytes. Codepoints that utf8_encode rejects come out as '?'.
+ * NOTE(review): assumes arena_dry_push yields the address the next push will
+ * use and that nothing else pushes onto `arena` during the loop - confirm. */
+struct string string_from_string16(struct arena *arena, struct string16 str16)
 {
-    u32 *text = arena_push_array(arena, u32, str.len);
-    for (u64 i = 0; i < str.len; ++i) {
-        u8 c = str.text[i];
-        text[i] = (u32)c;
-    }
-    return (struct string32) {
-        .text = text,
-        .len = str.len
+    struct string res = {
+        .len = 0,
+        .text = arena_dry_push(arena, u8)
     };
+
+    u64 pos16 = 0;
+    while (pos16 < str16.len) {
+        struct string16 str16_remaining = { .len = (str16.len - pos16), .text = str16.text + pos16 };
+        struct utf16_decode_result decoded = utf16_decode(str16_remaining);
+        struct utf8_encode_result encoded = utf8_encode(decoded.codepoint);
+
+        u8 *dest = arena_push_array(arena, u8, encoded.count8);
+        MEMCPY(dest, encoded.chars8, encoded.count8);
+
+        pos16 += decoded.advance16;
+        res.len += encoded.count8;
+    }
+
+    return res;
+}
+
+/* utf16 <- utf8
+ * Re-encodes `str8` as utf16, one codepoint at a time. The result's `len` is
+ * in u16 code units. Codepoints that utf16_encode rejects come out as '?'.
+ * NOTE(review): same contiguity assumption about `arena` as
+ * string_from_string16 - confirm. */
+struct string16 string16_from_string(struct arena *arena, struct string str8)
+{
+    struct string16 res = {
+        .len = 0,
+        .text = arena_dry_push(arena, u16)
+    };
+
+    u64 pos8 = 0;
+    while (pos8 < str8.len) {
+        struct string str8_remaining = { .len = (str8.len - pos8), .text = str8.text + pos8 };
+        struct utf8_decode_result decoded = utf8_decode(str8_remaining);
+        struct utf16_encode_result encoded = utf16_encode(decoded.codepoint);
+
+        u16 *dest = arena_push_array(arena, u16, encoded.count16);
+        MEMCPY(dest, encoded.chars16, encoded.count16 * sizeof(*dest));
+
+        pos8 += decoded.advance8;
+        res.len += encoded.count16;
+    }
+
+    return res;
+}
+
+/* utf8 <- utf32
+ * Re-encodes `str32` as utf8, one codepoint at a time. The result's `len` is
+ * in bytes. Values above 0x10FFFF come out as '?'.
+ * NOTE(review): same contiguity assumption about `arena` as
+ * string_from_string16 - confirm. */
+struct string string_from_string32(struct arena *arena, struct string32 str32)
+{
+    struct string res = {
+        .len = 0,
+        .text = arena_dry_push(arena, u8)
+    };
+
+    u64 pos32 = 0;
+    while (pos32 < str32.len) {
+        struct string32 str32_remaining = { .len = (str32.len - pos32), .text = str32.text + pos32 };
+        struct utf32_decode_result decoded = utf32_decode(str32_remaining);
+        struct utf8_encode_result encoded = utf8_encode(decoded.codepoint);
+
+        u8 *dest = arena_push_array(arena, u8, encoded.count8);
+        MEMCPY(dest, encoded.chars8, encoded.count8);
+
+        pos32 += decoded.advance32;
+        res.len += encoded.count8;
+    }
+
+    return res;
+}
+
+/* utf32 <- utf8
+ * Decodes `str8` into one u32 per decoded codepoint. The result's `len` is
+ * in u32 elements. Byte sequences utf8_decode rejects come out as '?'.
+ * NOTE(review): same contiguity assumption about `arena` as
+ * string_from_string16 - confirm. */
+struct string32 string32_from_string(struct arena *arena, struct string str8)
+{
+    struct string32 res = {
+        .len = 0,
+        .text = arena_dry_push(arena, u32)
+    };
+
+    u64 pos8 = 0;
+    while (pos8 < str8.len) {
+        struct string str8_remaining = { .len = (str8.len - pos8), .text = str8.text + pos8 };
+        struct utf8_decode_result decoded = utf8_decode(str8_remaining);
+        struct utf32_encode_result encoded = utf32_encode(decoded.codepoint);
+
+        u32 *dest = arena_push(arena, u32);
+        *dest = encoded.chars32;
+
+        pos8 += decoded.advance8;
+        res.len += 1;
+    }
+
+    return res;
 }
diff --git a/src/string.h b/src/string.h
index 366b07c5..c5f3d10e 100644
--- a/src/string.h
+++ b/src/string.h
@@ -86,11 +86,12 @@ struct string _string_format(struct arena *arena, struct string fmt, ...);
 struct string string_formatv(struct arena *arena, struct string fmt, va_list args);
 
 /* ========================== *
- * Unicode
+ * Unicode transformation
  * ========================== */
 
-/* TODO: Real unicode conversions */
-
-struct string32 string32_from_string(struct arena *arena, struct string str);
+struct string string_from_string16(struct arena *arena, struct string16 str16);
+struct string16 string16_from_string(struct arena *arena, struct string str8);
+struct string string_from_string32(struct arena *arena, struct string32 str32);
+struct string32 string32_from_string(struct arena *arena, struct string str8);
 
 #endif
diff --git a/src/utf.c b/src/utf.c
new file mode 100644
index 00000000..b6fe488f
--- /dev/null
+++ b/src/utf.c
@@ -0,0 +1,219 @@
+#include "utf.h"
+
+/* ========================== *
+ * utf8
+ * ========================== */
+
+/* Sequence length implied by the top 5 bits of a byte: 1 = ascii, 0 =
+ * continuation byte, 2-4 = lead byte. 5 marks 0xF8-0xFF, which no case in
+ * utf8_decode accepts. */
+GLOBAL READONLY u8 g_utf8_lens[32] = {
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,2,2,2,2,3,3,4,5
+};
+
+/* Decodes the codepoint at the start of `str`.
+ * Returns advance8 = 0 for an empty string. For a truncated or malformed
+ * sequence returns codepoint = U32_MAX with advance8 = 1, so callers can skip
+ * the bad byte and resynchronize. Overlong forms, surrogates and values above
+ * 0x10FFFF are not rejected here. */
+struct utf8_decode_result utf8_decode(struct string str)
+{
+    u32 codepoint = U32_MAX;
+    u32 advance = 0;
+    if (str.len > 0) {
+        u8 c0 = str.text[0];
+        u8 utf8_len = g_utf8_lens[c0 >> 3];
+
+        advance = 1;
+        switch (utf8_len) {
+            case 1: {
+                codepoint = c0;
+            } break;
+
+            case 2: {
+                if (str.len >= 2) {
+                    u8 c1 = str.text[1];
+                    if (g_utf8_lens[c1 >> 3] == 0) {
+                        codepoint = (c1 & 0x3F) << 0;
+                        codepoint |= (c0 & 0x1F) << 6;
+                        advance = 2;
+                    }
+                }
+            } break;
+
+            case 3: {
+                if (str.len >= 3) {
+                    u8 c1 = str.text[1];
+                    u8 c2 = str.text[2];
+                    if (g_utf8_lens[c1 >> 3] == 0 &&
+                        g_utf8_lens[c2 >> 3] == 0) {
+                        codepoint = (c2 & 0x3F) << 0;
+                        codepoint |= (c1 & 0x3F) << 6;
+                        codepoint |= (c0 & 0x0F) << 12;
+                        advance = 3;
+                    }
+                }
+            } break;
+
+            case 4: {
+                if (str.len >= 4) {
+                    u8 c1 = str.text[1];
+                    u8 c2 = str.text[2];
+                    u8 c3 = str.text[3];
+                    if (g_utf8_lens[c1 >> 3] == 0 &&
+                        g_utf8_lens[c2 >> 3] == 0 &&
+                        g_utf8_lens[c3 >> 3] == 0) {
+                        codepoint = (c3 & 0x3F) << 0;
+                        codepoint |= (c2 & 0x3F) << 6;
+                        codepoint |= (c1 & 0x3F) << 12;
+                        codepoint |= (c0 & 0x07) << 18; /* lead byte carries bits 18-20 */
+                        advance = 4; /* consume all four bytes */
+                    }
+                }
+            } break;
+
+            default: break;
+        }
+    }
+
+    return (struct utf8_decode_result) {
+        .advance8 = advance,
+        .codepoint = codepoint
+    };
+}
+
+/* Encodes `codepoint` as 1-4 utf8 bytes in chars8, with the byte count in
+ * count8. Values above 0x10FFFF encode as a single '?'. Surrogates
+ * (0xD800-0xDFFF) are not rejected. */
+struct utf8_encode_result utf8_encode(u32 codepoint)
+{
+    struct utf8_encode_result res = { 0 };
+
+    if (codepoint <= 0x7F) {
+        res.count8 = 1;
+        res.chars8[0] = codepoint;
+    } else if (codepoint <= 0x7FF) {
+        res.count8 = 2;
+        res.chars8[1] = 0x80 | ((codepoint >> 0) & 0x3F);
+        res.chars8[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
+    } else if (codepoint <= 0xFFFF) {
+        res.count8 = 3;
+        res.chars8[2] = 0x80 | ((codepoint >> 0) & 0x3F);
+        res.chars8[1] = 0x80 | ((codepoint >> 6) & 0x3F);
+        res.chars8[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
+    } else if (codepoint <= 0x10FFFF) {
+        res.count8 = 4;
+        res.chars8[3] = 0x80 | ((codepoint >> 0) & 0x3F);
+        res.chars8[2] = 0x80 | ((codepoint >> 6) & 0x3F);
+        res.chars8[1] = 0x80 | ((codepoint >> 12) & 0x3F);
+        res.chars8[0] = 0xF0 | ((codepoint >> 18) & 0x07);
+    } else {
+        /* Invalid codepoint */
+        res.count8 = 1;
+        res.chars8[0] = '?';
+    }
+
+    return res;
+}
+
+/* ========================== *
+ * utf16
+ * ========================== */
+
+/* Decodes the codepoint at the start of `str`.
+ * Returns advance16 = 0 and codepoint = U32_MAX for an empty string. A high
+ * surrogate followed by a low surrogate decodes as one codepoint with
+ * advance16 = 2; any other unit, including an unpaired surrogate, is returned
+ * as-is with advance16 = 1. */
+struct utf16_decode_result utf16_decode(struct string16 str)
+{
+    u32 codepoint = U32_MAX;
+    u32 advance = 0;
+
+    if (str.len >= 1) {
+        u16 c0 = str.text[0];
+        codepoint = c0;
+        advance = 1;
+        if (str.len >= 2) {
+            u16 c1 = str.text[1];
+            if ((0xD800 <= c0 && c0 < 0xDC00) && (0xDC00 <= c1 && c1 < 0xE000)) {
+                codepoint = (c1 & 0x3FF) << 0;
+                codepoint |= (c0 & 0x3FF) << 10;
+                codepoint += 0x10000; /* pairs store (codepoint - 0x10000) */
+                advance = 2;
+            }
+        }
+    }
+
+    return (struct utf16_decode_result) {
+        .advance16 = advance,
+        .codepoint = codepoint
+    };
+}
+
+/* Encodes `codepoint` as one utf16 unit, or as a surrogate pair when it is
+ * above 0xFFFF. Values above 0x10FFFF encode as a single '?'. Surrogate
+ * values (0xD800-0xDFFF) are passed through as a single unit. */
+struct utf16_encode_result utf16_encode(u32 codepoint)
+{
+    struct utf16_encode_result res = { 0 };
+
+    if (codepoint <= 0xFFFF) {
+        res.count16 = 1;
+        res.chars16[0] = codepoint;
+    } else if (codepoint <= 0x10FFFF) {
+        u32 offset = codepoint - 0x10000; /* 20 bits split across the pair */
+        res.count16 = 2;
+        res.chars16[1] = 0xDC00 | ((offset >> 0) & 0x3FF);
+        res.chars16[0] = 0xD800 | ((offset >> 10) & 0x3FF);
+    } else {
+        /* Invalid codepoint */
+        res.count16 = 1;
+        res.chars16[0] = '?';
+    }
+
+    return res;
+}
+
+/* ========================== *
+ * utf32
+ * ========================== */
+
+/* Reads the codepoint at the start of `str`.
+ * Returns advance32 = 0 for an empty string, otherwise advance32 = 1.
+ * codepoint is U32_MAX when the string is empty or the value is above
+ * 0x10FFFF. */
+struct utf32_decode_result utf32_decode(struct string32 str)
+{
+    u32 codepoint = U32_MAX;
+    u32 advance = 0;
+
+    if (str.len >= 1) {
+        u32 c = str.text[0];
+        advance = 1;
+        if (c <= 0x10FFFF) {
+            codepoint = c;
+        }
+    }
+
+    return (struct utf32_decode_result) {
+        .advance32 = advance,
+        .codepoint = codepoint
+    };
+}
+
+/* Returns `codepoint` unchanged in chars32, or '?' when it is above
+ * 0x10FFFF. */
+struct utf32_encode_result utf32_encode(u32 codepoint)
+{
+    struct utf32_encode_result res = { 0 };
+
+    if (codepoint <= 0x10FFFF) {
+        res.chars32 = codepoint;
+    } else {
+        /* Invalid codepoint */
+        res.chars32 = '?';
+    }
+
+    return res;
+}
diff --git a/src/utf.h b/src/utf.h
new file mode 100644
index 00000000..6c3a33ef
--- /dev/null
+++ b/src/utf.h
@@ -0,0 +1,54 @@
+#ifndef UTF_H
+#define UTF_H
+
+/* ========================== *
+ * utf8
+ * ========================== */
+
+struct utf8_decode_result {
+    u32 advance8; /* bytes consumed; 0 only for empty input */
+    u32 codepoint; /* U32_MAX when nothing valid was decoded */
+};
+
+struct utf8_encode_result {
+    u32 count8; /* bytes used in chars8 (1-4) */
+    u8 chars8[4];
+};
+
+struct utf8_decode_result utf8_decode(struct string str);
+struct utf8_encode_result utf8_encode(u32 codepoint);
+
+/* ========================== *
+ * utf16
+ * ========================== */
+
+struct utf16_decode_result {
+    u32 advance16; /* u16 units consumed; 0 only for empty input */
+    u32 codepoint; /* U32_MAX for empty input */
+};
+
+struct utf16_encode_result {
+    u32 count16; /* units used in chars16 (1-2) */
+    u16 chars16[2];
+};
+
+struct utf16_decode_result utf16_decode(struct string16 str);
+struct utf16_encode_result utf16_encode(u32 codepoint);
+
+/* ========================== *
+ * utf32
+ * ========================== */
+
+struct utf32_decode_result {
+    u32 advance32; /* u32 elements consumed (0 or 1) */
+    u32 codepoint; /* U32_MAX for empty input or a value above 0x10FFFF */
+};
+
+struct utf32_encode_result {
+    u32 chars32;
+};
+
+struct utf32_decode_result utf32_decode(struct string32 str);
+struct utf32_encode_result utf32_encode(u32 codepoint);
+
+#endif