create unicode transformation functions

2024-04-03 15:28:46 -05:00 · 2024-04-03 15:28:46 -05:00 · db16c4b067
commit db16c4b067
parent 369223eafd
5 changed files with 350 additions and 16 deletions
--- a/src/common.h
+++ b/src/common.h
@ -330,6 +330,11 @@ struct string {
    u8 *text;
 };

+struct string16 { /* UTF-16 string view: a length plus a pointer to u16 code units */
+    u64 len; /* number of u16 units in `text` (a surrogate pair counts as 2) */
+    u16 *text; /* string16_from_string writes no terminator, so always read through `len` */
+};
+
 struct string32 {
    u64 len;
    u32 *text;
--- a/src/string.c
+++ b/src/string.c
@ -3,6 +3,7 @@
 #include "memory.h"
 #include "scratch.h"
 #include "math.h"
+#include "utf.h"

 /*
 * NOTE: Strings should be considered ~IMMUTABLE~
@ -548,20 +549,101 @@ struct string _string_format(struct arena *arena, struct string fmt, ...)
 }

 /* ========================== *
- * Unicode
+ * Unicode transformation
 * ========================== */

-/* Placeholder functions. Unicode not supported yet. */
-
-struct string32 string32_from_string(struct arena *arena, struct string str)
+/* utf8 <- utf16: transcodes `str16` into UTF-8 bytes pushed onto `arena`, one codepoint at a time */
+struct string string_from_string16(struct arena *arena, struct string16 str16)
 {
-    u32 *text = arena_push_array(arena, u32, str.len);
-    for (u64 i = 0; i < str.len; ++i) {
-        u8 c = str.text[i];
-        text[i] = (u32)c;
-    }
-    return (struct string32) {
-        .text = text,
-        .len = str.len
+    struct string res = {
+        .len = 0,
+        .text = arena_dry_push(arena, u8) /* NOTE(review): presumably the address the next push will return - confirm in the arena code */
    };
+
+    u64 pos16 = 0; /* read cursor into str16, in u16 units */
+    while (pos16 < str16.len) {
+        struct string16 str16_remaining = { .len = (str16.len - pos16), .text = str16.text + pos16 };
+        struct utf16_decode_result decoded = utf16_decode(str16_remaining); /* one codepoint from the head of the remaining input */
+        struct utf8_encode_result encoded = utf8_encode(decoded.codepoint); /* 1 to 4 bytes; '?' for codepoints above 0x10FFFF */
+
+        u8 *dest = arena_push_array(arena, u8, encoded.count8); /* NOTE(review): assumes successive pushes are contiguous with res.text - confirm */
+        MEMCPY(dest, encoded.chars8, encoded.count8);
+
+        pos16 += decoded.advance16; /* never 0 here: the remaining input is non-empty */
+        res.len += encoded.count8; /* output length is counted in bytes */
+    }
+
+    return res;
+}
+
+/* utf16 <- utf8
+ *
+ * Transcodes the UTF-8 bytes of `str8` into UTF-16 code units pushed onto
+ * `arena`, one decoded codepoint at a time. The returned `len` is counted in
+ * u16 units. Whatever codepoint utf8_decode reports for a malformed sequence
+ * is handed straight to utf16_encode.
+ * NOTE(review): assumes consecutive arena pushes are contiguous with the
+ * address returned by arena_dry_push - confirm against the arena code. */
+struct string16 string16_from_string(struct arena *arena, struct string str8)
+{
+    u64 units_written = 0;
+    u16 *out_begin = arena_dry_push(arena, u16);
+
+    for (u64 cursor = 0; cursor < str8.len;) {
+        struct string tail = {
+            .len = str8.len - cursor,
+            .text = str8.text + cursor
+        };
+        struct utf8_decode_result dec = utf8_decode(tail);
+        struct utf16_encode_result enc = utf16_encode(dec.codepoint);
+
+        /* Copy size is in bytes: two per u16 unit */
+        u16 *slot = arena_push_array(arena, u16, enc.count16);
+        MEMCPY(slot, enc.chars16, (enc.count16 * 2));
+
+        units_written += enc.count16;
+        cursor += dec.advance8;
+    }
+
+    return (struct string16) {
+        .len = units_written,
+        .text = out_begin
+    };
+}
+
+/* utf8 <- utf32
+ *
+ * Transcodes the u32 units of `str32` into UTF-8 bytes pushed onto `arena`.
+ * The returned `len` is counted in bytes. Units above 0x10FFFF come back from
+ * utf32_decode as U32_MAX, which utf8_encode turns into '?'.
+ * NOTE(review): assumes consecutive arena pushes are contiguous with the
+ * address returned by arena_dry_push - confirm against the arena code. */
+struct string string_from_string32(struct arena *arena, struct string32 str32)
+{
+    u64 bytes_written = 0;
+    u8 *out_begin = arena_dry_push(arena, u8);
+
+    /* The cursor always steps by exactly one u32 unit per pass */
+    for (u64 cursor = 0; cursor < str32.len; ++cursor) {
+        struct string32 tail = {
+            .len = str32.len - cursor,
+            .text = str32.text + cursor
+        };
+        struct utf32_decode_result dec = utf32_decode(tail);
+        struct utf8_encode_result enc = utf8_encode(dec.codepoint);
+
+        u8 *slot = arena_push_array(arena, u8, enc.count8);
+        MEMCPY(slot, enc.chars8, enc.count8);
+
+        bytes_written += enc.count8;
+    }
+
+    return (struct string) {
+        .len = bytes_written,
+        .text = out_begin
+    };
+}
+
+/* utf32 <- utf8
+ *
+ * Transcodes the UTF-8 bytes of `str8` into u32 units pushed onto `arena`,
+ * one unit per decoded codepoint. The returned `len` is counted in u32 units.
+ * NOTE(review): assumes consecutive arena pushes are contiguous with the
+ * address returned by arena_dry_push - confirm against the arena code. */
+struct string32 string32_from_string(struct arena *arena, struct string str8)
+{
+    u64 units_written = 0;
+    u32 *out_begin = arena_dry_push(arena, u32);
+
+    for (u64 cursor = 0; cursor < str8.len;) {
+        struct string tail = {
+            .len = str8.len - cursor,
+            .text = str8.text + cursor
+        };
+        struct utf8_decode_result dec = utf8_decode(tail);
+        struct utf32_encode_result enc = utf32_encode(dec.codepoint);
+
+        /* Every codepoint occupies exactly one output unit */
+        u32 *slot = arena_push(arena, u32);
+        *slot = enc.chars32;
+
+        units_written += 1;
+        cursor += dec.advance8;
+    }
+
+    return (struct string32) {
+        .len = units_written,
+        .text = out_begin
+    };
 }
--- a/src/string.h
+++ b/src/string.h
@ -86,11 +86,12 @@ struct string _string_format(struct arena *arena, struct string fmt, ...);
 struct string string_formatv(struct arena *arena, struct string fmt, va_list args);

 /* ========================== *
- * Unicode
+ * Unicode transformation
 * ========================== */

-/* TODO: Real unicode conversions */
-
-struct string32 string32_from_string(struct arena *arena, struct string str);
+struct string string_from_string16(struct arena *arena, struct string16 str16); /* utf8 <- utf16; result length is in bytes, output is pushed onto `arena` */
+struct string16 string16_from_string(struct arena *arena, struct string str8); /* utf16 <- utf8; result length is in u16 units */
+struct string string_from_string32(struct arena *arena, struct string32 str32); /* utf8 <- utf32; result length is in bytes */
+struct string32 string32_from_string(struct arena *arena, struct string str8); /* utf32 <- utf8; result length is in u32 units (one per codepoint) */

 #endif
--- a/src/utf.c
+++ b/src/utf.c
@ -0,0 +1,192 @@
+#include "utf.h"
+
+/* ========================== *
+ * utf8
+ * ========================== */
+
+GLOBAL READONLY u8 g_utf8_lens[32] = { /* indexed by the top 5 bits of a byte (byte >> 3): sequence length announced by that byte as a lead byte; 0 marks a continuation byte (0x80-0xBF) */
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,2,2,2,2,3,3,4,5 /* the final 5 (bytes 0xF8-0xFF) has no case in utf8_decode and falls to its default branch, i.e. invalid */
+};
+
+/*
+ * Decodes one codepoint from the front of `str`.
+ *
+ * Returns the codepoint together with the number of bytes to step past it:
+ *   - empty input: advance8 = 0, codepoint = U32_MAX
+ *   - valid sequence: advance8 = sequence length (1 to 4)
+ *   - stray continuation byte, unsupported lead byte (0xF8-0xFF), truncated
+ *     sequence or bad continuation byte: advance8 = 1, codepoint = U32_MAX,
+ *     so the caller resynchronizes on the very next byte
+ *
+ * Overlong forms and surrogate codepoints are not rejected here.
+ */
+struct utf8_decode_result utf8_decode(struct string str)
+{
+    u32 codepoint = U32_MAX;
+    u32 advance = 0;
+    if (str.len > 0) {
+        u8 c0 = str.text[0];
+        u8 utf8_len = g_utf8_lens[c0 >> 3];
+
+        /* Default for every failure path below: skip just the lead byte */
+        advance = 1;
+        switch (utf8_len) {
+            case 1: {
+                codepoint = c0;
+            } break;
+
+            case 2: {
+                if (str.len >= 2) {
+                    u8 c1 = str.text[1];
+                    if (g_utf8_lens[c1 >> 3] == 0) {
+                        codepoint  = (c1 & 0x3F) << 0;
+                        codepoint |= (c0 & 0x1F) << 6;
+                        advance = 2;
+                    }
+                }
+            } break;
+
+            case 3: {
+                if (str.len >= 3) {
+                    u8 c1 = str.text[1];
+                    u8 c2 = str.text[2];
+                    if (g_utf8_lens[c1 >> 3] == 0 &&
+                        g_utf8_lens[c2 >> 3] == 0) {
+                        codepoint  = (c2 & 0x3F) << 0;
+                        codepoint |= (c1 & 0x3F) << 6;
+                        codepoint |= (c0 & 0x0F) << 12;
+                        advance = 3;
+                    }
+                }
+            } break;
+
+            case 4: {
+                if (str.len >= 4) {
+                    u8 c1 = str.text[1];
+                    u8 c2 = str.text[2];
+                    u8 c3 = str.text[3];
+                    if (g_utf8_lens[c1 >> 3] == 0 &&
+                        g_utf8_lens[c2 >> 3] == 0 &&
+                        g_utf8_lens[c3 >> 3] == 0) {
+                        codepoint  = (c3 & 0x3F) << 0;
+                        codepoint |= (c2 & 0x3F) << 6;
+                        codepoint |= (c1 & 0x3F) << 12;
+                        codepoint |= (c0 & 0x07) << 18;
+                        /* All four bytes are consumed; stepping by 3 would
+                         * re-read the last continuation byte as a stray */
+                        advance = 4;
+                    }
+                }
+            } break;
+
+            /* 0 (continuation byte as lead) and 5 (0xF8-0xFF) are invalid */
+            default: break;
+        }
+    }
+
+    return (struct utf8_decode_result) {
+        .advance8 = advance,
+        .codepoint = codepoint
+    };
+}
+
+/*
+ * Encodes `codepoint` as UTF-8.
+ *
+ * The result holds 1 to 4 bytes in chars8 (lead byte first) and their count
+ * in count8; unused trailing entries stay zero. Codepoints above 0x10FFFF
+ * are replaced by a single '?'. Values in the surrogate range are encoded
+ * like any other 3-byte codepoint.
+ */
+struct utf8_encode_result utf8_encode(u32 codepoint)
+{
+    struct utf8_encode_result out = { 0 };
+
+    if (codepoint > 0x10FFFF) {
+        /* Invalid codepoint */
+        out.chars8[0] = '?';
+        out.count8 = 1;
+        return out;
+    }
+
+    if (codepoint < 0x80) {
+        out.chars8[0] = codepoint;
+        out.count8 = 1;
+    } else if (codepoint < 0x800) {
+        out.chars8[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
+        out.chars8[1] = 0x80 | (codepoint & 0x3F);
+        out.count8 = 2;
+    } else if (codepoint < 0x10000) {
+        out.chars8[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
+        out.chars8[1] = 0x80 | ((codepoint >> 6) & 0x3F);
+        out.chars8[2] = 0x80 | (codepoint & 0x3F);
+        out.count8 = 3;
+    } else {
+        out.chars8[0] = 0xF0 | ((codepoint >> 18) & 0x07);
+        out.chars8[1] = 0x80 | ((codepoint >> 12) & 0x3F);
+        out.chars8[2] = 0x80 | ((codepoint >> 6) & 0x3F);
+        out.chars8[3] = 0x80 | (codepoint & 0x3F);
+        out.count8 = 4;
+    }
+
+    return out;
+}
+
+/* ========================== *
+ * utf16
+ * ========================== */
+
+/*
+ * Decodes one codepoint from the front of `str`.
+ *
+ *   - empty input: advance16 = 0, codepoint = U32_MAX
+ *   - high surrogate followed by a low surrogate: advance16 = 2 and the
+ *     pair is combined into a codepoint in 0x10000-0x10FFFF
+ *   - anything else (including an unpaired surrogate): advance16 = 1 and
+ *     the unit is returned unchanged
+ */
+struct utf16_decode_result utf16_decode(struct string16 str)
+{
+    u32 codepoint = U32_MAX;
+    u32 advance = 0;
+
+    if (str.len >= 1) {
+        u16 c0 = str.text[0];
+        codepoint = c0;
+        advance = 1;
+        if (str.len >= 2) {
+            u16 c1 = str.text[1];
+            if ((0xD800 <= c0 && c0 < 0xDC00) && (0xDC00 <= c1 && c1 < 0xE000)) {
+                codepoint  = (c1 & 0x3FF) << 0;
+                codepoint |= (c0 & 0x3FF) << 10;
+                /* The 20 payload bits are an offset from 0x10000, the first
+                 * codepoint that needs a surrogate pair */
+                codepoint += 0x10000;
+                advance = 2;
+            }
+        }
+    }
+
+    return (struct utf16_decode_result) {
+        .advance16 = advance,
+        .codepoint = codepoint
+    };
+}
+
+/*
+ * Encodes `codepoint` as UTF-16.
+ *
+ * Codepoints up to 0xFFFF are stored as a single unit (surrogate values are
+ * passed through unchanged); 0x10000-0x10FFFF become a high/low surrogate
+ * pair; anything larger is replaced by a single '?'.
+ */
+struct utf16_encode_result utf16_encode(u32 codepoint)
+{
+    struct utf16_encode_result res = { 0 };
+
+    if (codepoint <= 0xFFFF) {
+        res.count16 = 1;
+        res.chars16[0] = codepoint;
+    } else if (codepoint <= 0x10FFFF) {
+        /* Surrogates carry the 20-bit offset from 0x10000, not the raw
+         * codepoint; without the subtraction the top bit is lost for
+         * codepoints at or above 0x100000 */
+        u32 offset = codepoint - 0x10000;
+        res.count16 = 2;
+        res.chars16[1] = 0xDC00 | ((offset >>  0) & 0x3FF);
+        res.chars16[0] = 0xD800 | ((offset >> 10) & 0x3FF);
+    } else {
+        /* Invalid codepoint */
+        res.count16 = 1;
+        res.chars16[0] = '?';
+    }
+
+    return res;
+}
+
+/* ========================== *
+ * utf32
+ * ========================== */
+
+/*
+ * Reads one codepoint from the front of `str`.
+ *
+ * Empty input yields advance32 = 0 and codepoint = U32_MAX. Otherwise
+ * advance32 is 1 and the codepoint is the first unit, or U32_MAX when that
+ * unit is above 0x10FFFF.
+ */
+struct utf32_decode_result utf32_decode(struct string32 str)
+{
+    struct utf32_decode_result out = {
+        .advance32 = 0,
+        .codepoint = U32_MAX
+    };
+
+    if (str.len == 0) {
+        return out;
+    }
+
+    out.advance32 = 1;
+
+    u32 unit = str.text[0];
+    if (unit <= 0x10FFFF) {
+        out.codepoint = unit;
+    }
+
+    return out;
+}
+
+/*
+ * Encodes `codepoint` as a single UTF-32 unit; values above 0x10FFFF are
+ * replaced by '?'.
+ */
+struct utf32_encode_result utf32_encode(u32 codepoint)
+{
+    struct utf32_encode_result out = { 0 };
+
+    /* Anything past the last Unicode codepoint is invalid */
+    out.chars32 = (codepoint <= 0x10FFFF) ? codepoint : '?';
+
+    return out;
+}
--- a/src/utf.h
+++ b/src/utf.h
@ -0,0 +1,54 @@
+#ifndef UTF_H
+#define UTF_H
+
+/* ========================== *
+ * utf8
+ * ========================== */
+
+struct utf8_decode_result { /* result of utf8_decode: one codepoint read from the front of a UTF-8 string */
+    u32 advance8; /* bytes to step past what was read; 0 only when the input was empty */
+    u32 codepoint; /* decoded value, or U32_MAX when the input was empty or malformed */
+};
+
+struct utf8_encode_result { /* result of utf8_encode: the UTF-8 bytes of one codepoint */
+    u32 count8; /* number of valid bytes in chars8 (1 to 4) */
+    u8 chars8[4]; /* encoded bytes, lead byte first; entries past count8 are zero */
+};
+
+struct utf8_decode_result utf8_decode(struct string str); /* decodes the first codepoint of `str`; does not modify or allocate anything */
+struct utf8_encode_result utf8_encode(u32 codepoint); /* codepoints above 0x10FFFF are encoded as a single '?' */
+
+/* ========================== *
+ * utf16
+ * ========================== */
+
+struct utf16_decode_result { /* result of utf16_decode: one codepoint read from the front of a UTF-16 string */
+    u32 advance16; /* u16 units consumed: 0 for empty input, 2 for a surrogate pair, otherwise 1 */
+    u32 codepoint; /* decoded value (an unpaired surrogate is returned as-is), or U32_MAX for empty input */
+};
+
+struct utf16_encode_result { /* result of utf16_encode: the UTF-16 units of one codepoint */
+    u32 count16; /* number of valid units in chars16 (1 or 2) */
+    u16 chars16[2]; /* encoded units, high surrogate first when count16 is 2; unused entry is zero */
+};
+
+struct utf16_decode_result utf16_decode(struct string16 str); /* decodes the first codepoint of `str`; does not modify or allocate anything */
+struct utf16_encode_result utf16_encode(u32 codepoint); /* codepoints above 0x10FFFF are encoded as a single '?' */
+
+/* ========================== *
+ * utf32
+ * ========================== */
+
+struct utf32_decode_result { /* result of utf32_decode: one codepoint read from the front of a UTF-32 string */
+    u32 advance32; /* u32 units consumed: 0 for empty input, otherwise 1 */
+    u32 codepoint; /* the unit itself, or U32_MAX when the input was empty or the unit exceeds 0x10FFFF */
+};
+
+struct utf32_encode_result { /* result of utf32_encode: always exactly one unit */
+    u32 chars32; /* the codepoint, or '?' when it exceeds 0x10FFFF */
+};
+
+struct utf32_decode_result utf32_decode(struct string32 str); /* reads the first unit of `str`; does not modify or allocate anything */
+struct utf32_encode_result utf32_encode(u32 codepoint); /* codepoints above 0x10FFFF are encoded as '?' */
+
+#endif