power_play/src/utf.c

#include "utf.h"

/* ========================== *
 * utf8
 * ========================== */

/* Byte classification table, indexed by the top five bits of a byte
 * (byte >> 3). The value is the sequence length announced by a lead byte.
 * 0 marks a continuation byte (10xxxxxx). 5 marks the 11111xxx range, for
 * which utf8_decode has no matching case and therefore reports an invalid
 * sequence. */
GLOBAL READONLY u8 g_utf8_lens[32] = {
    /* 0xxxxxxx: single byte */
    1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,
    /* 10xxxxxx: continuation byte */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 110xxxxx: lead byte of a 2-byte sequence */
    2, 2, 2, 2,
    /* 1110xxxx: lead byte of a 3-byte sequence */
    3, 3,
    /* 11110xxx: lead byte of a 4-byte sequence */
    4,
    /* 11111xxx: never a valid lead byte */
    5
};

/* Decodes the first UTF-8 sequence found at the start of `str`.
 *
 * Result fields:
 *   .codepoint  The decoded value, or U32_MAX when `str` is empty or the
 *               leading bytes do not form a complete sequence (stray
 *               continuation byte, 11111xxx lead byte, truncated input, or
 *               a trailing byte that is not a continuation byte).
 *   .advance8   Number of bytes consumed: 0 for empty input, 1 for an
 *               invalid sequence (so a caller can resynchronise byte by
 *               byte), otherwise the full sequence length (1 to 4).
 *
 * NOTE: overlong encodings, surrogate values and values above 0x10FFFF are
 * not rejected by this function. */
struct utf8_decode_result utf8_decode(struct string str)
{
    u32 codepoint = U32_MAX;
    u32 advance = 0;
    if (str.len > 0) {
        u8 c0 = str.text[0];
        u8 utf8_len = g_utf8_lens[c0 >> 3];

        /* Default to skipping a single byte; only a fully validated
         * multi-byte sequence overrides this below. */
        advance = 1;
        switch (utf8_len) {
            case 1: {
                codepoint = c0;
            } break;

            case 2: {
                if (str.len >= 2) {
                    u8 c1 = str.text[1];
                    /* A table value of 0 identifies a continuation byte. */
                    if (g_utf8_lens[c1 >> 3] == 0) {
                        codepoint  = (c1 & 0x3F) << 0;
                        codepoint |= (c0 & 0x1F) << 6;
                        advance = 2;
                    }
                }
            } break;

            case 3: {
                if (str.len >= 3) {
                    u8 c1 = str.text[1];
                    u8 c2 = str.text[2];
                    if (g_utf8_lens[c1 >> 3] == 0 &&
                        g_utf8_lens[c2 >> 3] == 0) {
                        codepoint  = (c2 & 0x3F) << 0;
                        codepoint |= (c1 & 0x3F) << 6;
                        codepoint |= (c0 & 0x0F) << 12;
                        advance = 3;
                    }
                }
            } break;

            case 4: {
                if (str.len >= 4) {
                    u8 c1 = str.text[1];
                    u8 c2 = str.text[2];
                    u8 c3 = str.text[3];
                    if (g_utf8_lens[c1 >> 3] == 0 &&
                        g_utf8_lens[c2 >> 3] == 0 &&
                        g_utf8_lens[c3 >> 3] == 0) {
                        codepoint  = (c3 & 0x3F) << 0;
                        codepoint |= (c2 & 0x3F) << 6;
                        codepoint |= (c1 & 0x3F) << 12;
                        codepoint |= (c0 & 0x07) << 18;
                        /* All four bytes were consumed. Reporting fewer
                         * would make the caller re-read the last
                         * continuation byte as a new sequence. */
                        advance = 4;
                    }
                }
            } break;

            /* Continuation byte (0) or 11111xxx lead byte (5): invalid. */
            default: break;
        }
    }

    return (struct utf8_decode_result) {
        .advance8 = advance,
        .codepoint = codepoint
    };
}

/* Encodes `codepoint` as UTF-8.
 *
 * Result fields:
 *   .chars8  The encoded bytes, lead byte first.
 *   .count8  Number of bytes written (1 to 4).
 *
 * A value above 0x10FFFF is replaced by a single '?' byte. */
struct utf8_encode_result utf8_encode(u32 codepoint)
{
    struct utf8_encode_result out = { 0 };

    if (codepoint > 0x10FFFF) {
        /* Invalid codepoint */
        out.chars8[0] = '?';
        out.count8 = 1;
        return out;
    }

    if (codepoint < 0x80) {
        /* 0xxxxxxx */
        out.chars8[0] = codepoint;
        out.count8 = 1;
    } else if (codepoint < 0x800) {
        /* 110xxxxx 10xxxxxx */
        out.chars8[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
        out.chars8[1] = 0x80 | (codepoint & 0x3F);
        out.count8 = 2;
    } else if (codepoint < 0x10000) {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        out.chars8[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
        out.chars8[1] = 0x80 | ((codepoint >> 6) & 0x3F);
        out.chars8[2] = 0x80 | (codepoint & 0x3F);
        out.count8 = 3;
    } else {
        /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        out.chars8[0] = 0xF0 | ((codepoint >> 18) & 0x07);
        out.chars8[1] = 0x80 | ((codepoint >> 12) & 0x3F);
        out.chars8[2] = 0x80 | ((codepoint >> 6) & 0x3F);
        out.chars8[3] = 0x80 | (codepoint & 0x3F);
        out.count8 = 4;
    }

    return out;
}

/* ========================== *
 * utf16
 * ========================== */

struct utf16_decode_result utf16_decode(struct string16 str)
{
    u32 codepoint = U32_MAX;
    u32 advance = 0;

    if (str.len >= 1) {
        u16 c0 = str.text[0];
        codepoint = c0;
        advance = 1;
        if (str.len >= 2) {
            u16 c1 = str.text[1];
            if ((0xD800 <= c0 && c0 < 0xDC00) && (0xDC00 <= c1 && c1 < 0xE000)) {
                codepoint  = (c1 & 0x3FF) << 0;
                codepoint |= (c0 & 0x3FF) << 10;
                advance = 2;
            }
        }
    }

    return (struct utf16_decode_result) {
        .advance16 = advance,
        .codepoint = codepoint
    };
}

struct utf16_encode_result utf16_encode(u32 codepoint)
{
    struct utf16_encode_result res = { 0 };

    if (codepoint <= 0xFFFF) {
        res.count16 = 1;
        res.chars16[0] = codepoint;
    } else if (codepoint <= 0x10FFFF) {
        res.count16 = 2;
        res.chars16[1] = 0xDC00 | ((codepoint >>  0) & 0x3FF);
        res.chars16[0] = 0xD800 | ((codepoint >> 10) & 0x3FF);
    } else {
        /* Invalid codepoint */
        res.count16 = 1;
        res.chars16[0] = '?';
    }

    return res;
}

/* Returns non-zero when `c` lies in the high (leading) surrogate range
 * 0xD800..0xDBFF. */
b32 utf16_is_high_surrogate(u16 c)
{
    /* Every unit in 0xD800..0xDBFF shares the top six bits 110110. */
    return (c & 0xFC00) == 0xD800;
}

/* Returns non-zero when `c` lies in the low (trailing) surrogate range
 * 0xDC00..0xDFFF. */
b32 utf16_is_low_surrogate(u16 c)
{
    /* Every unit in 0xDC00..0xDFFF shares the top six bits 110111. */
    return (c & 0xFC00) == 0xDC00;
}

/* ========================== *
 * utf32
 * ========================== */

struct utf32_decode_result utf32_decode(struct string32 str)
{
    u32 codepoint = U32_MAX;
    u32 advance = 0;

    if (str.len >= 1) {
        u32 c = str.text[0];
        advance = 1;
        if (c <= 0x10FFFF) {
            codepoint = c;
        }
    }

    return (struct utf32_decode_result) {
        .advance32 = advance,
        .codepoint = codepoint
    };
}

/* Encodes `codepoint` as a single UTF-32 unit stored in .chars32.
 * A value above 0x10FFFF is replaced by '?'. */
struct utf32_encode_result utf32_encode(u32 codepoint)
{
    struct utf32_encode_result out = { 0 };

    /* Anything past the last Unicode code point is an invalid codepoint. */
    out.chars32 = (codepoint > 0x10FFFF) ? '?' : codepoint;

    return out;
}