#include "string.h" #include "arena.h" #include "memory.h" #include "scratch.h" #include "math.h" #include "uni.h" /* * NOTE: Strings should be considered ~IMMUTABLE~ * * All string functions return a new string as a result. Any strings used as * an argument (IE: in string_cat) will not be modified. * * Use the STR macro to create strings from string literals * * NOTE: It is valid for a string to have len 0 but a non-NULL text pointer. * Always check string.len rather than string.text for string presence. * (If we want to change this behavior then we need to check for length = 0 in * our functions that return a pointer from arena_dry_push, or guarantee that * all functions returning an arena_dry_push do allocate.) */ /* ========================== * * Conversion * ========================== */ #define INT_CHARS ("0123456789abcdef") struct string string_from_char(struct arena *arena, char c) { u8 *dest = arena_push(arena, u8); *dest = c; return (struct string) { .len = 1, .text = dest }; } struct string string_from_uint(struct arena *arena, u64 n, u32 base) { /* Base too large */ ASSERT(base <= (ARRAY_COUNT(INT_CHARS) - 1)); struct temp_arena scratch = scratch_begin(arena); /* Build backwards text starting from least significant digit */ u64 len = 0; u8 *backwards_text = arena_dry_push(scratch.arena, u8); do { string_from_char(scratch.arena, INT_CHARS[n % base]); ++len; n /= base; } while (n > 0); /* Reverse text into final string */ u8 *final_text = arena_push_array(arena, u8, len); for (u64 i = 0; i < len; ++i) { final_text[i] = backwards_text[len - i - 1]; } scratch_end(scratch); return (struct string) { .len = len, .text = final_text }; } struct string string_from_int(struct arena *arena, i64 n, u32 base) { u8 *final_text = arena_dry_push(arena, u8); u8 len = 0; if (n < 0) { /* Push sign */ string_from_char(arena, '-'); len = 1; n = -n; } /* Push unsigned number */ struct string uint_str = string_from_uint(arena, n, base); return (struct string) { .len = len + uint_str.len, .text = final_text }; } struct string string_from_ptr(struct arena *arena, void *ptr) { struct string prepend = string_copy(arena, STR("0x")); struct string uint_str = string_from_uint(arena, (u64)ptr, 16); return (struct string) { .len = prepend.len + uint_str.len, .text = prepend.text }; } struct string string_from_float(struct arena *arena, f64 f, u32 precision) { struct temp_arena scratch = scratch_begin(arena); u8 *final_text = arena_dry_push(arena, u8); u64 final_len = 0; if (F32_IS_NAN(f)) { final_len += string_copy(arena, STR("NaN")).len; } else if (f == F64_INFINITY) { final_len += string_copy(arena, STR("inf")).len; } else if (f == -F64_INFINITY) { final_len += string_copy(arena, STR("-inf")).len; } else { if (f < 0) { string_from_char(arena, '-'); f = -f; ++final_len; } /* Add one half of next precision level to round up */ f += 0.5 / (f64)math_pow_u64(10, (u8)precision); f64 part_whole = math_trunc64(f); f64 part_decimal = f - part_whole; /* Print whole part */ { /* Build backwards text starting from least significant digit */ u8 *backwards_text = arena_dry_push(scratch.arena, u8); u64 backwards_text_len = 0; do { u64 digit = (u64)math_round_to_int64(math_fmod64(part_whole, 10.0)); string_from_char(scratch.arena, INT_CHARS[digit % 10]); ++backwards_text_len; part_whole = math_trunc64(part_whole / 10.0); } while (part_whole > 0); /* Reverse text into final string */ arena_push_array(arena, u8, backwards_text_len); for (u64 i = backwards_text_len; i-- > 0;) { final_text[final_len++] = backwards_text[i]; } } /* Print decimal part */ if (precision > 0) { string_from_char(arena, '.'); for (u64 i = 0; i < precision; ++i) { part_decimal *= 10.0; u64 digit = (u64)part_decimal; part_decimal -= digit; string_from_char(arena, INT_CHARS[digit % 10]); } final_len += (u64)precision + 1; } } scratch_end(scratch); return (struct string) { .len = final_len, .text = final_text }; } /* ========================== * * String operations * ========================== */ struct string string_copy(struct arena *arena, struct string src) { struct string str = { .len = src.len, .text = arena_push_array(arena, u8, src.len) }; MEMCPY(str.text, src.text, src.len); return str; } struct string string_copy_buff(struct buffer dest_buff, struct string src) { u64 len = min_u64(dest_buff.size, src.len); struct string str = { .len = len, .text = dest_buff.data }; MEMCPY(str.text, src.text, src.len); return str; } struct string string_repeat(struct arena *arena, struct string src, u64 count) { u64 final_len = src.len * count; u8 *final_text = arena_push_array(arena, u8, final_len); for (u64 i = 0; i < count; ++i) { MEMCPY(final_text + (src.len * i), src.text, src.len); } return (struct string) { .text = final_text, .len = final_len }; } struct string string_cat(struct arena *arena, struct string str1, struct string str2) { struct string new_str = { 0 }; new_str.len = str1.len + str2.len; new_str.text = arena_push_array(arena, u8, new_str.len); MEMCPY(new_str.text, str1.text, str1.len); MEMCPY(new_str.text + str1.len, str2.text, str2.len); return new_str; } /* `arena` is where pieces items will be allocated. These strings point * into the existing supplied string and do not allocate any new text. */ struct string_array string_split(struct arena *arena, struct string str, struct string delim) { struct string_array pieces = { .count = 0, .strings = arena_dry_push(arena, struct string) }; struct string piece = { .len = 0, .text = str.text }; for (u64 i = 0; i <= str.len - delim.len; ++i) { /* Clamp comparison string so we don't overflow. */ struct string comp_str = { .len = delim.len, .text = &str.text[i] }; b32 is_delimiter = string_eq(comp_str, delim); b32 is_end = i == str.len - 1; if (!is_delimiter || is_end) { ++piece.len; } if (is_delimiter || is_end) { /* Delimiter found */ struct string *piece_pushed = arena_push(arena, struct string); *piece_pushed = piece; ++pieces.count; piece.text = piece.text + piece.len + delim.len; piece.len = 0; } } return pieces; } /* NOTE: Really slow */ struct string string_indent(struct arena *arena, struct string str, u32 indent) { struct temp_arena scratch = scratch_begin(arena); u64 final_len = 0; u8 *final_text = arena_dry_push(arena, u8); struct string_array split = string_split(scratch.arena, str, STR("\n")); for (u64 i = 0; i < split.count; ++i) { struct string piece = split.strings[i]; for (u32 j = 0; j < indent; ++j) { string_from_char(arena, ' '); ++final_len; } string_copy(arena, piece); final_len += piece.len; if (i < split.count - 1) { string_from_char(arena, '\n'); ++final_len; } } scratch_end(scratch); return (struct string) { .len = final_len, .text = final_text }; } struct string string_lower(struct arena *arena, struct string str) { struct string res = { 0 }; res.text = arena_push_array(arena, u8, str.len); res.len = str.len; for (u64 i = 0; i < str.len; ++i) { u8 c = str.text[i]; if (65 <= c && c <= 90) { c += 32; } res.text[i] = c; } return res; } b32 string_eq(struct string str1, struct string str2) { b32 eq = true; if (str1.len == str2.len) { for (u64 i = 0; i < str1.len; ++i) { if (str1.text[i] != str2.text[i]) { eq = false; break; } } } else { eq = false; } return eq; } b32 string_contains(struct string str, struct string substring) { if (substring.len > str.len) { return false; } for (u64 i = 0; i <= str.len - substring.len; ++i) { b32 match = true; for (u64 j = 0; j < substring.len; ++j) { if (str.text[i + j] != substring.text[j]) { match = false; break; } } if (match) { return true; } } return false; } b32 string_starts_with(struct string str, struct string substring) { if (str.len >= substring.len) { for (u64 i = 0; i < substring.len; ++i) { if (str.text[i] != substring.text[i]) { return false; } } return true; } return false; } b32 string_ends_with(struct string str, struct string substring) { if (str.len >= substring.len) { u64 start = str.len - substring.len; for (u64 i = 0; i < substring.len; ++i) { if (str.text[start + i] != substring.text[i]) { return false; } } return true; } return false; } /* ========================== * * Format * ========================== */ /* String formatting only has one format specifier: "%F". All specifier info is * included in the arguments (instead of w/ the specifier like in printf). This * is safer. * * Example: * string_format(arena, * STR("Hello there %F. You are %F feet %F inches tall!"), * FMT_STR(STR("George")), * FMT_UINT(6), * FMT_FLOAT(5.375)); * * NOTE: FMT_END must be passed as the last arg in the va_list (This is * done automatically by the `string_format` macro). * * Format arguments: * FMT_CHAR: Format a single u8 character * FMT_STR: Format a `string` struct * FMT_UINT: Format a u64 * FMT_SINT: Format an i64 * FMT_FLOAT: Format an f64 with DEFAULT_FMT_PRECISION * FMT_FLOAT_P: Format an f64 with specified precision * FMT_HEX: Format a u64 in hexadecimal notation * FMT_PTR: Format a pointer in hexadecimal notation prefixed by "0x" * * FMT_END (internal): Denote the end of the va_list * * TODO: * %n equivalent? (nothing) * %e/%E equivalent? (scientific notation of floats) * %o equivalent? (octal representation) */ struct string string_formatv(struct arena *arena, struct string fmt, va_list args) { __prof; u64 final_len = 0; u8 *final_text = arena_dry_push(arena, u8); u8 *end = fmt.text + fmt.len; b32 no_more_args = false; for (u8 *c = fmt.text; c < end; ++c) { u8 *next = ((c + 1) < end) ? (c + 1) : (u8 *)"\0"; /* Escape '%%' */ b32 escape = !no_more_args && *c == '%' && *next == '%'; if (escape) { /* Skip the escape '%' char from parsing */ ++c; } if (!no_more_args && !escape && *c == '%' && *next == 'F') { struct string parsed_str = { 0 }; /* Detect arg type and parse to string */ struct fmt_arg arg = va_arg(args, struct fmt_arg); switch (arg.type) { case FMT_TYPE_CHAR: { parsed_str = string_from_char(arena, arg.value.c); } break; case FMT_TYPE_STR: { parsed_str = string_copy(arena, arg.value.string); } break; case FMT_TYPE_UINT: { parsed_str = string_from_uint(arena, arg.value.uint, 10); } break; case FMT_TYPE_SINT: { parsed_str = string_from_int(arena, arg.value.sint, 10); } break; case FMT_TYPE_HEX: { parsed_str = string_from_uint(arena, arg.value.sint, 16); } break; case FMT_TYPE_PTR: { parsed_str = string_from_ptr(arena, arg.value.ptr); } break; case FMT_TYPE_FLOAT: { parsed_str = string_from_float(arena, arg.value.f.val, arg.value.f.precision); } break; case FMT_TYPE_END: { /* Unexpected end. Not enough FMT args passed to function. */ ASSERT(false); parsed_str = string_copy(arena, STR("")); no_more_args = true; } break; default: { /* Unknown format type */ ASSERT(false); parsed_str = string_copy(arena, STR("")); no_more_args = true; } break; } /* Update final string len / start */ final_len += parsed_str.len; /* Skip 'F' from parsing */ ++c; } else { /* Parse character normally */ string_from_char(arena, *c); ++final_len; } } #if RTC if (!no_more_args) { struct fmt_arg last_arg = va_arg(args, struct fmt_arg); /* End arg not reached. Too many FMT values passed to function. */ ASSERT(last_arg.type == FMT_TYPE_END); } #endif return (struct string) { .len = final_len, .text = final_text }; } struct string _string_format(struct arena *arena, struct string fmt, ...) { va_list args; va_start(args, fmt); struct string new_str = string_formatv(arena, fmt, args); va_end(args); return new_str; } /* ========================== * * Unicode * ========================== */ /* Codepoint iteration */ struct string_codepoint_iter string_codepoint_iter_begin(struct string str) { return (struct string_codepoint_iter) { .src = str }; } b32 string_codepoint_iter_next(struct string_codepoint_iter *iter) { if (iter->pos < iter->src.len) { struct string str_remaining = { .len = (iter->src.len - iter->pos), .text = iter->src.text + iter->pos }; struct uni_decode_utf8_result decoded = uni_decode_utf8(str_remaining); iter->pos += decoded.advance8; iter->codepoint = decoded.codepoint; return true; } else { return false; } } void string_codepoint_iter_end(struct string_codepoint_iter *iter) { /* Do nothing */ (UNUSED)iter; } /* utf8 <- utf16 */ struct string string_from_string16(struct arena *arena, struct string16 str16) { struct string res = { .len = 0, .text = arena_dry_push(arena, u8) }; u64 pos16 = 0; while (pos16 < str16.len) { struct string16 str16_remaining = { .len = (str16.len - pos16), .text = str16.text + pos16 }; struct uni_decode_utf16_result decoded = uni_decode_utf16(str16_remaining); struct uni_encode_utf8_result encoded = uni_encode_utf8(decoded.codepoint); u8 *dest = arena_push_array(arena, u8, encoded.count8); MEMCPY(dest, encoded.chars8, encoded.count8); pos16 += decoded.advance16; res.len += encoded.count8; } return res; } /* utf8 <- utf32 */ struct string string_from_string32(struct arena *arena, struct string32 str32) { struct string res = { .len = 0, .text = arena_dry_push(arena, u8) }; u64 pos32 = 0; while (pos32 < str32.len) { struct string32 str32_remaining = { .len = (str32.len - pos32), .text = str32.text + pos32 }; struct uni_decode_utf32_result decoded = uni_decode_utf32(str32_remaining); struct uni_encode_utf8_result encoded = uni_encode_utf8(decoded.codepoint); u8 *dest = arena_push_array(arena, u8, encoded.count8); MEMCPY(dest, &encoded.chars8, encoded.count8); pos32 += 1; res.len += encoded.count8; } return res; } /* utf16 <- utf8 */ struct string16 string16_from_string(struct arena *arena, struct string str8) { struct string16 res = { .len = 0, .text = arena_dry_push(arena, u16) }; u64 pos8 = 0; while (pos8 < str8.len) { struct string str8_remaining = { .len = (str8.len - pos8), .text = str8.text + pos8 }; struct uni_decode_utf8_result decoded = uni_decode_utf8(str8_remaining); struct uni_encode_utf16_result encoded = uni_encode_utf16(decoded.codepoint); u16 *dest = arena_push_array(arena, u16, encoded.count16); MEMCPY(dest, encoded.chars16, (encoded.count16 << 1)); pos8 += decoded.advance8; res.len += encoded.count16; } return res; } /* utf32 <- utf8 */ struct string32 string32_from_string(struct arena *arena, struct string str8) { struct string32 res = { .len = 0, .text = arena_dry_push(arena, u32) }; u64 pos8 = 0; while (pos8 < str8.len) { struct string str8_remaining = { .len = (str8.len - pos8), .text = str8.text + pos8 }; struct uni_decode_utf8_result decoded = uni_decode_utf8(str8_remaining); struct uni_encode_utf32_result encoded = uni_encode_utf32(decoded.codepoint); u32 *dest = arena_push(arena, u32); *dest = encoded.chars32; pos8 += decoded.advance8; res.len += 1; } return res; } /* ========================== * * Legacy strings * ========================== */ /* C narrow strings */ u64 cstr_len(char *cstr) { char *end = cstr; if (cstr) { while (*end) { ++end; } } return end - cstr; } char *cstr_from_string(struct arena *arena, struct string src) { u8 *text = arena_push_array(arena, u8, src.len + 1); MEMCPY(text, src.text, src.len); text[src.len] = 0; return (char *)text; } char *cstr_buff_from_string(struct buffer dest_buff, struct string src) { if (dest_buff.size > 0) { u64 len = min_u64(src.len, dest_buff.size - 1); MEMCPY(dest_buff.data, src.text, len); dest_buff.data[len] = 0; } return (char *)dest_buff.data; } struct string string_from_cstr(char *cstr) { u64 len = cstr_len(cstr); return (struct string) { .len = len, .text = (u8 *)cstr }; } struct string string_from_cstr_len(char *cstr, u64 len) { return (struct string) { .text = (u8 *)cstr, .len = len }; } /* C wide strings */ u64 wstr_len(wchar_t *wstr) { wchar_t *end = wstr; while (*end) { ++end; } return end - wstr; } wchar_t *wstr_from_string(struct arena *arena, struct string src) { struct string16 str16 = string16_from_string(arena, src); *arena_push(arena, u16) = 0; return (wchar_t *)str16.text; } wchar_t *wstr_from_string16(struct arena *arena, struct string16 src) { u16 *text = arena_push_array(arena, u16, src.len + 1); text[src.len] = 0; return (wchar_t *)text; } struct string string_from_wstr(struct arena *arena, wchar_t *wstr) { struct string16 str16 = string16_from_wstr(wstr); return string_from_string16(arena, str16); } struct string16 string16_from_wstr(wchar_t *wstr) { u64 len = wstr_len(wstr); return (struct string16) { .len = len, .text = (u16 *)wstr }; }