power_play/src/json.c
2025-07-03 12:29:09 -05:00

859 lines
26 KiB
C

#include "json.h"
#include "arena.h"
#include "string.h"
#include "arena.h"
#include "math.h"
/* TODO (if we want to be JSON standard compliant):
* - Support unicode escape sequences in strings (\u)
* - Don't allow leading 0s in numbers
*/
/* ========================== *
* Lex
* ========================== */
/* Case-label helpers for use inside a `switch` on a source byte.
 * Each macro expands to a run of `case` labels with the colon of the LAST
 * label omitted, so the use site supplies it: `CASE_SPACE: { ... } break;`.
 * Several helpers can be stacked on consecutive lines to share one body. */

/* Line-ending bytes. */
#define CASE_NEWLINE \
case 0x0A: /* Line feed or New line */ \
case 0x0D /* Carriage return */
/* Horizontal whitespace bytes. */
#define CASE_SPACE \
case 0x20: /* Space */ \
case 0x09 /* Horizontal tab */
/* Any decimal digit. */
#define CASE_DIGIT_0_TO_9 \
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9'
/* Any non-zero decimal digit. */
#define CASE_DIGIT_1_TO_9 \
case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9'
/* The single-character punctuation tokens: separators and braces. */
#define CASE_SYMBOL \
case ',': case ':': case '[': case ']': case '{': case '}'
/* Kinds of token produced by lex(). */
enum token_type {
    /* Unrecognized input. lex() tests a freshly pushed token against this
     * value to detect that no known token type was assigned. */
    TOKEN_TYPE_UNKNOWN = 0,

    /* Literals */
    TOKEN_TYPE_NUMBER,
    TOKEN_TYPE_STRING,
    TOKEN_TYPE_KEYWORD_TRUE,
    TOKEN_TYPE_KEYWORD_FALSE,
    TOKEN_TYPE_KEYWORD_NULL,

    /* Punctuation */
    TOKEN_TYPE_COMMA,              /* , */
    TOKEN_TYPE_COLON,              /* : */
    TOKEN_TYPE_SQUARE_BRACE_OPEN,  /* [ */
    TOKEN_TYPE_SQUARE_BRACE_CLOSE, /* ] */
    TOKEN_TYPE_CURLY_BRACE_OPEN,   /* { */
    TOKEN_TYPE_CURLY_BRACE_CLOSE,  /* } */

    /* Sentinels marking the beginning and end of the token list */
    TOKEN_TYPE_BOF,
    TOKEN_TYPE_EOF
};
/* One lexed token; a node in a singly linked token list. */
struct token {
    enum token_type type;
    u64 start;          /* Byte offset in the source where the token begins */
    u64 end;            /* Byte offset one past the token's last byte */
    struct token *next; /* Following token (the EOF token points to itself) */
};
/* Head and tail of the singly linked list of tokens built by lex(). */
struct token_list {
    struct token *token_first; /* Head of the list; null while empty */
    struct token *token_last;  /* Tail of the list; new tokens are linked here */
};
/* Which part of a number literal is currently being scanned. */
enum lex_number_state {
    LEX_NUMBER_STATE_WHOLE = 0, /* Digits before any '.' or exponent marker */
    LEX_NUMBER_STATE_FRACTION,  /* Digits after the '.' */
    LEX_NUMBER_STATE_EXPONENT   /* Digits after 'e' / 'E' */
};
/* Keyword spelling, indexed by the keyword's first character.
 * Only the 'f', 'n' and 't' slots are explicitly initialized. */
GLOBAL READONLY struct string g_keyword_strings[] = {
    ['f'] = LIT_NOCAST("false"),
    ['n'] = LIT_NOCAST("null"),
    ['t'] = LIT_NOCAST("true")
};
/* Token type of each keyword, indexed by the keyword's first character.
 * Only the 'f', 'n' and 't' slots are explicitly initialized. */
GLOBAL READONLY enum token_type g_keyword_types[] = {
    ['f'] = TOKEN_TYPE_KEYWORD_FALSE,
    ['n'] = TOKEN_TYPE_KEYWORD_NULL,
    ['t'] = TOKEN_TYPE_KEYWORD_TRUE
};
/* Allocates a token from `arena` and links it at the tail of `list`.
 * Returns the new token so the caller can fill in its fields. */
INTERNAL struct token *push_token(struct arena *arena, struct token_list *list)
{
    struct token *tok = arena_push(arena, struct token);

    /* Pick the link that must point at the new token: the current tail's
     * `next` when the list has elements, otherwise the list head itself. */
    struct token **link = list->token_first ? &list->token_last->next
                                            : &list->token_first;
    *link = tok;
    list->token_last = tok;

    return tok;
}
/* Splits `src` into a linked list of tokens allocated from `arena`.
 *
 * The list always starts with a TOKEN_TYPE_BOF token. When all input is
 * recognized it ends with a TOKEN_TYPE_EOF token whose `next` points to
 * itself. If unrecognized input is found, lexing stops right after it: the
 * list then ends with a TOKEN_TYPE_UNKNOWN token and no EOF token follows.
 *
 * Token `start` / `end` are byte offsets into `src`; `end` is exclusive.
 */
INTERNAL struct token_list lex(struct arena *arena, struct string src)
{
    struct token_list res = ZI;

    struct token *bof = push_token(arena, &res);
    bof->type = TOKEN_TYPE_BOF;

    u64 pos = 0;
    b32 lexing_done = false;
    while (!lexing_done) {
        /* Skip whitespace */
        b32 whitespace_done = false;
        while (!whitespace_done && pos < src.len) {
            switch (src.text[pos]) {
                CASE_NEWLINE:
                CASE_SPACE: {
                    ++pos;
                } break;

                default: {
                    whitespace_done = true;
                } break;
            }
        }

        /* Create token */
        struct token *t = push_token(arena, &res);
        t->start = pos;

        if (pos >= src.len) {
            t->type = TOKEN_TYPE_EOF;
            t->next = t; /* Self reference */
            lexing_done = true;
        } else {
            /* Lex known token types */
            switch (src.text[pos]) {
                /* Symbols */
                case ',': {
                    t->type = TOKEN_TYPE_COMMA;
                    ++pos;
                } break;

                case ':': {
                    t->type = TOKEN_TYPE_COLON;
                    ++pos;
                } break;

                case '[': {
                    t->type = TOKEN_TYPE_SQUARE_BRACE_OPEN;
                    ++pos;
                } break;

                case ']': {
                    t->type = TOKEN_TYPE_SQUARE_BRACE_CLOSE;
                    ++pos;
                } break;

                case '{': {
                    t->type = TOKEN_TYPE_CURLY_BRACE_OPEN;
                    ++pos;
                } break;

                case '}': {
                    t->type = TOKEN_TYPE_CURLY_BRACE_CLOSE;
                    ++pos;
                } break;

                /* Number */
                case '-': {
                    /* Verify '-' precedes digit */
                    b32 next_is_digit = false;
                    if ((pos + 1) < src.len) {
                        switch (src.text[pos + 1]) {
                            CASE_DIGIT_0_TO_9: {
                                next_is_digit = true;
                            } break;
                        }
                    }
                    ++pos;
                    if (!next_is_digit) {
                        /* Lone '-': leave the token unknown */
                        break;
                    }
                } FALLTHROUGH;
                CASE_DIGIT_0_TO_9: {
                    t->type = TOKEN_TYPE_NUMBER;
                    enum lex_number_state state = LEX_NUMBER_STATE_WHOLE;
                    b32 number_done = false;
                    while (!number_done && pos < src.len) {
                        switch (src.text[pos]) {
                            CASE_DIGIT_0_TO_9: {
                                ++pos;
                            } break;

                            case '.': {
                                /* Only part of the number when it directly
                                 * follows the whole part and precedes a digit */
                                u64 consume = 0;
                                if (state == LEX_NUMBER_STATE_WHOLE && (pos + 1) < src.len) {
                                    u8 c1 = src.text[pos + 1];
                                    switch (c1) {
                                        CASE_DIGIT_0_TO_9: {
                                            /* Consume '.' */
                                            ++consume;
                                        } break;

                                        default: break;
                                    }
                                }
                                if (consume) {
                                    state = LEX_NUMBER_STATE_FRACTION;
                                    pos += consume;
                                } else {
                                    number_done = true;
                                }
                            } break;

                            case 'e':
                            case 'E': {
                                /* Only part of the number when followed by a
                                 * digit, or by a sign and then a digit */
                                u64 consume = 0;
                                if ((state == LEX_NUMBER_STATE_WHOLE || state == LEX_NUMBER_STATE_FRACTION) && (pos + 1) < src.len) {
                                    u8 c1 = src.text[pos + 1];
                                    switch (c1) {
                                        CASE_DIGIT_0_TO_9: {
                                            /* Consume 'E'/'e' */
                                            ++consume;
                                        } break;

                                        case '-':
                                        case '+': {
                                            if ((pos + 2) < src.len) {
                                                u8 c2 = src.text[pos + 2];
                                                switch (c2) {
                                                    CASE_DIGIT_0_TO_9: {
                                                        /* Consume 'E'/'e' & '+'/'-' */
                                                        consume += 2;
                                                    } break;

                                                    default: break;
                                                }
                                            }
                                        } break;

                                        default: break;
                                    }
                                }
                                if (consume) {
                                    state = LEX_NUMBER_STATE_EXPONENT;
                                    pos += consume;
                                } else {
                                    number_done = true;
                                }
                            } break;

                            default: {
                                number_done = true;
                            } break;
                        }
                    }
                } break;

                /* String */
                case '"': {
                    ++pos;
                    b32 string_done = false;
                    b32 next_escaped = false;
                    while (!string_done && pos < src.len) {
                        b32 escaped = next_escaped;
                        next_escaped = false;
                        switch (src.text[pos]) {
                            CASE_NEWLINE: {
                                /* Raw line break ends the string without
                                 * marking it valid (token stays unknown) */
                                ++pos;
                                string_done = true;
                            } break;

                            case '"': {
                                ++pos;
                                if (!escaped) {
                                    t->type = TOKEN_TYPE_STRING;
                                    string_done = true;
                                }
                            } break;

                            case '\\': {
                                ++pos;
                                if (!escaped) {
                                    next_escaped = true;
                                }
                            } break;

                            default: {
                                ++pos;
                            } break;
                        }
                    }
                } break;

                /* Keywords */
                case 't':
                case 'f':
                case 'n': {
                    struct string keyword = g_keyword_strings[src.text[pos]];

                    /* A keyword can only match if it fits entirely inside the
                     * remaining input. Starting from `false` keeps a truncated
                     * keyword at end of input (e.g. "tru") from being accepted
                     * and from advancing `pos` past the end of `src`. */
                    b32 match = false;
                    if ((pos + keyword.len - 1) < src.len) {
                        match = true;
                        if ((pos + keyword.len) < src.len) {
                            /* Don't match if word continues past keyword */
                            switch (src.text[pos + keyword.len]) {
                                CASE_SYMBOL:
                                CASE_SPACE:
                                CASE_NEWLINE: {
                                } break;

                                default: {
                                    match = false;
                                } break;
                            }
                        }
                        if (match) {
                            struct string cmp_str = {
                                .len = keyword.len,
                                .text = &src.text[pos]
                            };
                            match = string_eq(cmp_str, keyword);
                        }
                    }

                    if (match) {
                        t->type = g_keyword_types[src.text[pos]];
                        pos += keyword.len;
                    }
                } break;

                default: break;
            }
        }

        /* Lex unknown token */
        if (t->type == TOKEN_TYPE_UNKNOWN) {
            /* Extend the unknown token up to the next separator so that the
             * reported range covers the whole offending word */
            b32 unknown_done = false;
            while (!unknown_done && pos < src.len) {
                switch (src.text[pos]) {
                    CASE_SYMBOL:
                    CASE_SPACE:
                    CASE_NEWLINE: {
                        unknown_done = true;
                    } break;

                    default: {
                        ++pos;
                    } break;
                }
            }
            t->end = pos;

            /* Exit early if unknown token encountered */
            return res;
        } else {
            t->end = pos;
        }
    }

    return res;
}
/* ========================== *
* Interpret
* ========================== */
/* Pushes byte `c` onto `arena` and grows `str` by one.
 * NOTE(review): only extends `str` correctly if `str->text + str->len` is the
 * arena's current push position - confirm at call sites. */
INTERNAL void append_char(struct arena *arena, struct string *str, u8 c)
{
    u8 *slot = arena_push_no_zero(arena, u8);
    *slot = c;
    str->len += 1;
}
/* Converts the text of a number token to an f64.
 *
 * `src` is expected to have the shape accepted by lex():
 * optional '-', digits, optional '.' + digits, optional 'e'/'E' +
 * optional sign + digits. Any other character trips an ASSERT and is skipped.
 *
 * The sign is applied to the combined whole + fraction value, so "-1.5"
 * yields -1.5. All parts are accumulated in floating point, so long digit
 * runs and large exponents lose precision or saturate instead of wrapping
 * around in an integer.
 */
INTERNAL f64 interpret_number(struct string src)
{
    i32 whole_sign = 1;
    i32 exponent_sign = 1;

    f64 whole = 0;
    f64 fraction = 0;       /* Fraction digits read as an integer */
    f64 fraction_scale = 1; /* 10^(number of fraction digits consumed) */
    u64 exponent = 0;

    /* Scan number parts */
    {
        u64 pos = 0;
        if (src.len > 0 && src.text[0] == '-') {
            whole_sign = -1;
            ++pos;
        }

        enum lex_number_state state = LEX_NUMBER_STATE_WHOLE;
        for (; pos < src.len; ++pos) {
            u8 c = src.text[pos];
            switch (c) {
                CASE_DIGIT_0_TO_9: {
                    u8 digit = (u8)(c - '0');
                    switch (state) {
                        case LEX_NUMBER_STATE_WHOLE: {
                            whole = (whole * 10) + digit;
                        } break;

                        case LEX_NUMBER_STATE_FRACTION: {
                            /* Stop before the scale can overflow to infinity;
                             * digits this far out are dropped. */
                            if (fraction_scale < 1e300) {
                                fraction = (fraction * 10) + digit;
                                fraction_scale *= 10;
                            }
                        } break;

                        case LEX_NUMBER_STATE_EXPONENT: {
                            /* Saturate: keeps the u64 from wrapping and bounds
                             * the scaling loop below. */
                            if (exponent < 10000) {
                                exponent = (exponent * 10) + digit;
                            }
                        } break;
                    }
                } break;

                case '.': {
                    state = LEX_NUMBER_STATE_FRACTION;
                } break;

                case 'e':
                case 'E': {
                    state = LEX_NUMBER_STATE_EXPONENT;
                } break;

                case '-': {
                    switch (state) {
                        case LEX_NUMBER_STATE_WHOLE: {
                            whole_sign = -1;
                        } break;

                        case LEX_NUMBER_STATE_EXPONENT: {
                            exponent_sign = -1;
                        } break;

                        default: {
                            /* Unreachable */
                            ASSERT(false);
                        } break;
                    }
                } break;

                case '+': {
                    switch (state) {
                        case LEX_NUMBER_STATE_EXPONENT: {
                            exponent_sign = 1;
                        } break;

                        default: {
                            /* Unreachable */
                            ASSERT(false);
                        } break;
                    }
                } break;

                default: {
                    /* Unreachable */
                    ASSERT(false);
                } break;
            }
        }
    }

    /* Combine whole & fraction first, THEN apply the sign to both */
    f64 res = whole + (fraction / fraction_scale);
    res *= whole_sign;

    /* Apply exponent in steps of at most 10^22, the largest power of ten an
     * IEEE-754 double represents exactly. */
    while (exponent > 0) {
        u64 step = (exponent < 22) ? exponent : 22;
        f64 scale = 1;
        for (u64 i = 0; i < step; ++i) {
            scale *= 10;
        }
        if (exponent_sign >= 0) {
            res *= scale;
        } else {
            res /= scale;
        }
        exponent -= step;
    }

    return res;
}
INTERNAL struct string interpret_string(struct arena *arena, struct string src, struct string *error)
{
struct string res = {
.len = 0,
.text = arena_push_dry(arena, u8)
};
if (src.len < 2) {
if (error) {
*error = LIT("Malformed string.");
}
return res;
}
/* Ignore beginning quote */
u64 pos = 1;
b32 valid_close = false;
b32 string_done = false;
b32 next_escaped = false;
while (!string_done && pos < src.len) {
b32 escaped = next_escaped;
next_escaped = false;
if (escaped) {
switch (src.text[pos]) {
case '"':
case '\\':
case '/': {
append_char(arena, &res, src.text[pos]);
++pos;
} break;
/* Backspace */
case 'b': {
append_char(arena, &res, '\b');
++pos;
} break;
/* Formfeed */
case 'f': {
append_char(arena, &res, '\f');
++pos;
} break;
/* Linefeed */
case 'n': {
append_char(arena, &res, '\n');
++pos;
} break;
/* Carriage return */
case 'r': {
append_char(arena, &res, '\r');
++pos;
} break;
/* Horizontal tab */
case 't': {
append_char(arena, &res, '\t');
++pos;
} break;
/* TODO: Unicode escape support */
#if 0
case 'u': {
/* TODO */
} break;
#endif
default: {
if (error) {
*error = LIT("Invalid escape character in string.");
return res;
}
} break;
}
} else {
switch (src.text[pos]) {
case '\\': {
escaped = true;
++pos;
} break;
case '"': {
string_done = true;
valid_close = true;
++pos;
} break;
default: {
append_char(arena, &res, src.text[pos]);
++pos;
} break;
}
}
}
if (!valid_close) {
if (error) {
*error = LIT("Expected end of string.");
}
}
return res;
}
/* ========================== *
* Parse
* ========================== */
/* State shared between parse() and its caller. */
struct parser {
    /* Input */
    struct string src; /* Source text that the token offsets index into */
    struct token *at;  /* Token to start parsing from; updated by parse() */

    /* Output */
    struct json *root;             /* Root node allocated by parse() */
    struct json_error_list errors; /* Errors recorded via push_error() */
};
/* Records a parse error covering the source range of token `t`.
 * The error node is allocated from `arena` and appended to `p->errors`. */
INTERNAL void push_error(struct arena *arena, struct parser *p, struct token *t, struct string msg)
{
    struct json_error *err = arena_push(arena, struct json_error);
    err->msg = msg;
    err->start = t->start;
    err->end = t->end;

    /* Append to the tail of the error list */
    struct json_error_list *errors = &p->errors;
    if (errors->first) {
        errors->last->next = err;
    } else {
        errors->first = err;
    }
    errors->last = err;
    errors->count += 1;
}
/* Builds a JSON tree from the token list starting at `p->at`.
 *
 * Nodes, keys and string values are allocated from `arena`. The traversal is
 * iterative: a stack of pending nodes lives in a scratch arena. An object or
 * array node is visited twice - once when its opening brace is read, and once
 * more after its children to consume the closing brace.
 *
 * Parsing stops at the first error, which is recorded in `p->errors`.
 * On return `p->root` is the root node and `p->at` is the token parsing
 * stopped at.
 *
 * Empty containers ("[]" and "{}") are accepted: no child is created when the
 * closing brace immediately follows the opening one.
 */
INTERNAL void parse(struct arena *arena, struct parser *p)
{
    struct arena_temp scratch = scratch_begin(arena);

    struct json *root = arena_push(arena, struct json);
    struct token *at = p->at;
    struct string src = p->src;

    if (at->type == TOKEN_TYPE_BOF) {
        at = at->next;
    }

    /* Depth first stack */
    *arena_push_no_zero(scratch.arena, struct json *) = root;
    u64 stack_count = 1;

    while (stack_count > 0) {
        struct json *json = NULL;
        arena_pop(scratch.arena, struct json *, &json);
        --stack_count;

        struct json *parent_json = json->parent;
        b32 is_new_parent = false;

        if (json->type == JSON_TYPE_OBJECT || json->type == JSON_TYPE_ARRAY) {
            /* No more children to parse for object/array, check for closing brace. */
            enum token_type tok_close_type = json->type == JSON_TYPE_OBJECT ? TOKEN_TYPE_CURLY_BRACE_CLOSE : TOKEN_TYPE_SQUARE_BRACE_CLOSE;
            if (at->type == tok_close_type) {
                at = at->next;
            } else {
                push_error(arena, p, at, LIT("Expected comma."));
                at = at->next;
                goto abort;
            }
        } else {
            if (parent_json) {
                if (parent_json->type == JSON_TYPE_OBJECT) {
                    /* Parse key */
                    if (at->type == TOKEN_TYPE_STRING) {
                        struct string t_text = (struct string) { .len = at->end - at->start, .text = &src.text[at->start] };
                        struct string error = ZI;
                        struct string key = interpret_string(arena, t_text, &error);
                        if (error.len > 0) {
                            push_error(arena, p, at, error);
                            goto abort;
                        } else {
                            json->key = key;
                            at = at->next;
                        }
                    } else {
                        push_error(arena, p, at, LIT("Key expected."));
                        goto abort;
                    }

                    /* Parse colon */
                    if (at->type == TOKEN_TYPE_COLON) {
                        at = at->next;
                    } else {
                        push_error(arena, p, at, LIT("Colon expected."));
                        goto abort;
                    }
                }

                /* Link into parent's child list */
                if (parent_json->child_last) {
                    parent_json->child_last->next = json;
                } else {
                    parent_json->child_first = json;
                }
                parent_json->child_last = json;
            }

            /* Parse value */
            switch (at->type) {
                case TOKEN_TYPE_NUMBER: {
                    struct string t_text = (struct string) { .len = at->end - at->start, .text = &src.text[at->start] };
                    f64 value = interpret_number(t_text);
                    json->type = JSON_TYPE_NUMBER;
                    json->value.number = value;
                    at = at->next;
                } break;

                case TOKEN_TYPE_STRING: {
                    struct string t_text = (struct string) { .len = at->end - at->start, .text = &src.text[at->start] };
                    struct string error = ZI;
                    struct string value = interpret_string(arena, t_text, &error);
                    if (error.len > 0) {
                        push_error(arena, p, at, error);
                        goto abort;
                    } else {
                        json->type = JSON_TYPE_STRING;
                        json->value.string = value;
                        at = at->next;
                    }
                } break;

                case TOKEN_TYPE_KEYWORD_TRUE: {
                    json->type = JSON_TYPE_BOOL;
                    json->value.boolean = true;
                    at = at->next;
                } break;

                case TOKEN_TYPE_KEYWORD_FALSE: {
                    json->type = JSON_TYPE_BOOL;
                    json->value.boolean = false;
                    at = at->next;
                } break;

                case TOKEN_TYPE_KEYWORD_NULL: {
                    json->type = JSON_TYPE_NULL;
                    at = at->next;
                } break;

                case TOKEN_TYPE_CURLY_BRACE_OPEN: {
                    json->type = JSON_TYPE_OBJECT;
                    at = at->next;
                    is_new_parent = true;
                } break;

                case TOKEN_TYPE_SQUARE_BRACE_OPEN: {
                    json->type = JSON_TYPE_ARRAY;
                    at = at->next;
                    is_new_parent = true;
                } break;

                default: {
                    push_error(arena, p, at, LIT("Value expected."));
                    at = at->next;
                    goto abort;
                } break;
            }
        }

        if (is_new_parent) {
            /* Push self back to stack to re-check for closing brace later */
            *arena_push_no_zero(scratch.arena, struct json *) = json;
            ++stack_count;

            /* Create first child & push to stack, unless the container is
             * empty (closing brace directly follows the opening brace). In
             * that case the re-check above consumes the closing brace. */
            enum token_type tok_close_type = json->type == JSON_TYPE_OBJECT ? TOKEN_TYPE_CURLY_BRACE_CLOSE : TOKEN_TYPE_SQUARE_BRACE_CLOSE;
            if (at->type != tok_close_type) {
                struct json *child = arena_push(arena, struct json);
                child->parent = json;
                *arena_push_no_zero(scratch.arena, struct json *) = child;
                ++stack_count;
            }
        } else if (parent_json) {
            /* Check for comma */
            if (at->type == TOKEN_TYPE_COMMA) {
                /* Create sibling & push to stack */
                struct json *sibling = arena_push(arena, struct json);
                sibling->parent = parent_json;
                *arena_push_no_zero(scratch.arena, struct json *) = sibling;
                ++stack_count;
                at = at->next;
            }
        }
    }

abort:
    p->at = at;
    p->root = root;

    scratch_end(scratch);
}
/* ========================== *
* Interface
* ========================== */
struct json_parse_result json_from_string(struct arena *arena, struct string src)
{
struct arena_temp scratch = scratch_begin(arena);
struct token_list tl = lex(scratch.arena, src);
/* Parse root */
struct parser p = {
.src = src,
.at = tl.token_first
};
parse(arena, &p);
/* Verify end of file */
if (p.errors.count == 0 && p.at->type != TOKEN_TYPE_EOF) {
push_error(arena, &p, p.at, LIT("Expected end of file."));
}
scratch_end(scratch);
return (struct json_parse_result) {
.root = p.root,
.errors = p.errors
};
}