#include "json.h"
#include "arena.h"
#include "string.h"
#include "arena.h"
#include "math.h"

/* TODO (if we want to be JSON standard compliant):
 * - Support unicode escape sequences in strings (\u)
 * - Don't allow leading 0s in numbers
 */

/* ========================== *
 * Lex
 * ========================== */

/* Case-label groups. Each macro intentionally omits the final ':' so that it
 * can be used as `CASE_X: { ... } break;` inside a switch. */

#define CASE_NEWLINE \
    case 0x0A: /* Line feed or New line */ \
    case 0x0D  /* Carriage return */

#define CASE_SPACE \
    case 0x20: /* Space */ \
    case 0x09  /* Horizontal tab */

#define CASE_DIGIT_0_TO_9 \
    case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9'

#define CASE_DIGIT_1_TO_9 \
    case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9'

#define CASE_SYMBOL \
    case ',': case ':': case '[': case ']': case '{': case '}'

/* TOKEN_TYPE_UNKNOWN is the zero value; the lexer treats any token whose type
 * was never assigned as unknown. */
enum token_type {
    TOKEN_TYPE_UNKNOWN,

    TOKEN_TYPE_NUMBER,
    TOKEN_TYPE_STRING,

    TOKEN_TYPE_KEYWORD_TRUE,
    TOKEN_TYPE_KEYWORD_FALSE,
    TOKEN_TYPE_KEYWORD_NULL,

    TOKEN_TYPE_COMMA,
    TOKEN_TYPE_COLON,
    TOKEN_TYPE_SQUARE_BRACE_OPEN,
    TOKEN_TYPE_SQUARE_BRACE_CLOSE,
    TOKEN_TYPE_CURLY_BRACE_OPEN,
    TOKEN_TYPE_CURLY_BRACE_CLOSE,

    TOKEN_TYPE_BOF,
    TOKEN_TYPE_EOF
};

/* A lexed token. `start` / `end` are offsets into the source text, with `end`
 * being one past the last character of the token. */
struct token {
    enum token_type type;
    u64 start;
    u64 end;
    struct token *next;
};

/* Singly linked list of tokens in source order. */
struct token_list {
    struct token *token_first;
    struct token *token_last;
};

/* Which part of a number literal is currently being scanned. */
enum lex_number_state {
    LEX_NUMBER_STATE_WHOLE,
    LEX_NUMBER_STATE_FRACTION,
    LEX_NUMBER_STATE_EXPONENT
};

/* Keyword text, indexed by the keyword's first character. */
GLOBAL READONLY struct string g_keyword_strings[] = {
    ['t'] = LIT_NOCAST("true"),
    ['f'] = LIT_NOCAST("false"),
    ['n'] = LIT_NOCAST("null")
};

/* Keyword token type, indexed by the keyword's first character. */
GLOBAL READONLY enum token_type g_keyword_types[] = {
    ['t'] = TOKEN_TYPE_KEYWORD_TRUE,
    ['f'] = TOKEN_TYPE_KEYWORD_FALSE,
    ['n'] = TOKEN_TYPE_KEYWORD_NULL
};

/* Allocates a token from `arena`, appends it to `list` and returns it.
 * NOTE(review): callers rely on the new token starting out with
 * type == TOKEN_TYPE_UNKNOWN and next == 0, i.e. on arena_push() returning
 * zeroed memory - confirm against arena.h. */
INTERNAL struct token *push_token(struct arena *arena, struct token_list *list)
{
    struct token *t = arena_push(arena, struct token);
    if (!list->token_first) {
        list->token_first = t;
    } else {
        list->token_last->next = t;
    }
    list->token_last = t;
    return t;
}

/* Splits `src` into a token list allocated from `arena`.
 *
 * The list always starts with a BOF token. On success it ends with an EOF
 * token whose `next` points at itself. Lexing stops at the first unknown
 * token: that token is the last one in the list and no EOF token is pushed. */
INTERNAL struct token_list lex(struct arena *arena, struct string src)
{
    struct token_list res = ZI;

    struct token *bof = push_token(arena, &res);
    bof->type = TOKEN_TYPE_BOF;

    u64 pos = 0;
    b32 lexing_done = 0;
    while (!lexing_done) {
        /* Skip whitespace */
        b32 whitespace_done = 0;
        while (!whitespace_done && pos < src.len) {
            switch (src.text[pos]) {
                CASE_NEWLINE:
                CASE_SPACE: {
                    ++pos;
                } break;

                default: {
                    whitespace_done = 1;
                } break;
            }
        }

        /* Create token */
        struct token *t = push_token(arena, &res);
        t->start = pos;

        if (pos >= src.len) {
            t->type = TOKEN_TYPE_EOF;
            t->next = t; /* Self reference */
            lexing_done = 1;
        } else {
            /* Lex known token types */
            switch (src.text[pos]) {
                /* Symbols */
                case ',': {
                    t->type = TOKEN_TYPE_COMMA;
                    ++pos;
                } break;

                case ':': {
                    t->type = TOKEN_TYPE_COLON;
                    ++pos;
                } break;

                case '[': {
                    t->type = TOKEN_TYPE_SQUARE_BRACE_OPEN;
                    ++pos;
                } break;

                case ']': {
                    t->type = TOKEN_TYPE_SQUARE_BRACE_CLOSE;
                    ++pos;
                } break;

                case '{': {
                    t->type = TOKEN_TYPE_CURLY_BRACE_OPEN;
                    ++pos;
                } break;

                case '}': {
                    t->type = TOKEN_TYPE_CURLY_BRACE_CLOSE;
                    ++pos;
                } break;

                /* Number */
                case '-': {
                    /* Verify '-' precedes digit */
                    b32 next_is_digit = 0;
                    if ((pos + 1) < src.len) {
                        switch (src.text[pos + 1]) {
                            CASE_DIGIT_0_TO_9: {
                                next_is_digit = 1;
                            } break;

                            default: break;
                        }
                    }
                    ++pos;
                    if (!next_is_digit) {
                        /* Lone '-': leave the token unknown */
                        break;
                    }
                } FALLTHROUGH;
                CASE_DIGIT_0_TO_9: {
                    t->type = TOKEN_TYPE_NUMBER;
                    enum lex_number_state state = LEX_NUMBER_STATE_WHOLE;
                    b32 number_done = 0;
                    while (!number_done && pos < src.len) {
                        switch (src.text[pos]) {
                            CASE_DIGIT_0_TO_9: {
                                ++pos;
                            } break;

                            case '.': {
                                /* Only accepted once, and only when followed by a digit */
                                u64 consume = 0;
                                if (state == LEX_NUMBER_STATE_WHOLE && (pos + 1) < src.len) {
                                    u8 c1 = src.text[pos + 1];
                                    switch (c1) {
                                        CASE_DIGIT_0_TO_9: {
                                            /* Consume '.' */
                                            ++consume;
                                        } break;

                                        default: break;
                                    }
                                }
                                if (consume) {
                                    state = LEX_NUMBER_STATE_FRACTION;
                                    pos += consume;
                                } else {
                                    number_done = 1;
                                }
                            } break;

                            case 'e':
                            case 'E': {
                                /* Only accepted once, and only when followed by
                                 * a digit or by a sign and a digit */
                                u64 consume = 0;
                                if ((state == LEX_NUMBER_STATE_WHOLE || state == LEX_NUMBER_STATE_FRACTION) && (pos + 1) < src.len) {
                                    u8 c1 = src.text[pos + 1];
                                    switch (c1) {
                                        CASE_DIGIT_0_TO_9: {
                                            /* Consume 'E'/'e' */
                                            ++consume;
                                        } break;

                                        case '-':
                                        case '+': {
                                            if ((pos + 2) < src.len) {
                                                u8 c2 = src.text[pos + 2];
                                                switch (c2) {
                                                    CASE_DIGIT_0_TO_9: {
                                                        /* Consume 'E'/'e' & '+'/'-' */
                                                        consume += 2;
                                                    } break;

                                                    default: break;
                                                }
                                            }
                                        } break;

                                        default: break;
                                    }
                                }
                                if (consume) {
                                    state = LEX_NUMBER_STATE_EXPONENT;
                                    pos += consume;
                                } else {
                                    number_done = 1;
                                }
                            } break;

                            default: {
                                number_done = 1;
                            } break;
                        }
                    }
                } break;

                /* String */
                case '"': {
                    ++pos;

                    /* The token only becomes a STRING when an unescaped closing
                     * quote is found; a raw newline or end of input leaves it
                     * unknown. */
                    b32 string_done = 0;
                    b32 next_escaped = 0;
                    while (!string_done && pos < src.len) {
                        b32 escaped = next_escaped;
                        next_escaped = 0;
                        switch (src.text[pos]) {
                            CASE_NEWLINE: {
                                ++pos;
                                string_done = 1;
                            } break;

                            case '"': {
                                ++pos;
                                if (!escaped) {
                                    t->type = TOKEN_TYPE_STRING;
                                    string_done = 1;
                                }
                            } break;

                            case '\\': {
                                ++pos;
                                if (!escaped) {
                                    next_escaped = 1;
                                }
                            } break;

                            default: {
                                ++pos;
                            } break;
                        }
                    }
                } break;

                /* Keywords */
                case 't':
                case 'f':
                case 'n': {
                    struct string keyword = g_keyword_strings[src.text[pos]];

                    /* No match unless the whole keyword fits in the remaining
                     * input; otherwise a truncated keyword at end of input
                     * would be accepted and `pos` pushed past `src.len`. */
                    b32 match = 0;
                    if ((pos + keyword.len - 1) < src.len) {
                        match = 1;
                        if ((pos + keyword.len) < src.len) {
                            /* Don't match if word continues past keyword */
                            switch (src.text[pos + keyword.len]) {
                                CASE_SYMBOL:
                                CASE_SPACE:
                                CASE_NEWLINE: {
                                } break;

                                default: {
                                    match = 0;
                                } break;
                            }
                        }
                        if (match) {
                            struct string cmp_str = {
                                .len = keyword.len,
                                .text = &src.text[pos]
                            };
                            match = string_eq(cmp_str, keyword);
                        }
                    }

                    if (match) {
                        t->type = g_keyword_types[src.text[pos]];
                        pos += keyword.len;
                    }
                } break;

                default: break;
            }
        }

        /* Lex unknown token */
        if (t->type == TOKEN_TYPE_UNKNOWN) {
            /* Extend the token up to the next symbol or whitespace */
            b32 unknown_done = 0;
            while (!unknown_done && pos < src.len) {
                switch (src.text[pos]) {
                    CASE_SYMBOL:
                    CASE_SPACE:
                    CASE_NEWLINE: {
                        unknown_done = 1;
                    } break;

                    default: {
                        ++pos;
                    } break;
                }
            }
            t->end = pos;

            /* Exit early if unknown token encountered */
            return res;
        } else {
            t->end = pos;
        }
    }

    return res;
}

/* ========================== *
 * Interpret
 * ========================== */

/* Pushes one byte onto `arena` and grows `str` by one. This only produces a
 * valid string when `str->text` points at the arena position the bytes are
 * being pushed to and nothing else is pushed to `arena` in between. */
INTERNAL void append_char(struct arena *arena, struct string *str, u8 c)
{
    *arena_push_no_zero(arena, u8) = c;
    ++str->len;
}

/* Converts the text of a NUMBER token into an f64.
 *
 * `src` is expected to have the shape accepted by lex():
 * [-]digits[.digits][(e|E)[+|-]digits]. Other characters trip an ASSERT and
 * are skipped.
 *
 * All parts are accumulated in floating point so that long digit runs and
 * large exponents degrade to rounding / infinity / zero instead of wrapping
 * around in integer arithmetic. The sign is applied to the complete mantissa,
 * so "-1.5" yields -1.5. */
INTERNAL f64 interpret_number(struct string src)
{
    /* Bounds that keep the intermediate values finite and the loops short.
     * Digits / exponent magnitude beyond these cannot change an f64 result
     * in any meaningful way. */
    enum {
        INTERPRET_NUMBER_MAX_FRACTION_DIGITS = 300,
        INTERPRET_NUMBER_MAX_EXPONENT = 1000,
        INTERPRET_NUMBER_EXACT_POW10 = 22 /* 10^22 is the largest power of ten exactly representable in an f64 */
    };

    b32 whole_present = 0;
    u64 whole_left = 0;
    u64 whole_right = 0;
    i32 whole_sign = 1;

    b32 fraction_present = 0;
    u64 fraction_left = 0;
    u64 fraction_right = 0;

    b32 exponent_present = 0;
    u64 exponent_left = 0;
    u64 exponent_right = 0;
    i32 exponent_sign = 1;

    /* Lex number parts: record the inclusive [left, right] range of each part */
    {
        u64 pos = 0;
        if (src.len > 0 && src.text[0] == '-') {
            whole_sign = -1;
            ++pos;
        }

        enum lex_number_state state = LEX_NUMBER_STATE_WHOLE;
        while (pos < src.len) {
            switch (src.text[pos]) {
                CASE_DIGIT_0_TO_9: {
                    switch (state) {
                        case LEX_NUMBER_STATE_WHOLE: {
                            if (!whole_present) {
                                whole_present = 1;
                                whole_left = pos;
                            }
                            whole_right = pos;
                            ++pos;
                        } break;

                        case LEX_NUMBER_STATE_FRACTION: {
                            if (!fraction_present) {
                                fraction_present = 1;
                                fraction_left = pos;
                            }
                            fraction_right = pos;
                            ++pos;
                        } break;

                        case LEX_NUMBER_STATE_EXPONENT: {
                            if (!exponent_present) {
                                exponent_present = 1;
                                exponent_left = pos;
                            }
                            exponent_right = pos;
                            ++pos;
                        } break;
                    }
                } break;

                case '.': {
                    state = LEX_NUMBER_STATE_FRACTION;
                    ++pos;
                } break;

                case 'e':
                case 'E': {
                    state = LEX_NUMBER_STATE_EXPONENT;
                    ++pos;
                } break;

                case '-': {
                    switch (state) {
                        case LEX_NUMBER_STATE_WHOLE: {
                            whole_sign = -1;
                            ++pos;
                        } break;

                        case LEX_NUMBER_STATE_EXPONENT: {
                            exponent_sign = -1;
                            ++pos;
                        } break;

                        default: {
                            /* Unreachable */
                            ASSERT(0);
                            ++pos;
                        } break;
                    }
                } break;

                case '+': {
                    switch (state) {
                        case LEX_NUMBER_STATE_EXPONENT: {
                            exponent_sign = 1;
                            ++pos;
                        } break;

                        default: {
                            /* Unreachable */
                            ASSERT(0);
                            ++pos;
                        } break;
                    }
                } break;

                default: {
                    /* Unreachable */
                    ASSERT(0);
                    ++pos;
                } break;
            }
        }
    }

    f64 res = 0;

    /* Process whole part (48 is ASCII '0') */
    if (whole_present) {
        for (u64 pos = whole_left; pos <= whole_right; ++pos) {
            u8 digit = min_u8(src.text[pos] - 48, 9);
            res = (res * 10.0) + digit;
        }
    }

    /* Process fraction part */
    if (fraction_present) {
        f64 frac = 0;
        f64 scale = 1;
        u64 digit_count = 0;
        for (u64 pos = fraction_left;
             pos <= fraction_right && digit_count < INTERPRET_NUMBER_MAX_FRACTION_DIGITS;
             ++pos, ++digit_count) {
            u8 digit = min_u8(src.text[pos] - 48, 9);
            frac = (frac * 10.0) + digit;
            scale *= 10.0;
        }
        res += frac / scale;
    }

    /* Process exponent part */
    if (exponent_present) {
        u64 exponent_whole = 0;
        for (u64 pos = exponent_left; pos <= exponent_right; ++pos) {
            u8 digit = min_u8(src.text[pos] - 48, 9);
            exponent_whole = (exponent_whole * 10) + digit;
            if (exponent_whole > INTERPRET_NUMBER_MAX_EXPONENT) {
                /* Clamp on every step so the accumulator can never wrap */
                exponent_whole = INTERPRET_NUMBER_MAX_EXPONENT;
            }
        }

        /* Scale in chunks of exactly representable powers of ten. Every chunk
         * is finite, so a zero mantissa stays zero and no NaN is produced. */
        u64 remaining = exponent_whole;
        while (remaining > 0) {
            u64 step = (remaining < INTERPRET_NUMBER_EXACT_POW10) ? remaining : INTERPRET_NUMBER_EXACT_POW10;
            f64 chunk = 1;
            for (u64 i = 0; i < step; ++i) {
                chunk *= 10.0;
            }
            if (exponent_sign >= 0) {
                res *= chunk;
            } else {
                res /= chunk;
            }
            remaining -= step;
        }
    }

    /* Apply the sign last so that it covers the fraction as well */
    if (whole_sign < 0) {
        res = -res;
    }

    return res;
}

/* Decodes the text of a STRING token (including its surrounding quotes) into
 * a string whose bytes are pushed onto `arena`.
 *
 * Handles the escapes \" \\ \/ \b \f \n \r \t. \u is not supported yet and is
 * reported as an invalid escape.
 *
 * On failure a message is written to `*error` (when `error` is non-null) and
 * whatever was decoded so far is returned. `*error` is left untouched on
 * success. */
INTERNAL struct string interpret_string(struct arena *arena, struct string src, struct string *error)
{
    struct string res = {
        .len = 0,
        .text = arena_push_dry(arena, u8)
    };

    if (src.len < 2) {
        if (error) {
            *error = LIT("Malformed string.");
        }
        return res;
    }

    /* Ignore beginning quote */
    u64 pos = 1;

    b32 valid_close = 0;
    b32 string_done = 0;
    b32 next_escaped = 0;
    while (!string_done && pos < src.len) {
        b32 escaped = next_escaped;
        next_escaped = 0;

        if (escaped) {
            switch (src.text[pos]) {
                case '"':
                case '\\':
                case '/': {
                    append_char(arena, &res, src.text[pos]);
                    ++pos;
                } break;

                /* Backspace */
                case 'b': {
                    append_char(arena, &res, '\b');
                    ++pos;
                } break;

                /* Formfeed */
                case 'f': {
                    append_char(arena, &res, '\f');
                    ++pos;
                } break;

                /* Linefeed */
                case 'n': {
                    append_char(arena, &res, '\n');
                    ++pos;
                } break;

                /* Carriage return */
                case 'r': {
                    append_char(arena, &res, '\r');
                    ++pos;
                } break;

                /* Horizontal tab */
                case 't': {
                    append_char(arena, &res, '\t');
                    ++pos;
                } break;

                /* TODO: Unicode escape support */
#if 0
                case 'u': {
                    /* TODO */
                } break;
#endif

                default: {
                    if (error) {
                        *error = LIT("Invalid escape character in string.");
                    }
                    /* Stop regardless of whether the caller wants the message */
                    return res;
                } break;
            }
        } else {
            switch (src.text[pos]) {
                case '\\': {
                    /* Flag the NEXT character as escaped; `escaped` itself is
                     * re-derived from `next_escaped` on every iteration. */
                    next_escaped = 1;
                    ++pos;
                } break;

                case '"': {
                    string_done = 1;
                    valid_close = 1;
                    ++pos;
                } break;

                default: {
                    append_char(arena, &res, src.text[pos]);
                    ++pos;
                } break;
            }
        }
    }

    if (!valid_close) {
        if (error) {
            *error = LIT("Expected end of string.");
        }
    }

    return res;
}

/* ========================== *
 * Parse
 * ========================== */

/* Parser state shared between parse() and its caller. */
struct parser {
    /* Input */
    struct string src;
    struct token *at;

    /* Output */
    struct json *root;
    struct json_error_list errors;
};

/* Appends an error with message `msg`, covering the source range of token
 * `t`, to the parser's error list. The error is allocated from `arena`. */
INTERNAL void push_error(struct arena *arena, struct parser *p, struct token *t, struct string msg)
{
    struct json_error *error = arena_push(arena, struct json_error);
    error->msg = msg;
    error->start = t->start;
    error->end = t->end;

    struct json_error_list *list = &p->errors;
    if (!list->first) {
        list->first = error;
    } else {
        list->last->next = error;
    }
    list->last = error;
    ++list->count;
}

/* Parses one JSON value starting at `p->at` into a tree allocated from
 * `arena`.
 *
 * Works iteratively with an explicit stack of pending nodes kept in a scratch
 * arena. A container node is visited twice: once when its opening brace is
 * read, and once more after all of its children to consume the closing brace.
 *
 * On return `p->root` is the root node and `p->at` is the token where parsing
 * stopped. Parsing aborts at the first error, which is appended to
 * `p->errors`. After an error `p->at` may be null and must not be
 * dereferenced. */
INTERNAL void parse(struct arena *arena, struct parser *p)
{
    struct arena_temp scratch = scratch_begin(arena);

    struct json *root = arena_push(arena, struct json);
    struct token *at = p->at;
    struct string src = p->src;

    if (at->type == TOKEN_TYPE_BOF) {
        at = at->next;
    }

    /* Depth first stack */
    *arena_push_no_zero(scratch.arena, struct json *) = root;
    u64 stack_count = 1;

    while (stack_count > 0) {
        struct json *json = 0;
        arena_pop(scratch.arena, struct json *, &json);
        --stack_count;

        struct json *parent_json = json->parent;
        b32 is_new_parent = 0;

        if (json->type == JSON_TYPE_OBJECT || json->type == JSON_TYPE_ARRAY) {
            /* No more children to parse for object/array, check for closing brace. */
            enum token_type tok_close_type = json->type == JSON_TYPE_OBJECT ? TOKEN_TYPE_CURLY_BRACE_CLOSE : TOKEN_TYPE_SQUARE_BRACE_CLOSE;
            if (at->type == tok_close_type) {
                at = at->next;
            } else {
                push_error(arena, p, at, LIT("Expected comma."));
                at = at->next;
                goto abort;
            }
        } else {
            if (parent_json) {
                if (parent_json->type == JSON_TYPE_OBJECT) {
                    /* Parse key */
                    if (at->type == TOKEN_TYPE_STRING) {
                        struct string t_text = (struct string) {
                            .len = at->end - at->start,
                            .text = &src.text[at->start]
                        };
                        struct string error = ZI;
                        struct string key = interpret_string(arena, t_text, &error);
                        if (error.len > 0) {
                            push_error(arena, p, at, error);
                            goto abort;
                        } else {
                            json->key = key;
                            at = at->next;
                        }
                    } else {
                        push_error(arena, p, at, LIT("Key expected."));
                        goto abort;
                    }

                    /* Parse colon */
                    if (at->type == TOKEN_TYPE_COLON) {
                        at = at->next;
                    } else {
                        push_error(arena, p, at, LIT("Colon expected."));
                        goto abort;
                    }
                }

                /* Link into parent's child list */
                if (parent_json->child_last) {
                    parent_json->child_last->next = json;
                } else {
                    parent_json->child_first = json;
                }
                parent_json->child_last = json;
            }

            /* Parse value */
            switch (at->type) {
                case TOKEN_TYPE_NUMBER: {
                    struct string t_text = (struct string) {
                        .len = at->end - at->start,
                        .text = &src.text[at->start]
                    };
                    f64 value = interpret_number(t_text);
                    json->type = JSON_TYPE_NUMBER;
                    json->value.number = value;
                    at = at->next;
                } break;

                case TOKEN_TYPE_STRING: {
                    struct string t_text = (struct string) {
                        .len = at->end - at->start,
                        .text = &src.text[at->start]
                    };
                    struct string error = ZI;
                    struct string value = interpret_string(arena, t_text, &error);
                    if (error.len > 0) {
                        push_error(arena, p, at, error);
                        goto abort;
                    } else {
                        json->type = JSON_TYPE_STRING;
                        json->value.string = value;
                        at = at->next;
                    }
                } break;

                case TOKEN_TYPE_KEYWORD_TRUE: {
                    json->type = JSON_TYPE_BOOL;
                    json->value.boolean = 1;
                    at = at->next;
                } break;

                case TOKEN_TYPE_KEYWORD_FALSE: {
                    json->type = JSON_TYPE_BOOL;
                    json->value.boolean = 0;
                    at = at->next;
                } break;

                case TOKEN_TYPE_KEYWORD_NULL: {
                    json->type = JSON_TYPE_NULL;
                    at = at->next;
                } break;

                case TOKEN_TYPE_CURLY_BRACE_OPEN: {
                    json->type = JSON_TYPE_OBJECT;
                    at = at->next;
                    is_new_parent = 1;
                } break;

                case TOKEN_TYPE_SQUARE_BRACE_OPEN: {
                    json->type = JSON_TYPE_ARRAY;
                    at = at->next;
                    is_new_parent = 1;
                } break;

                default: {
                    push_error(arena, p, at, LIT("Value expected."));
                    at = at->next;
                    goto abort;
                } break;
            }
        }

        if (is_new_parent) {
            /* Push self back to stack to re-check for closing brace later */
            *arena_push_no_zero(scratch.arena, struct json *) = json;
            ++stack_count;

            /* An empty container ("[]" or "{}") has no first child: leave the
             * closing brace for the re-check above instead of demanding a
             * value. */
            enum token_type tok_close_type = json->type == JSON_TYPE_OBJECT ? TOKEN_TYPE_CURLY_BRACE_CLOSE : TOKEN_TYPE_SQUARE_BRACE_CLOSE;
            if (at->type != tok_close_type) {
                /* Create child & push to stack */
                struct json *child = arena_push(arena, struct json);
                child->parent = json;
                *arena_push_no_zero(scratch.arena, struct json *) = child;
                ++stack_count;
            }
        } else if (parent_json) {
            /* Check for comma */
            if (at->type == TOKEN_TYPE_COMMA) {
                /* Create sibling & push to stack */
                struct json *sibling = arena_push(arena, struct json);
                sibling->parent = parent_json;
                *arena_push_no_zero(scratch.arena, struct json *) = sibling;
                ++stack_count;
                at = at->next;
            }
        }
    }

abort:
    p->at = at;
    p->root = root;

    scratch_end(scratch);
}

/* ========================== *
 * Interface
 * ========================== */

/* Lexes and parses `src` as a single JSON value.
 *
 * The resulting tree and any errors are allocated from `arena`; the token
 * list lives in a scratch arena that is released before returning. Input
 * left over after the root value is reported as an error. */
struct json_parse_result json_from_string(struct arena *arena, struct string src)
{
    struct arena_temp scratch = scratch_begin(arena);

    struct token_list tl = lex(scratch.arena, src);

    /* Parse root */
    struct parser p = {
        .src = src,
        .at = tl.token_first
    };
    parse(arena, &p);

    /* Verify end of file. p.at is only inspected when parsing succeeded,
     * since it may be null after an error. */
    if (p.errors.count == 0 && p.at->type != TOKEN_TYPE_EOF) {
        push_error(arena, &p, p.at, LIT("Expected end of file."));
    }

    scratch_end(scratch);

    return (struct json_parse_result) {
        .root = p.root,
        .errors = p.errors
    };
}