#include "json.h"
#include "string.h"
#include "arena.h"
#include "scratch.h"
#include "math.h"

/* Non-standard-conforming JSON parser.
 * - Unicode not supported (TODO)
 * - Unicode escape sequences in strings (\u) not supported
 * - Leading 0s in numbers are allowed
 */

/* ========================== *
 * Lex
 * ========================== */

/* The CASE_* macros expand to a run of `case` labels WITHOUT the trailing
 * colon, so they are used as `CASE_DIGIT: { ... } break;` inside a switch. */

#define CASE_NEWLINE \
    case 0x0A: /* Line feed or New line */ \
    case 0x0D  /* Carriage return */

#define CASE_SPACE \
    case 0x20: /* Space */ \
    case 0x09  /* Horizontal tab */

#define CASE_ALPHABETICAL \
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z': \
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z'

#define CASE_DIGIT \
    case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9'

#define CASE_SYMBOL \
    case ',': case ':': case '[': case ']': case '{': case '}'

/* TOKEN_TYPE_UNKNOWN must stay first: tokens are zero-initialized and the
 * lexer treats a type still equal to UNKNOWN as "nothing matched". */
enum token_type {
    TOKEN_TYPE_UNKNOWN,

    TOKEN_TYPE_NUMBER,
    TOKEN_TYPE_STRING,

    TOKEN_TYPE_TRUE,
    TOKEN_TYPE_FALSE,
    TOKEN_TYPE_NULL,

    TOKEN_TYPE_COMMA,
    TOKEN_TYPE_COLON,
    TOKEN_TYPE_SQUARE_BRACE_OPEN,
    TOKEN_TYPE_SQUARE_BRACE_CLOSE,
    TOKEN_TYPE_CURLY_BRACE_OPEN,
    TOKEN_TYPE_CURLY_BRACE_CLOSE,

    TOKEN_TYPE_EOF
};

/* One lexed token. `start` (inclusive) and `end` (exclusive) are byte offsets
 * into the source string; `next` links tokens in source order. */
struct token {
    enum token_type type;
    u64 start;
    u64 end;
    struct token *next;
};

/* Singly linked token list produced by lex(). */
struct lex_result {
    struct token *token_first;
    struct token *token_last;
};

/* Which part of a number literal is currently being scanned. */
enum lex_number_state {
    LEX_NUMBER_STATE_WHOLE,
    LEX_NUMBER_STATE_FRACTION,
    LEX_NUMBER_STATE_EXPONENT
};

/* Keyword text, indexed by the keyword's first character. */
GLOBAL READONLY struct string g_keyword_strings[] = {
    ['t'] = STR("true"),
    ['f'] = STR("false"),
    ['n'] = STR("null")
};

/* Keyword token type, indexed by the keyword's first character. */
GLOBAL READONLY enum token_type g_keyword_types[] = {
    ['t'] = TOKEN_TYPE_TRUE,
    ['f'] = TOKEN_TYPE_FALSE,
    ['n'] = TOKEN_TYPE_NULL
};

/* Splits `src` into a linked list of tokens allocated from `arena`.
 *
 * Lexing stops at the first unrecognized token, so the last token in the
 * returned list is always either TOKEN_TYPE_EOF or TOKEN_TYPE_UNKNOWN. An
 * unknown token extends up to (not including) the next symbol or whitespace
 * character. */
INTERNAL struct lex_result lex(struct arena *arena, struct string src)
{
    struct lex_result res = { 0 };

    u64 pos = 0;
    b32 lexing_done = false;
    while (!lexing_done) {
        /* Skip whitespace */
        b32 whitespace_done = false;
        while (!whitespace_done && pos < src.len) {
            switch (src.text[pos]) {
                CASE_NEWLINE:
                CASE_SPACE: {
                    ++pos;
                } break;

                default: {
                    whitespace_done = true;
                } break;
            }
        }

        /* Create token */
        struct token *t = arena_push_zero(arena, struct token);
        t->start = pos;

        /* Push token to list */
        if (!res.token_first) {
            res.token_first = t;
        } else {
            res.token_last->next = t;
        }
        res.token_last = t;

        if (pos >= src.len) {
            t->type = TOKEN_TYPE_EOF;
            lexing_done = true;
        } else {
            /* Lex known token types */
            switch (src.text[pos]) {
                /* Symbols */
                case ',': {
                    t->type = TOKEN_TYPE_COMMA;
                    ++pos;
                } break;

                case ':': {
                    t->type = TOKEN_TYPE_COLON;
                    ++pos;
                } break;

                case '[': {
                    t->type = TOKEN_TYPE_SQUARE_BRACE_OPEN;
                    ++pos;
                } break;

                case ']': {
                    t->type = TOKEN_TYPE_SQUARE_BRACE_CLOSE;
                    ++pos;
                } break;

                case '{': {
                    t->type = TOKEN_TYPE_CURLY_BRACE_OPEN;
                    ++pos;
                } break;

                case '}': {
                    t->type = TOKEN_TYPE_CURLY_BRACE_CLOSE;
                    ++pos;
                } break;

                /* Number */
                case '-': {
                    /* Verify '-' precedes digit */
                    b32 next_is_digit = false;
                    if ((pos + 1) < src.len) {
                        switch (src.text[pos + 1]) {
                            CASE_DIGIT: {
                                next_is_digit = true;
                            } break;

                            default: break;
                        }
                    }
                    ++pos;
                    if (!next_is_digit) {
                        /* Lone '-': leave the type UNKNOWN */
                        break;
                    }
                } FALLTHROUGH;
                CASE_DIGIT: {
                    t->type = TOKEN_TYPE_NUMBER;
                    enum lex_number_state state = LEX_NUMBER_STATE_WHOLE;
                    b32 number_done = false;
                    while (!number_done && pos < src.len) {
                        switch (src.text[pos]) {
                            CASE_DIGIT: {
                                ++pos;
                            } break;

                            case '.': {
                                /* Only valid once, and only when followed by a digit */
                                u64 consume = 0;
                                if (state == LEX_NUMBER_STATE_WHOLE && (pos + 1) < src.len) {
                                    u8 c1 = src.text[pos + 1];
                                    switch (c1) {
                                        CASE_DIGIT: {
                                            /* Consume '.' */
                                            ++consume;
                                        } break;

                                        default: break;
                                    }
                                }
                                if (consume) {
                                    state = LEX_NUMBER_STATE_FRACTION;
                                    pos += consume;
                                } else {
                                    number_done = true;
                                }
                            } break;

                            case 'e':
                            case 'E': {
                                /* Only valid once, and only when followed by
                                 * a digit or by a sign and a digit */
                                u64 consume = 0;
                                if ((state == LEX_NUMBER_STATE_WHOLE || state == LEX_NUMBER_STATE_FRACTION) && (pos + 1) < src.len) {
                                    u8 c1 = src.text[pos + 1];
                                    switch (c1) {
                                        CASE_DIGIT: {
                                            /* Consume 'E'/'e' */
                                            ++consume;
                                        } break;

                                        case '-':
                                        case '+': {
                                            if ((pos + 2) < src.len) {
                                                u8 c2 = src.text[pos + 2];
                                                switch (c2) {
                                                    CASE_DIGIT: {
                                                        /* Consume 'E'/'e' & '+'/'-' */
                                                        consume += 2;
                                                    } break;

                                                    default: break;
                                                }
                                            }
                                        } break;

                                        default: break;
                                    }
                                }
                                if (consume) {
                                    state = LEX_NUMBER_STATE_EXPONENT;
                                    pos += consume;
                                } else {
                                    number_done = true;
                                }
                            } break;

                            default: {
                                number_done = true;
                            } break;
                        }
                    }
                } break;

                /* String */
                case '"': {
                    ++pos;

                    b32 string_done = false;
                    b32 next_escaped = false;
                    while (!string_done && pos < src.len) {
                        b32 escaped = next_escaped;
                        next_escaped = false;
                        switch (src.text[pos]) {
                            CASE_NEWLINE: {
                                /* Raw newline ends the string without a
                                 * closing quote: the type stays UNKNOWN */
                                ++pos;
                                string_done = true;
                            } break;

                            case '"': {
                                ++pos;
                                if (!escaped) {
                                    t->type = TOKEN_TYPE_STRING;
                                    string_done = true;
                                }
                            } break;

                            case '\\': {
                                ++pos;
                                if (!escaped) {
                                    next_escaped = true;
                                }
                            } break;

                            default: {
                                ++pos;
                            } break;
                        }
                    }
                } break;

                /* Keywords */
                case 't':
                case 'f':
                case 'n': {
                    struct string keyword = g_keyword_strings[src.text[pos]];

                    /* A keyword can only match if it fits entirely in the
                     * remaining input. `pos < src.len` holds here, so the
                     * subtraction cannot underflow. */
                    b32 match = false;
                    if (keyword.len <= (src.len - pos)) {
                        match = true;
                        if ((pos + keyword.len) < src.len) {
                            /* Don't match if word continues past keyword */
                            switch (src.text[pos + keyword.len]) {
                                CASE_SYMBOL:
                                CASE_SPACE:
                                CASE_NEWLINE: {
                                } break;

                                default: {
                                    match = false;
                                } break;
                            }
                        }
                        if (match) {
                            struct string cmp_str = {
                                .len = keyword.len,
                                .text = &src.text[pos]
                            };
                            match = string_eq(cmp_str, keyword);
                        }
                    }

                    if (match) {
                        t->type = g_keyword_types[src.text[pos]];
                        pos += keyword.len;
                    }
                } break;

                default: break;
            }
        }

        /* Lex unknown token */
        if (t->type == TOKEN_TYPE_UNKNOWN) {
            b32 unknown_done = false;
            while (!unknown_done && pos < src.len) {
                switch (src.text[pos]) {
                    CASE_SYMBOL:
                    CASE_SPACE:
                    CASE_NEWLINE: {
                        unknown_done = true;
                    } break;

                    default: {
                        ++pos;
                    } break;
                }
            }
            t->end = pos;

            /* Exit early if unknown token encountered */
            return res;
        } else {
            t->end = pos;
        }
    }

    return res;
}

/* ========================== *
 * Interpret
 * ========================== */

/* Pushes one byte onto `arena` and bumps `str->len`.
 *
 * `str->text` is not touched. NOTE(review): this presumably relies on
 * `str->text` already pointing at the spot where the arena places the first
 * pushed byte, and on no other pushes to `arena` happening in between --
 * confirm against the arena implementation. */
INTERNAL void append_char(struct arena *arena, struct string *str, u8 c)
{
    *arena_push(arena, u8) = c;
    ++str->len;
}

/* Converts the text of a number token (as produced by lex()) to an f64.
 *
 * The literal is split into whole, fraction and exponent digit runs, each run
 * is accumulated as an integer, and the parts are combined. Digit runs too
 * long for a u64 are not guarded against. */
INTERNAL f64 interpret_number(struct string src)
{
    b32 whole_present = false;
    u64 whole_left = 0;
    u64 whole_right = 0;
    i32 whole_sign = 1;

    b32 fraction_present = false;
    u64 fraction_left = 0;
    u64 fraction_right = 0;

    b32 exponent_present = false;
    u64 exponent_left = 0;
    u64 exponent_right = 0;
    i32 exponent_sign = 1;

    /* Lex number parts: record the [left, right] index range of each run */
    {
        u64 pos = 0;
        if (src.len > 0 && src.text[0] == '-') {
            whole_sign = -1;
            ++pos;
        }

        enum lex_number_state state = LEX_NUMBER_STATE_WHOLE;
        while (pos < src.len) {
            switch (src.text[pos]) {
                CASE_DIGIT: {
                    switch (state) {
                        case LEX_NUMBER_STATE_WHOLE: {
                            if (!whole_present) {
                                whole_present = true;
                                whole_left = pos;
                            }
                            whole_right = pos;
                            ++pos;
                        } break;

                        case LEX_NUMBER_STATE_FRACTION: {
                            if (!fraction_present) {
                                fraction_present = true;
                                fraction_left = pos;
                            }
                            fraction_right = pos;
                            ++pos;
                        } break;

                        case LEX_NUMBER_STATE_EXPONENT: {
                            if (!exponent_present) {
                                exponent_present = true;
                                exponent_left = pos;
                            }
                            exponent_right = pos;
                            ++pos;
                        } break;
                    }
                } break;

                case '.': {
                    state = LEX_NUMBER_STATE_FRACTION;
                    ++pos;
                } break;

                case 'e':
                case 'E': {
                    state = LEX_NUMBER_STATE_EXPONENT;
                    ++pos;
                } break;

                case '-': {
                    switch (state) {
                        case LEX_NUMBER_STATE_WHOLE: {
                            whole_sign = -1;
                            ++pos;
                        } break;

                        case LEX_NUMBER_STATE_EXPONENT: {
                            exponent_sign = -1;
                            ++pos;
                        } break;

                        default: {
                            /* Unreachable */
                            ASSERT(false);
                            ++pos;
                        } break;
                    }
                } break;

                case '+': {
                    switch (state) {
                        case LEX_NUMBER_STATE_EXPONENT: {
                            exponent_sign = 1;
                            ++pos;
                        } break;

                        default: {
                            /* Unreachable */
                            ASSERT(false);
                            ++pos;
                        } break;
                    }
                } break;

                default: {
                    /* Unreachable */
                    ASSERT(false);
                    ++pos;
                } break;
            }
        }
    }

    f64 res = 0;

    /* Process whole part */
    if (whole_present) {
        u64 pos = whole_left;
        while (pos <= whole_right) {
            u8 digit = min_u8(src.text[pos] - 48, 9);
            u64 exp = whole_right - pos;
            res += digit * math_pow_u64(10, exp);
            ++pos;
        }
    }

    /* Process fraction part */
    if (fraction_present) {
        u64 frac_whole = 0;
        u64 pos = fraction_left;
        while (pos <= fraction_right) {
            u8 digit = min_u8(src.text[pos] - 48, 9);
            u64 exp = fraction_right - pos;
            frac_whole += digit * math_pow_u64(10, exp);
            ++pos;
        }

        res += (f64)frac_whole / math_pow_u64(10, (fraction_right - fraction_left + 1));
    }

    /* Apply the sign to the combined magnitude. Applying it to the whole
     * part alone would turn "-1.5" into -1 + 0.5. */
    res *= whole_sign;

    /* Process exponent part */
    if (exponent_present) {
        u64 exponent_whole = 0;
        u64 pos = exponent_left;
        while (pos <= exponent_right) {
            u8 digit = min_u8(src.text[pos] - 48, 9);
            u64 exp = exponent_right - pos;
            exponent_whole += digit * math_pow_u64(10, exp);
            ++pos;
        }

        if (exponent_sign >= 0) {
            res *= math_pow_u64(10, exponent_whole);
        } else {
            res /= math_pow_u64(10, exponent_whole);
        }
    }

    return res;
}

/* Decodes the text of a string token (including its surrounding quotes) into
 * a new string whose bytes are pushed onto `arena`.
 *
 * Supported escapes: \" \\ \/ \b \f \n \r \t. Unicode escapes (\u) are not
 * supported and are reported as invalid.
 *
 * On failure a message is stored in `*error` (when `error` is non-NULL) and
 * whatever was decoded so far is returned. */
INTERNAL struct string interpret_string(struct arena *arena, struct string src, struct string *error)
{
    struct string res = {
        .len = 0,
        .text = arena_dry_push(arena, u8)
    };

    if (src.len < 2) {
        if (error) {
            *error = STR("Malformed string.");
        }
        return res;
    }

    /* Ignore beginning quote */
    u64 pos = 1;

    b32 valid_close = false;
    b32 string_done = false;
    b32 next_escaped = false;
    while (!string_done && pos < src.len) {
        b32 escaped = next_escaped;
        next_escaped = false;

        if (escaped) {
            switch (src.text[pos]) {
                case '"':
                case '\\':
                case '/': {
                    append_char(arena, &res, src.text[pos]);
                    ++pos;
                } break;

                /* Backspace */
                case 'b': {
                    append_char(arena, &res, '\b');
                    ++pos;
                } break;

                /* Formfeed */
                case 'f': {
                    append_char(arena, &res, '\f');
                    ++pos;
                } break;

                /* Linefeed */
                case 'n': {
                    append_char(arena, &res, '\n');
                    ++pos;
                } break;

                /* Carriage return */
                case 'r': {
                    append_char(arena, &res, '\r');
                    ++pos;
                } break;

                /* Horizontal tab */
                case 't': {
                    append_char(arena, &res, '\t');
                    ++pos;
                } break;

                /* TODO: Unicode escape support */
#if 0
                case 'u': {
                    /* TODO */
                } break;
#endif

                default: {
                    /* Bail out whether or not the caller wants the message */
                    if (error) {
                        *error = STR("Invalid escape character in string.");
                    }
                    return res;
                } break;
            }
        } else {
            switch (src.text[pos]) {
                case '\\': {
                    /* Mark the NEXT character as escaped. `escaped` is
                     * re-derived from `next_escaped` every iteration. */
                    next_escaped = true;
                    ++pos;
                } break;

                case '"': {
                    string_done = true;
                    valid_close = true;
                    ++pos;
                } break;

                default: {
                    append_char(arena, &res, src.text[pos]);
                    ++pos;
                } break;
            }
        }
    }

    if (!valid_close) {
        if (error) {
            *error = STR("Expected end of string.");
        }
    }

    return res;
}

/* ========================== *
 * Parse
 * ========================== */

/* Parser state: the source text, the current token, and collected errors. */
struct parser {
    struct string src;
    struct token *t;
    struct json_error_list errors;
};

/* Appends an error covering token `t`'s source range to `p->errors`.
 * The error node is allocated from `arena`. */
INTERNAL void push_error(struct arena *arena, struct parser *p, struct token *t, struct string msg)
{
    struct json_error *error = arena_push(arena, struct json_error);
    /* Fields not named in the compound literal are zero-initialized */
    *error = (struct json_error) {
        .msg = msg,
        .start = t->start,
        .end = t->end
    };

    struct json_error_list *list = &p->errors;
    if (!list->first) {
        list->first = error;
    } else {
        list->last->next = error;
    }
    list->last = error;
    ++list->count;
}

/* Parses one JSON value (with all nested children) starting at `p->t`.
 *
 * Nodes and decoded strings are allocated from `arena`. Nesting is handled
 * iteratively with an explicit stack of node pointers kept in a scratch
 * arena: a container is pushed back below its first child so that it is
 * revisited, to check for its closing brace, once its children are done.
 *
 * Parsing aborts at the first error, which is recorded in `p->errors`. The
 * (possibly partial) root node is always returned. */
INTERNAL struct json *parse(struct arena *arena, struct parser *p)
{
    struct temp_arena scratch = scratch_begin(arena);

    struct string src = p->src;
    struct json *res = arena_push_zero(arena, struct json);

    /* Depth first stack */
    *arena_push(scratch.arena, struct json *) = res;
    u64 stack_count = 1;

    while (stack_count > 0) {
        struct json *json = NULL;
        arena_pop(scratch.arena, struct json *, &json);
        --stack_count;

        struct json *parent_json = json->parent;
        b32 is_new_parent = false;

        if (json->type == JSON_TYPE_OBJECT || json->type == JSON_TYPE_ARRAY) {
            /* No more children to parse for object/array, check for closing brace. */
            enum token_type tok_close_type = json->type == JSON_TYPE_OBJECT ? TOKEN_TYPE_CURLY_BRACE_CLOSE : TOKEN_TYPE_SQUARE_BRACE_CLOSE;
            if (p->t->type == tok_close_type) {
                p->t = p->t->next;
            } else {
                push_error(arena, p, p->t, STR("Expected comma."));
                p->t = p->t->next;
                goto abort;
            }
        } else {
            if (parent_json) {
                if (parent_json->type == JSON_TYPE_OBJECT) {
                    /* Parse key */
                    if (p->t->type == TOKEN_TYPE_STRING) {
                        struct string t_text = (struct string) {
                            .len = p->t->end - p->t->start,
                            .text = &src.text[p->t->start]
                        };
                        struct string error = { 0 };
                        struct string key = interpret_string(arena, t_text, &error);
                        if (error.len > 0) {
                            push_error(arena, p, p->t, error);
                            goto abort;
                        } else {
                            json->key = key;
                            p->t = p->t->next;
                        }
                    } else {
                        push_error(arena, p, p->t, STR("Key expected."));
                        goto abort;
                    }

                    /* Parse colon */
                    if (p->t->type == TOKEN_TYPE_COLON) {
                        p->t = p->t->next;
                    } else {
                        push_error(arena, p, p->t, STR("Colon expected."));
                        goto abort;
                    }
                }

                /* Link into the parent's child list */
                if (parent_json->child_last) {
                    parent_json->child_last->next = json;
                } else {
                    parent_json->child_first = json;
                }
                parent_json->child_last = json;
            }

            /* Parse value */
            switch (p->t->type) {
                case TOKEN_TYPE_NUMBER: {
                    struct string t_text = (struct string) {
                        .len = p->t->end - p->t->start,
                        .text = &src.text[p->t->start]
                    };
                    f64 value = interpret_number(t_text);
                    json->type = JSON_TYPE_NUMBER;
                    json->value.number = value;
                    p->t = p->t->next;
                } break;

                case TOKEN_TYPE_STRING: {
                    struct string t_text = (struct string) {
                        .len = p->t->end - p->t->start,
                        .text = &src.text[p->t->start]
                    };
                    struct string error = { 0 };
                    struct string value = interpret_string(arena, t_text, &error);
                    if (error.len > 0) {
                        push_error(arena, p, p->t, error);
                        goto abort;
                    } else {
                        json->type = JSON_TYPE_STRING;
                        json->value.string = value;
                        p->t = p->t->next;
                    }
                } break;

                case TOKEN_TYPE_TRUE: {
                    json->type = JSON_TYPE_BOOL;
                    json->value.boolean = true;
                    p->t = p->t->next;
                } break;

                case TOKEN_TYPE_FALSE: {
                    json->type = JSON_TYPE_BOOL;
                    json->value.boolean = false;
                    p->t = p->t->next;
                } break;

                case TOKEN_TYPE_NULL: {
                    json->type = JSON_TYPE_NULL;
                    p->t = p->t->next;
                } break;

                case TOKEN_TYPE_CURLY_BRACE_OPEN: {
                    json->type = JSON_TYPE_OBJECT;
                    p->t = p->t->next;
                    is_new_parent = true;
                } break;

                case TOKEN_TYPE_SQUARE_BRACE_OPEN: {
                    json->type = JSON_TYPE_ARRAY;
                    p->t = p->t->next;
                    is_new_parent = true;
                } break;

                default: {
                    push_error(arena, p, p->t, STR("Value expected."));
                    p->t = p->t->next;
                    goto abort;
                } break;
            }
        }

        if (is_new_parent) {
            /* Push self back to stack to re-check for closing brace later */
            *arena_push(scratch.arena, struct json *) = json;
            ++stack_count;

            /* An immediately following closing brace means the container is
             * empty ("[]" / "{}"): create no child, so the re-check above
             * consumes the brace on the next iteration. */
            enum token_type tok_close_type = json->type == JSON_TYPE_OBJECT ? TOKEN_TYPE_CURLY_BRACE_CLOSE : TOKEN_TYPE_SQUARE_BRACE_CLOSE;
            if (p->t->type != tok_close_type) {
                /* Create child & push to stack. Zeroed because type, next
                 * and child links are read before they are assigned. */
                struct json *child = arena_push_zero(arena, struct json);
                child->parent = json;
                *arena_push(scratch.arena, struct json *) = child;
                ++stack_count;
            }
        } else if (parent_json) {
            /* Check for comma */
            if (p->t->type == TOKEN_TYPE_COMMA) {
                /* Create sibling & push to stack (zeroed, same reason as for
                 * a first child) */
                struct json *sibling = arena_push_zero(arena, struct json);
                sibling->parent = parent_json;
                *arena_push(scratch.arena, struct json *) = sibling;
                ++stack_count;
                p->t = p->t->next;
            }
        }
    }

    abort:

    scratch_end(scratch);
    return res;
}

/* ========================== *
 * Interface
 * ========================== */

/* Lexes and parses `src` as a single JSON value.
 *
 * Tokens are allocated from a scratch arena released before returning; the
 * resulting tree and any errors are allocated from `arena`. If parsing
 * succeeded but tokens remain after the root value, an "Expected end of
 * file." error is recorded. The caller should check `errors` in the result,
 * since `root` is returned even when errors occurred. */
struct json_parse_result json_from_string(struct arena *arena, struct string src)
{
    struct temp_arena scratch = scratch_begin(arena);

    struct lex_result lex_res = lex(scratch.arena, src);

    struct parser p = {
        .src = src,
        .t = lex_res.token_first,
        .errors = { 0 }
    };

    /* Parse root */
    struct json *root = parse(arena, &p);

    /* Verify end of file (p.t is only dereferenced when no error occurred) */
    if (p.errors.count == 0 && p.t->type != TOKEN_TYPE_EOF) {
        push_error(arena, &p, p.t, STR("Expected end of file."));
    }

    scratch_end(scratch);

    return (struct json_parse_result) {
        .root = root,
        .errors = p.errors
    };
}