// TODO (if we want to be JSON standard compliant): // - Support unicode escape sequences in strings (\u) // - Don't allow leading 0s in numbers //////////////////////////////////////////////////////////// //~ Lex JSON_Token *JSON_PushToken(Arena *arena, JSON_TokenList *list) { JSON_Token *t = PushStruct(arena, JSON_Token); if (!list->token_first) { list->token_first = t; } else { list->token_last->next = t; } list->token_last = t; return t; } JSON_TokenList JSON_TokensFromString(Arena *arena, String src) { JSON_TokenList result = ZI; JSON_Token *bof = JSON_PushToken(arena, &result); bof->kind = JSON_TokenKind_Bof; u64 pos = 0; b32 lexing_done = 0; while (!lexing_done) { // Skip whitespace b32 whitespace_done = 0; while (!whitespace_done && pos < src.len) { switch (src.text[pos]) { default: { whitespace_done = 1; } break; case JSON_Case_Newline: case JSON_Case_Space: { ++pos; } break; } } // Create token JSON_Token *t = JSON_PushToken(arena, &result); t->start = pos; if (pos >= src.len) { t->kind = JSON_TokenKind_Eof; t->next = t; // Self reference lexing_done = 1; } else { // Lex known token kinds switch (src.text[pos]) { default: break; // Symbols case ',': { t->kind = JSON_TokenKind_Comma; ++pos; } break; case ':': { t->kind = JSON_TokenKind_Colon; ++pos; } break; case '[': { t->kind = JSON_TokenKind_SquareBraceOpen; ++pos; } break; case ']': { t->kind = JSON_TokenKind_SquareBraceClose; ++pos; } break; case '{': { t->kind = JSON_TokenKind_CurlyBraceOpen; ++pos; } break; case '}': { t->kind = JSON_TokenKind_CurlyBraceClose; ++pos; } break; // Number case '-': { // Verify '-' precedes digit b32 next_is_digit = 0; if ((pos + 1) < src.len) { switch (src.text[pos + 1]) { case JSON_Case_Digit0Through9: { next_is_digit = 1; } break; } } ++pos; if (!next_is_digit) { break; } } FALLTHROUGH; case JSON_Case_Digit0Through9: { t->kind = JSON_TokenKind_Number; JSON_LexNumberState state = JSON_LexNumberState_Whole; b32 number_done = 0; while (!number_done && pos < src.len) { switch (src.text[pos]) { default: { number_done = 1; } break; case JSON_Case_Digit0Through9: { ++pos; } break; case '.': { u64 consume = 0; if (state == JSON_LexNumberState_Whole && (pos + 1) < src.len) { u8 c1 = src.text[pos + 1]; switch (c1) { default: break; case JSON_Case_Digit0Through9: { // Consume '.' ++consume; } break; } } if (consume) { state = JSON_LexNumberState_Fraction; pos += consume; } else { number_done = 1; } } break; case 'e': case 'E': { u64 consume = 0; if ((state == JSON_LexNumberState_Whole || state == JSON_LexNumberState_Fraction) && (pos + 1) < src.len) { u8 c1 = src.text[pos + 1]; switch (c1) { case JSON_Case_Digit0Through9: { // Consume 'E'/'e' ++consume; } break; case '-': case '+': { if ((pos + 2) < src.len) { u8 c2 = src.text[pos + 2]; switch (c2) { default: break; case JSON_Case_Digit0Through9: { // Consume 'E'/'e' & '+'/'-' consume += 2; } break; } } } break; default: break; } } if (consume) { state = JSON_LexNumberState_Exponent; pos += consume; } else { number_done = 1; } } break; } } } break; // String case '"': { ++pos; b32 string_done = 0; b32 next_escaped = 0; while (!string_done && pos < src.len) { b32 escaped = next_escaped; next_escaped = 0; switch (src.text[pos]) { default: { ++pos; } break; case JSON_Case_Newline: { ++pos; string_done = 1; } break; case '"': { ++pos; if (!escaped) { t->kind = JSON_TokenKind_String; string_done = 1; } } break; case '\\': { ++pos; if (!escaped) { next_escaped = 1; } } break; } } } break; // Keywords case 't': case 'f': case 'n': { String keyword = JSON_keyword_strings[src.text[pos]]; b32 match = 1; if ((pos + keyword.len - 1) < src.len) { if ((pos + keyword.len) < src.len) { // Don't match if word continues past keyword switch (src.text[pos + keyword.len]) { default: { match = 0; } break; case JSON_Case_Symbol: case JSON_Case_Space: case JSON_Case_Newline: { } break; } } if (match) { String cmp_str = { .len = keyword.len, .text = &src.text[pos] }; match = MatchString(cmp_str, keyword); } } if (match) { t->kind = JSON_keyword_types[src.text[pos]]; pos += keyword.len; } } break; } } // Lex unknown token if (t->kind == JSON_TokenKind_Unknown) { b32 unknown_done = 0; while (!unknown_done && pos < src.len) { switch (src.text[pos]) { default: { ++pos; } break; case JSON_Case_Symbol: case JSON_Case_Space: case JSON_Case_Newline: { unknown_done = 1; } break; } } t->end = pos; // Exit early if unknown token encountered return result; } else { t->end = pos; } } return result; } //////////////////////////////////////////////////////////// //~ Interpret f64 interpret_number(String src) { b32 whole_present = 0; u64 whole_left = 0; u64 whole_right = 0; i32 whole_sign = 1; b32 fraction_present = 0; u64 fraction_left = 0; u64 fraction_right = 0; b32 exponent_present = 0; u64 exponent_left = 0; u64 exponent_right = 0; i32 exponent_sign = 1; // Lex number parts { u64 pos = 0; if (src.len > 0 && src.text[0] == '-') { whole_sign = -1; ++pos; } JSON_LexNumberState state = JSON_LexNumberState_Whole; while (pos < src.len) { switch (src.text[pos]) { default: { // Unreachable Assert(0); ++pos; } break; case JSON_Case_Digit0Through9: { switch (state) { case JSON_LexNumberState_Whole: { if (!whole_present) { whole_present = 1; whole_left = pos; } whole_right = pos; ++pos; } break; case JSON_LexNumberState_Fraction: { if (!fraction_present) { fraction_present = 1; fraction_left = pos; } fraction_right = pos; ++pos; } break; case JSON_LexNumberState_Exponent: { if (!exponent_present) { exponent_present = 1; exponent_left = pos; } exponent_right = pos; ++pos; } break; } } break; case '.': { state = JSON_LexNumberState_Fraction; ++pos; } break; case 'e': case 'E': { state = JSON_LexNumberState_Exponent; ++pos; } break; case '-': { switch (state) { default: { // Unreachable Assert(0); ++pos; } break; case JSON_LexNumberState_Whole: { whole_sign = -1; ++pos; } break; case JSON_LexNumberState_Exponent: { exponent_sign = -1; ++pos; } break; } } break; case '+': { switch (state) { default: { // Unreachable Assert(0); ++pos; } break; case JSON_LexNumberState_Exponent: { exponent_sign = 1; ++pos; } break; } } break; } } } f64 result = 0; // Process whole part if (whole_present) { u64 pos = whole_left; while (pos <= whole_right) { u8 digit = MinU8(src.text[pos] - 48, 9); u64 exp = whole_right - pos; result += digit * PowU64(10, exp); ++pos; } result *= whole_sign; } // Process fraction part if (fraction_present) { u64 frac_whole = 0; u64 pos = fraction_left; while (pos <= fraction_right) { u8 digit = MinU8(src.text[pos] - 48, 9); u64 exp = fraction_right - pos; frac_whole += digit * PowU64(10, exp); ++pos; } result += (f64)frac_whole / PowU64(10, (fraction_right - fraction_left + 1)); } // Process exponent part if (exponent_present) { u64 exponent_whole = 0; u64 pos = exponent_left; while (pos <= exponent_right) { u8 digit = MinU8(src.text[pos] - 48, 9); u64 exp = exponent_right - pos; exponent_whole += digit * PowU64(10, exp); ++pos; } if (exponent_sign >= 0) { result *= PowU64(10, exponent_whole); } else { result /= PowU64(10, exponent_whole); } } return result; } String interpret_string(Arena *arena, String src, String *error) { String result = { .len = 0, .text = PushDry(arena, u8) }; if (src.len < 2) { if (error) { *error = Lit("Malformed string."); } return result; } // Ignore beginning quote u64 pos = 1; b32 valid_close = 0; b32 string_done = 0; b32 next_escaped = 0; while (!string_done && pos < src.len) { b32 escaped = next_escaped; next_escaped = 0; if (escaped) { switch (src.text[pos]) { default: { if (error) { *error = Lit("Invalid escape character in string."); return result; } } break; case '"': case '\\': case '/': { *PushStructNoZero(arena, u8) = src.text[pos]; ++result.len; ++pos; } break; // Backspace case 'b': { *PushStructNoZero(arena, u8) = '\b'; ++result.len; ++pos; } break; // Formfeed case 'f': { *PushStructNoZero(arena, u8) = '\f'; ++result.len; ++pos; } break; // Linefeed case 'n': { *PushStructNoZero(arena, u8) = '\n'; ++result.len; ++pos; } break; // Carriage return case 'r': { *PushStructNoZero(arena, u8) = '\r'; ++result.len; ++pos; } break; // Horizontal tab case 't': { *PushStructNoZero(arena, u8) = '\t'; ++result.len; ++pos; } break; // TODO: Unicode escape support // case 'u': // { // // TODO // } break; } } else { switch (src.text[pos]) { default: { *PushStructNoZero(arena, u8) = src.text[pos]; ++result.len; ++pos; } break; case '\\': { escaped = 1; ++pos; } break; case '"': { string_done = 1; valid_close = 1; ++pos; } break; } } } if (!valid_close) { if (error) { *error = Lit("Expected end of string."); } } return result; } //////////////////////////////////////////////////////////// //~ Parse void JSON_PushError(Arena *arena, JSON_Parser *p, JSON_Token *t, String msg) { JSON_Error *error = PushStruct(arena, JSON_Error); error->msg = msg; error->start = t->start; error->end = t->end; JSON_ErrorList *list = &p->errors; if (!list->first) { list->first = error; } else { list->last->next = error; } list->last = error; ++list->count; } void JSON_Parse(Arena *arena, JSON_Parser *p) { TempArena scratch = BeginScratch(arena); JSON_Blob *root = PushStruct(arena, JSON_Blob); JSON_Token *at = p->at; String src = p->src; if (at->kind == JSON_TokenKind_Bof) { at = at->next; } // Depth first stack *PushStructNoZero(scratch.arena, JSON_Blob *) = root; u64 stack_count = 1; while (stack_count > 0) { JSON_Blob *json = 0; PopStruct(scratch.arena, JSON_Blob *, &json); --stack_count; JSON_Blob *parent_json = json->parent; b32 is_new_parent = 0; if (json->type == JSON_Type_Object || json->type == JSON_Type_Array) { // No more children to parse for object/array, check for closing brace. JSON_TokenKind tok_close_kind = json->type == JSON_Type_Object ? JSON_TokenKind_CurlyBraceClose : JSON_TokenKind_SquareBraceClose; if (at->kind == tok_close_kind) { at = at->next; } else { JSON_PushError(arena, p, at, Lit("Expected comma.")); at = at->next; goto abort; } } else { if (parent_json) { if (parent_json->type == JSON_Type_Object) { // Parse key if (at->kind == JSON_TokenKind_String) { String t_text = (String) { .len = at->end - at->start, .text = &src.text[at->start] }; String error = ZI; String key = interpret_string(arena, t_text, &error); if (error.len > 0) { JSON_PushError(arena, p, at, error); goto abort; } else { json->key = key; at = at->next; } } else { JSON_PushError(arena, p, at, Lit("Key expected.")); goto abort; } // Parse colon if (at->kind == JSON_TokenKind_Colon) { at = at->next; } else { JSON_PushError(arena, p, at, Lit("Colon expected.")); goto abort; } } if (parent_json->child_last) { parent_json->child_last->next = json; } else { parent_json->child_first = json; } parent_json->child_last = json; } // Parse value switch (at->kind) { default: { JSON_PushError(arena, p, at, Lit("Value expected.")); at = at->next; goto abort; } break; case JSON_TokenKind_Number: { String t_text = STRING(at->end - at->start, &src.text[at->start]); f64 value = interpret_number(t_text); json->type = JSON_Type_Number; json->value.number = value; at = at->next; } break; case JSON_TokenKind_String: { String t_text = STRING(at->end - at->start, &src.text[at->start]); String error = ZI; String value = interpret_string(arena, t_text, &error); if (error.len > 0) { JSON_PushError(arena, p, at, error); goto abort; } else { json->type = JSON_Type_String; json->value.string = value; at = at->next; } } break; case JSON_TokenKind_KeywordTrue: { json->type = JSON_Type_Bool; json->value.boolean = 1; at = at->next; } break; case JSON_TokenKind_KeywordFalse: { json->type = JSON_Type_Bool; json->value.boolean = 0; at = at->next; } break; case JSON_TokenKind_KeywordNull: { json->type = JSON_Type_Null; at = at->next; } break; case JSON_TokenKind_CurlyBraceOpen: { json->type = JSON_Type_Object; at = at->next; is_new_parent = 1; } break; case JSON_TokenKind_SquareBraceOpen: { json->type = JSON_Type_Array; at = at->next; is_new_parent = 1; } break; } } if (is_new_parent) { // Push self back to stack to re-check for closing brace later *PushStructNoZero(scratch.arena, JSON_Blob *) = json; ++stack_count; // Create child & push to stack JSON_Blob *child = PushStruct(arena, JSON_Blob); child->parent = json; *PushStructNoZero(scratch.arena, JSON_Blob *) = child; ++stack_count; } else if (parent_json) { // Check for comma if (at->kind == JSON_TokenKind_Comma) { // Create sibling & push to stack JSON_Blob *sibling = PushStruct(arena, JSON_Blob); sibling->parent = parent_json; *PushStructNoZero(scratch.arena, JSON_Blob *) = sibling; ++stack_count; at = at->next; } } } abort: p->at = at; p->root = root; EndScratch(scratch); } JSON_Result JSON_BlobFromString(Arena *arena, String src) { TempArena scratch = BeginScratch(arena); JSON_TokenList tl = JSON_TokensFromString(scratch.arena, src); // Parse root JSON_Parser p = ZI; p.src = src; p.at = tl.token_first; JSON_Parse(arena, &p); // Verify end of file if (p.errors.count == 0 && p.at->kind != JSON_TokenKind_Eof) { JSON_PushError(arena, &p, p.at, Lit("Expected end of file.")); } EndScratch(scratch); JSON_Result result = ZI; result.root = p.root; result.errors = p.errors; return result; }