// Copyright (c) 2019-2023 Alexander Medvednikov. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module json2

import strconv

// Scanner walks over the raw JSON bytes in `text` and produces tokens.
// `pos` is the index of the current byte; `line` and `col` are the position
// bookkeeping that gets copied into every token by `tokenize`.
struct Scanner {
mut:
	text []u8
	pos  int
	line int
	col  int
}

// TokenKind identifies the kind of a scanned token.
// The punctuation kinds carry the ASCII code of their character so that
// `scan` can cast the byte directly into the matching kind.
enum TokenKind {
	none_
	error
	str_
	float
	int_
	null
	bool_
	eof
	comma = 44
	colon = 58
	lsbr = 91
	rsbr = 93
	lcbr = 123
	rcbr = 125
}

// Token is a single lexical unit produced by the scanner.
// For `.error` tokens `lit` holds the error description.
pub struct Token {
	lit  []u8
	kind TokenKind
	line int
	col  int
}

// full_col returns the full column information which includes the length
pub fn (t Token) full_col() int {
	return t.col + t.lit.len
}

const (
	// list of characters commonly used in JSON.
	char_list                 = [`{`, `}`, `[`, `]`, `,`, `:`]
	// list of newlines to check when moving to a new position.
	// NOTE(review): `\t` is part of this list, so `move_pos` skips tabs through
	// the newline path and also increments `line` for them.
	newlines                  = [`\r`, `\n`, `\t`]
	// list of characters that need to be escaped inside a JSON string.
	// double quotes and forward slashes are excluded intentionally since
	// they have their own separate checks for it in order to pass the
	// JSON test suite (https://github.com/nst/JSONTestSuite/).
	important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]
	// list of valid unicode escapes aside from \u{4-hex digits}
	valid_unicode_escapes     = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]
	// used for transforming escapes into valid unicode (eg. n => \n)
	unicode_transform_escapes = {
		98:  `\b`
		102: `\f`
		110: `\n`
		114: `\r`
		116: `\t`
		92:  `\\`
		34:  `"`
		47:  `/`
	}
	exp_signs = [u8(`-`), `+`]
)

// move proceeds to the next position, skipping both spaces and newlines.
fn (mut s Scanner) move() {
	s.move_pos(true, true)
}

// move_pos_with_newlines is the same as move_pos but only enables newline checking.
fn (mut s Scanner) move_pos_with_newlines() {
	s.move_pos(false, true)
}

// move_pos advances the scanner by one byte and then skips the whitespace
// found at the new position: characters from `newlines` when
// `include_newlines` is set, spaces when `include_space` is set.
// `line` is incremented (and `col` reset) when a newline character is hit;
// `col` is incremented for skipped spaces and when the end of input is reached.
fn (mut s Scanner) move_pos(include_space bool, include_newlines bool) {
	s.pos++
	if s.pos < s.text.len {
		if include_newlines && s.text[s.pos] in json2.newlines {
			s.line++
			s.col = 0
			// treat `\r\n` as a single line break
			if s.text[s.pos] == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` {
				s.pos++
			}
			for s.pos < s.text.len && s.text[s.pos] in json2.newlines {
				s.move()
			}
		} else if include_space && s.text[s.pos] == ` ` {
			s.pos++
			s.col++
			for s.pos < s.text.len && s.text[s.pos] == ` ` {
				s.move()
			}
		}
	} else {
		s.col++
	}
}

// error returns an error token.
fn (s Scanner) error(description string) Token {
	return s.tokenize(description.bytes(), .error)
}

// tokenize returns a token based on the given lit and kind.
// The token is stamped with the scanner's current `line` and `col`.
fn (s Scanner) tokenize(lit []u8, kind TokenKind) Token {
	return Token{
		lit: lit
		kind: kind
		col: s.col
		line: s.line
	}
}

// text_scan scans and returns a string token.
// It must be called with `s.pos` on the opening double quote. Escape
// sequences are decoded into `lit`; an `.error` token is returned for
// unescaped control characters, malformed escapes and unterminated strings.
[manualfree]
fn (mut s Scanner) text_scan() Token {
	mut has_closed := false
	mut chrs := []u8{}
	for {
		s.pos++
		s.col++
		if s.pos >= s.text.len {
			break
		}
		ch := s.text[s.pos]
		// Every backslash below is consumed together with the character(s) it
		// escapes, so `ch` is never the tail of an escape sequence here and a
		// double quote always terminates the string. Looking at the previous
		// byte instead would misread the valid string `"\\"` as unterminated.
		if ch == `"` {
			has_closed = true
			break
		} else if ch in json2.important_escapable_chars {
			return s.error('character must be escaped with a backslash')
		} else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == u8(0) {
			return s.error('invalid backslash escape')
		} else if s.pos + 1 < s.text.len && ch == `\\` {
			peek := s.text[s.pos + 1]
			if peek in json2.valid_unicode_escapes {
				chrs << json2.unicode_transform_escapes[int(peek)]
				s.pos++
				s.col++
				continue
			} else if peek == `u` {
				// `\uXXXX` needs the `u` plus four hex digits after the backslash
				if s.pos + 5 < s.text.len {
					s.pos++
					s.col++
					mut codepoint := []u8{}
					codepoint_start := s.pos
					for s.pos < s.text.len && s.pos < codepoint_start + 4 {
						s.pos++
						s.col++
						if s.text[s.pos] == `"` {
							break
						} else if !s.text[s.pos].is_hex_digit() {
							x := s.text[s.pos].ascii_str()
							return s.error('`${x}` is not a hex digit')
						}
						codepoint << s.text[s.pos]
					}
					if codepoint.len != 4 {
						return s.error('unicode escape must have 4 hex digits')
					}
					val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
					converted := utf32_to_str(val)
					converted_bytes := converted.bytes()
					chrs << converted_bytes
					unsafe {
						converted.free()
						converted_bytes.free()
						codepoint.free()
					}
					continue
				} else {
					return s.error('incomplete unicode escape')
				}
			} else if peek == `U` {
				return s.error('unicode endpoints must be in lowercase `u`')
			} else if peek == u8(229) {
				return s.error('unicode endpoint not allowed')
			} else {
				return s.error('invalid backslash escape')
			}
		}
		chrs << ch
	}
	tok := s.tokenize(chrs, .str_)
	s.move()
	if !has_closed {
		return s.error('missing double quotes in string closing')
	}
	return tok
}

// num_advance moves past the current byte of a number and reports whether
// the scanner landed on the directly adjacent byte, i.e. whether no newline
// characters were skipped by `move_pos_with_newlines`.
fn (mut s Scanner) num_advance() bool {
	prev := s.pos
	s.move_pos_with_newlines()
	return s.pos == prev + 1
}

// num_scan scans and returns an int/float token.
// It must be called with `s.pos` on a digit or a minus sign. An `.error`
// token is returned for a lone `-`, leading zeroes, a dot without
// fractional digits and an exponent without digits.
fn (mut s Scanner) num_scan() Token {
	// analyze json number structure
	// -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
	mut is_fl := false
	mut dot_index := -1
	mut digits := []u8{}
	// becomes false as soon as whitespace was skipped while advancing; the
	// bytes after that point belong to the next token, not to this number.
	mut contiguous := true
	if s.text[s.pos] == `-` {
		digits << `-`
		// a trailing `-` has no following byte to inspect
		if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() {
			return s.invalid_token()
		}
		s.move_pos_with_newlines()
	}
	if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
		return s.error('leading zeroes in a number are not allowed')
	}
	for contiguous && s.pos < s.text.len
		&& (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
		digits << s.text[s.pos]
		if s.text[s.pos] == `.` {
			is_fl = true
			dot_index = digits.len - 1
		}
		contiguous = s.num_advance()
	}
	// the dot must be followed by at least one digit
	if is_fl && dot_index == digits.len - 1 {
		return s.error('invalid float')
	}
	if contiguous && s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
		digits << s.text[s.pos]
		contiguous = s.num_advance()
		if contiguous && s.pos < s.text.len && s.text[s.pos] in json2.exp_signs {
			digits << s.text[s.pos]
			contiguous = s.num_advance()
		}
		mut exp_digits_count := 0
		for contiguous && s.pos < s.text.len && s.text[s.pos].is_digit() {
			digits << s.text[s.pos]
			exp_digits_count++
			contiguous = s.num_advance()
		}
		if exp_digits_count == 0 {
			return s.error('invalid exponent')
		}
	}
	kind := if is_fl { TokenKind.float } else { TokenKind.int_ }
	return s.tokenize(digits, kind)
}

// invalid_token returns an error token with the invalid token message.
// Printable ASCII bytes are shown as-is, everything else in escaped form.
fn (s Scanner) invalid_token() Token {
	if s.text[s.pos] >= 32 && s.text[s.pos] <= 126 {
		x := s.text[s.pos].ascii_str()
		return s.error('invalid token `${x}`')
	} else {
		x := s.text[s.pos].str_escaped()
		return s.error('invalid token `${x}`')
	}
}

// scan returns a token based on the scanner's current position.
// Leading whitespace is skipped first; an `.eof` token is returned once the
// end of the input is reached.
[manualfree]
fn (mut s Scanner) scan() Token {
	if s.pos < s.text.len && (s.text[s.pos] == ` ` || s.text[s.pos] in json2.newlines) {
		s.move()
	}
	if s.pos >= s.text.len {
		return s.tokenize([]u8{}, .eof)
	} else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) {
		// `true` / `null`
		ident := s.text[s.pos..s.pos + 4].bytestr()
		if ident == 'true' || ident == 'null' {
			mut kind := TokenKind.null
			if ident == 'true' {
				kind = .bool_
			}
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 4]
			tok := s.tokenize(val, kind)
			s.move() // n / t
			s.move() // u / r
			s.move() // l / u
			s.move() // l / e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` {
		// `false`
		ident := s.text[s.pos..s.pos + 5].bytestr()
		if ident == 'false' {
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 5]
			tok := s.tokenize(val, .bool_)
			s.move() // f
			s.move() // a
			s.move() // l
			s.move() // s
			s.move() // e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.text[s.pos] in json2.char_list {
		// punctuation: the enum value equals the ASCII code of the character
		chr := s.text[s.pos]
		tok := s.tokenize([]u8{}, unsafe { TokenKind(int(chr)) })
		s.move()
		return tok
	} else if s.text[s.pos] == `"` {
		return s.text_scan()
	} else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` {
		return s.num_scan()
	} else {
		return s.invalid_token()
	}
}