2022-01-04 12:21:08 +03:00
|
|
|
// Copyright (c) 2019-2022 Alexander Medvednikov. All rights reserved.
|
2021-02-26 09:36:02 +03:00
|
|
|
// Use of this source code is governed by an MIT license
|
|
|
|
// that can be found in the LICENSE file.
|
|
|
|
module json2
|
|
|
|
|
|
|
|
import strconv
|
|
|
|
|
|
|
|
// Scanner holds the JSON input together with the cursor state used while
// splitting it into tokens.
struct Scanner {
mut:
	// raw bytes of the input being scanned
	text []u8
	// index into `text` of the byte currently being looked at
	pos int
	// line counter; move_pos bumps it when it reaches a byte listed in `newlines`
	line int
	// column counter; move_pos resets it to 0 whenever `line` is bumped
	col int
}
|
|
|
|
|
|
|
|
// TokenKind lists every category of token the scanner can produce.
enum TokenKind {
	none_
	error
	str_
	float
	int_
	null
	bool_
	eof
	// The punctuation kinds below are numbered with the ASCII code of the
	// character they stand for, because scan() casts the byte it reads
	// straight into a TokenKind.
	// `,`
	comma = 44
	// `:`
	colon = 58
	// `[`
	lsbr = 91
	// `]`
	rsbr = 93
	// `{`
	lcbr = 123
	// `}`
	rcbr = 125
}
|
|
|
|
|
2022-03-05 14:02:43 +03:00
|
|
|
// Token is a single lexical unit produced by the scanner.
pub struct Token {
	// bytes belonging to the token; error tokens carry their message here,
	// punctuation and eof tokens are created with an empty literal
	lit []u8
	// category of the token
	kind TokenKind
	// scanner line at the moment the token was created
	line int
	// scanner column at the moment the token was created
	col int
}
|
|
|
|
|
2022-03-05 14:02:43 +03:00
|
|
|
// full_col returns the token's column with the length of its literal added,
// i.e. `col + lit.len`.
pub fn (t Token) full_col() int {
	lit_len := t.lit.len
	return t.col + lit_len
}
|
|
|
|
|
2021-02-26 09:36:02 +03:00
|
|
|
// structural characters of JSON; scan() turns each of them into its own token.
const char_list = [`{`, `}`, `[`, `]`, `,`, `:`]

// bytes that are handled as line breaks while moving to a new position.
// Tab is part of this list too, so it goes through the same code path.
const newlines = [`\r`, `\n`, `\t`]

// control characters that have to be escaped inside a JSON string.
// Double quotes and forward slashes are left out on purpose: they have
// their own separate checks in order to pass the JSON test suite
// (https://github.com/nst/JSONTestSuite/).
const important_escapable_chars = [`\b`, `\f`, `\n`, `\r`, `\t`]

// characters that may follow a backslash, aside from `u` + 4 hex digits.
const valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]

// maps the ASCII code of an escape character to the character it produces
// (eg. n => \n). The keys are, in order: b, f, n, r, t, backslash,
// double quote and forward slash.
const unicode_transform_escapes = {
	98:  `\b`
	102: `\f`
	110: `\n`
	114: `\r`
	116: `\t`
	92:  `\\`
	34:  `"`
	47:  `/`
}

// signs accepted right after the exponent marker of a number.
const exp_signs = [u8(`-`), `+`]
|
|
|
|
|
|
|
|
// move proceeds to the next position. It is move_pos with both space and
// newline handling switched on.
fn (mut s Scanner) move() {
	s.move_pos(true, true)
}
|
|
|
|
|
|
|
|
// move_pos_with_newlines proceeds to the next position like move_pos does,
// with newline handling switched on and space handling switched off.
fn (mut s Scanner) move_pos_with_newlines() {
	s.move_pos(false, true)
}
|
|
|
|
|
2021-03-30 10:40:20 +03:00
|
|
|
// move_pos advances the scanner by one byte and then, depending on the
// flags, keeps advancing over whitespace that starts at the new position.
// include_space enables the handling of ` `, include_newlines enables the
// handling of the bytes listed in `newlines`.
fn (mut s Scanner) move_pos(include_space bool, include_newlines bool) {
	s.pos++
	if s.pos >= s.text.len {
		// stepped past the end of the input: only the column is bumped
		s.col++
		return
	}
	cur := s.text[s.pos]
	if include_newlines && cur in json2.newlines {
		s.line++
		s.col = 0
		// step onto the `\n` of a `\r\n` pair
		if cur == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` {
			s.pos++
		}
		// move() is used here, so spaces following the line break are consumed as well
		for s.pos < s.text.len && s.text[s.pos] in json2.newlines {
			s.move()
		}
		return
	}
	if include_space && cur == ` ` {
		s.pos++
		s.col++
		for s.pos < s.text.len && s.text[s.pos] == ` ` {
			s.move()
		}
	}
}
|
|
|
|
|
|
|
|
// error returns a token of kind `.error` whose literal holds the bytes of
// `description`.
fn (s Scanner) error(description string) Token {
	message := description.bytes()
	return s.tokenize(message, .error)
}
|
|
|
|
|
|
|
|
// tokenize builds a token from the given lit and kind, stamping it with
// the scanner's current line and column.
fn (s Scanner) tokenize(lit []u8, kind TokenKind) Token {
	tok := Token{
		lit: lit
		kind: kind
		line: s.line
		col: s.col
	}
	return tok
}
|
|
|
|
|
|
|
|
// text_scan scans and returns a string token.
// It expects the scanner to be positioned on the opening double quote and
// collects the decoded bytes up to the closing one. An error token is
// returned for raw control characters, malformed escapes and strings that
// are never closed.
[manualfree]
fn (mut s Scanner) text_scan() Token {
	mut has_closed := false
	mut chrs := []u8{}
	for {
		s.pos++
		s.col++
		if s.pos >= s.text.len {
			break
		}
		ch := s.text[s.pos]
		// Every backslash is consumed together with the character(s) it
		// escapes by the escape branch below, so a byte that reaches this
		// point is never escaped. Looking at the previous byte instead would
		// mistake the second half of `\\` for an escape introducer and miss
		// the closing quote of strings such as "a\\".
		if ch == `"` {
			has_closed = true
			break
		} else if ch in json2.important_escapable_chars {
			return s.error('character must be escaped with a backslash')
		} else if (s.pos == s.text.len - 1 && ch == `\\`) || ch == u8(0) {
			return s.error('invalid backslash escape')
		} else if s.pos + 1 < s.text.len && ch == `\\` {
			peek := s.text[s.pos + 1]
			if peek in json2.valid_unicode_escapes {
				chrs << json2.unicode_transform_escapes[int(peek)]
				// skip the escaped character
				s.pos++
				s.col++
				continue
			} else if peek == `u` {
				// the backslash must be followed by `u` and 4 more bytes
				if s.pos + 5 < s.text.len {
					// step onto the `u`
					s.pos++
					s.col++
					mut codepoint := []u8{}
					codepoint_start := s.pos
					for s.pos < s.text.len && s.pos < codepoint_start + 4 {
						s.pos++
						s.col++
						if s.text[s.pos] == `"` {
							break
						} else if !s.text[s.pos].is_hex_digit() {
							x := s.text[s.pos].ascii_str()
							return s.error('`$x` is not a hex digit')
						}
						codepoint << s.text[s.pos]
					}
					if codepoint.len != 4 {
						return s.error('unicode escape must have 4 hex digits')
					}
					// NOTE(review): each \uXXXX is converted on its own, so a
					// UTF-16 surrogate pair is not combined into one code point
					// here - confirm whether callers rely on that.
					val := u32(strconv.parse_uint(codepoint.bytestr(), 16, 32) or { 0 })
					converted := utf32_to_str(val)
					converted_bytes := converted.bytes()
					chrs << converted_bytes
					unsafe {
						converted.free()
						converted_bytes.free()
						codepoint.free()
					}
					continue
				} else {
					return s.error('incomplete unicode escape')
				}
			} else if peek == `U` {
				return s.error('unicode endpoints must be in lowercase `u`')
			} else if peek == u8(229) {
				return s.error('unicode endpoint not allowed')
			} else {
				return s.error('invalid backslash escape')
			}
		}
		chrs << ch
	}
	// the token is created before moving so that it carries the position of
	// the end of the string
	tok := s.tokenize(chrs, .str_)
	s.move()
	if !has_closed {
		return s.error('missing double quotes in string closing')
	}
	return tok
}
|
|
|
|
|
|
|
|
// num_scan scans and returns an int/float token.
// It expects the scanner to be positioned on a digit or on `-`. The token
// kind is `.float` when the number contains a dot and `.int_` otherwise.
// An error token is returned for a lone `-`, leading zeroes, a dot without
// digits after it and an exponent without digits.
fn (mut s Scanner) num_scan() Token {
	// analyze json number structure
	// -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
	mut is_fl := false
	mut dot_index := -1
	mut digits := []u8{}
	if s.text[s.pos] == `-` {
		digits << `-`
		// the bounds check keeps a `-` at the very end of the input from
		// indexing past the text
		if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() {
			return s.invalid_token()
		}
		s.move_pos_with_newlines()
	}
	if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
		return s.error('leading zeroes in a number are not allowed')
	}
	// NOTE(review): move_pos_with_newlines also steps over bytes listed in
	// `newlines`, so digits separated only by such bytes end up in the same
	// token - confirm that this is intended.
	for s.pos < s.text.len && (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
		digits << s.text[s.pos]
		if s.text[s.pos] == `.` {
			is_fl = true
			dot_index = digits.len - 1
		}
		s.move_pos_with_newlines()
	}
	// a dot must be followed by at least one digit; the check is made against
	// the collected digits so that it also fires when the number is the last
	// thing in the input
	if dot_index >= 0 && dot_index + 1 >= digits.len {
		return s.error('invalid float')
	}
	if s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
		digits << s.text[s.pos]
		s.move_pos_with_newlines()
		if s.pos < s.text.len && s.text[s.pos] in json2.exp_signs {
			digits << s.text[s.pos]
			s.move_pos_with_newlines()
		}
		mut exp_digits_count := 0
		for s.pos < s.text.len && s.text[s.pos].is_digit() {
			digits << s.text[s.pos]
			exp_digits_count++
			s.move_pos_with_newlines()
		}
		if exp_digits_count == 0 {
			return s.error('invalid exponent')
		}
	}
	kind := if is_fl { TokenKind.float } else { TokenKind.int_ }
	return s.tokenize(digits, kind)
}
|
|
|
|
|
|
|
|
// invalid_token returns an error token with the invalid token message for
// the byte at the current position. Printable ASCII bytes (32..126) are
// shown as they are, any other byte is shown in its escaped form.
fn (s Scanner) invalid_token() Token {
	ch := s.text[s.pos]
	x := if ch >= 32 && ch <= 126 { ch.ascii_str() } else { ch.str_escaped() }
	return s.error('invalid token `$x`')
}
|
|
|
|
|
|
|
|
// scan returns a token based on the scanner's current position.
// Whitespace in front of the token is skipped first. The result is an
// `.eof` token at the end of the input, a `.bool_`/`.null` token for the
// literals `true`, `false` and `null`, a punctuation token for the
// characters in `char_list`, whatever text_scan/num_scan return for
// strings and numbers, and an error token for anything else.
[manualfree]
fn (mut s Scanner) scan() Token {
	// A single move() is not always enough: a space directly followed by a
	// line break (e.g. "\n \n1") leaves the scanner on a whitespace byte, so
	// keep moving until a non-whitespace byte or the end is reached.
	for s.pos < s.text.len && (s.text[s.pos] == ` ` || s.text[s.pos] in json2.newlines) {
		s.move()
	}
	if s.pos >= s.text.len {
		return s.tokenize([]u8{}, .eof)
	} else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) {
		ident := s.text[s.pos..s.pos + 4].bytestr()
		if ident == 'true' || ident == 'null' {
			mut kind := TokenKind.null
			if ident == 'true' {
				kind = .bool_
			}
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 4]
			tok := s.tokenize(val, kind)
			s.move() // n / t
			s.move() // u / r
			s.move() // l / u
			s.move() // l / e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` {
		ident := s.text[s.pos..s.pos + 5].bytestr()
		if ident == 'false' {
			unsafe { ident.free() }
			val := s.text[s.pos..s.pos + 5]
			tok := s.tokenize(val, .bool_)
			s.move() // f
			s.move() // a
			s.move() // l
			s.move() // s
			s.move() // e
			return tok
		}
		unsafe { ident.free() }
		return s.invalid_token()
	} else if s.text[s.pos] in json2.char_list {
		chr := s.text[s.pos]
		// the punctuation kinds of TokenKind are numbered with the ASCII code
		// of their character, which makes this cast valid
		tok := s.tokenize([]u8{}, unsafe { TokenKind(int(chr)) })
		s.move()
		return tok
	} else if s.text[s.pos] == `"` {
		return s.text_scan()
	} else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` {
		return s.num_scan()
	} else {
		return s.invalid_token()
	}
}
|