diff --git a/vlib/x/json2/any_test.v b/vlib/x/json2/any_test.v index 5e042af05f..6f8690055d 100644 --- a/vlib/x/json2/any_test.v +++ b/vlib/x/json2/any_test.v @@ -1,16 +1,16 @@ import x.json2 const ( - sample_data = { - 'int': json2.Any(int(1)) - 'i64': json2.Any(i64(128)) - 'f32': json2.Any(f32(2.0)) - 'f64': json2.Any(f64(1.283)) + sample_data = map{ + 'int': json2.Any(int(1)) + 'i64': json2.Any(i64(128)) + 'f32': json2.Any(f32(2.0)) + 'f64': json2.Any(f64(1.283)) 'bool': json2.Any(false) - 'str': json2.Any('test') + 'str': json2.Any('test') 'null': json2.Any(json2.null) - 'arr': json2.Any([json2.Any('lol')]) - 'obj': json2.Any({ + 'arr': json2.Any([json2.Any('lol')]) + 'obj': json2.Any(map{ 'foo': json2.Any(10) }) } @@ -126,6 +126,5 @@ fn test_str() { assert sample_data['str'].str() == 'test' assert sample_data['null'].str() == 'null' assert sample_data['arr'].str() == '["lol"]' - assert sample_data.str() == - '{"int":1,"i64":128,"f32":2.0,"f64":1.283,"bool":false,"str":"test","null":null,"arr":["lol"],"obj":{"foo":10}}' + assert sample_data.str() == '{"int":1,"i64":128,"f32":2.0,"f64":1.283,"bool":false,"str":"test","null":null,"arr":["lol"],"obj":{"foo":10}}' } diff --git a/vlib/x/json2/decoder.v b/vlib/x/json2/decoder.v index b88d4be3d2..8837495b8d 100644 --- a/vlib/x/json2/decoder.v +++ b/vlib/x/json2/decoder.v @@ -3,42 +3,20 @@ // that can be found in the LICENSE file. module json2 -import strings -import strconv -import v.scanner -import v.token -import v.util -import v.pref - // `Any` is a sum type that lists the possible types to be decoded and used. -pub type Any = string | int | i64 | f32 | f64 | bool | Null | []Any | map[string]Any +pub type Any = Null | []Any | bool | f32 | f64 | i64 | int | map[string]Any | string // `Null` struct is a simple representation of the `null` value in JSON. pub struct Null { + is_null bool = true } -enum ParseMode { - array - bool - invalid - null - number - object - string -} - -const ( - formfeed_err = 'formfeed not allowed.' - eof_err = 'reached eof. data not closed properly.' -) - struct Parser { mut: - scanner &scanner.Scanner - p_tok token.Token - tok token.Token - n_tok token.Token - mode ParseMode = .invalid + scanner &Scanner + p_tok Token + tok Token + n_tok Token n_level int convert_type bool = true } @@ -49,131 +27,63 @@ fn (mut p Parser) next() { p.n_tok = p.scanner.scan() } -fn (p Parser) emit_error(msg string) string { - source := p.scanner.text - cur := p.tok - mut pp := util.imax(0, util.imin(source.len - 1, cur.pos)) - if source.len > 0 { - for pp >= 0 { - if source[pp] !in [`\r`, `\n`] { - pp-- - continue - } - break - } +fn (mut p Parser) next_with_err() ? { + p.next() + if p.tok.kind == .error { + return error(p.emit_error(p.tok.lit.bytestr())) } - column := util.imax(0, cur.pos - pp + cur.len - 1) - line := cur.line_nr - return '[json] $msg ($line:$column)' } -fn new_parser(srce string, convert_type bool) Parser { - mut src := srce - // from v/util/util.v - if src.len >= 3 { - c_text := src.str +fn (p Parser) emit_error(msg string) string { + line := p.tok.line + column := p.tok.col + p.tok.lit.len + return '[x.json2] $msg ($line:$column)' +} + +// TODO: copied from v.util to avoid the entire module and its functions +// from being imported. remove later once -skip-unused is enabled by default. +fn skip_bom(file_content string) string { + mut raw_text := file_content + // BOM check + if raw_text.len >= 3 { unsafe { + c_text := raw_text.str if c_text[0] == 0xEF && c_text[1] == 0xBB && c_text[2] == 0xBF { // skip three BOM bytes offset_from_begin := 3 - src = tos(c_text[offset_from_begin], vstrlen(c_text) - offset_from_begin) + raw_text = tos(c_text[offset_from_begin], vstrlen(c_text) - offset_from_begin) } } } + return raw_text +} + +fn new_parser(srce string, convert_type bool) Parser { + src := skip_bom(srce) return Parser{ - scanner: scanner.new_scanner(src, .parse_comments, &pref.Preferences{output_mode: .silent}) + scanner: &Scanner{ + text: src.bytes() + } convert_type: convert_type } } -fn check_valid_hex(str string) ? { - if str.len != 4 { - return error('hex string must be 4 characters.') - } - for l in str { - if l.is_hex_digit() { - continue - } - return error('provided string is not a hex digit.') - } -} - fn (mut p Parser) decode() ?Any { - p.detect_parse_mode() - if p.mode == .invalid { - return error(p.emit_error('invalid JSON.')) - } - fi := p.decode_value() or { - return error(p.emit_error(err)) - } + p.next() + p.next_with_err() ? + fi := p.decode_value() ? if p.tok.kind != .eof { - return error(p.emit_error('unknown token `$p.tok.kind`.')) + return error(p.emit_error('invalid token `$p.tok.kind`')) } return fi } -fn (p Parser) is_formfeed() bool { - prev_tok_pos := p.p_tok.pos + p.p_tok.len - 2 - if prev_tok_pos < p.scanner.text.len && p.scanner.text[prev_tok_pos] == 0x0c { - return true - } - return false -} - -fn (p Parser) is_singlequote() bool { - src := p.scanner.text - prev_tok_pos := p.p_tok.pos + p.p_tok.len - return src[prev_tok_pos] == `\'` -} - -fn (mut p Parser) detect_parse_mode() { - src := p.scanner.text - if src.len > 1 && src[0].is_digit() && !src[1].is_digit() { - p.mode = .invalid - return - } - p.tok = p.scanner.scan() - p.n_tok = p.scanner.scan() - if src.len == 1 && p.tok.kind == .string && p.n_tok.kind == .eof { - p.mode = .invalid - return - } - match p.tok.kind { - .lcbr { - p.mode = .object - } - .lsbr { - p.mode = .array - } - .number { - p.mode = .number - } - .key_true, .key_false { - p.mode = .bool - } - .string { - p.mode = .string - } - .name { - if p.tok.lit == 'null' { - p.mode = .null - } - } - .minus { - if p.n_tok.kind == .number { - p.mode = .number - } - } - else {} - } -} - fn (mut p Parser) decode_value() ?Any { if p.n_level == 500 { - return error('reached maximum nesting level of 500.') + return error(p.emit_error('reached maximum nesting level of 500')) } - if (p.tok.kind == .lsbr && p.n_tok.kind == .lcbr) || - (p.p_tok.kind == p.tok.kind && p.tok.kind == .lsbr) { + if (p.tok.kind == .lsbr && p.n_tok.kind == .lcbr) + || (p.p_tok.kind == p.tok.kind && p.tok.kind == .lsbr) { p.n_level++ } match p.tok.kind { @@ -183,235 +93,76 @@ fn (mut p Parser) decode_value() ?Any { .lcbr { return p.decode_object() } - .number { - return p.decode_number() + .int_, .float { + tl := p.tok.lit.bytestr() + kind := p.tok.kind + p.next_with_err() ? + if p.convert_type { + return if kind == .float { Any(tl.f64()) } else { Any(tl.i64()) } + } + return Any(tl) } - .key_true { - p.next() - return if p.convert_type { - Any(true) - } else { - Any('true') - } + .bool_ { + lit := p.tok.lit.bytestr() + p.next_with_err() ? + return if p.convert_type { Any(lit.bool()) } else { Any(lit) } } - .key_false { - p.next() - return if p.convert_type { - Any(false) - } else { - Any('false') - } + .null { + p.next_with_err() ? + return if p.convert_type { Any(null) } else { Any('null') } } - .name { - if p.tok.lit != 'null' { - return error('unknown identifier `$p.tok.lit`') - } - p.next() - return if p.convert_type { - Any(Null{}) - } else { - Any('null') - } - } - .string { - if p.is_singlequote() { - return error('strings must be in double-quotes.') - } - return p.decode_string() + .str_ { + str := p.tok.lit.bytestr() + p.next_with_err() ? + return Any(str) } else { - if p.tok.kind == .minus && p.n_tok.kind == .number && p.n_tok.pos == p.tok.pos + 1 { - p.next() - d_num := p.decode_number() ? - return d_num - } - return error("unknown token '$p.tok.lit' when decoding value") + return error(p.emit_error('invalid token `$p.tok.kind`')) } } - if p.is_formfeed() { - return error(formfeed_err) - } return Any{} } -fn (mut p Parser) decode_string() ?Any { - mut strwr := strings.new_builder(200) - for i := 0; i < p.tok.lit.len; i++ { - if ((i - 1 >= 0 && p.tok.lit[i - 1] != `/`) || i == 0) && int(p.tok.lit[i]) in [9, 10, 0] { - return error('character must be escaped with a backslash.') - } - if i == p.tok.lit.len - 1 && p.tok.lit[i] == 92 { - return error('invalid backslash escape.') - } - if i + 1 < p.tok.lit.len && p.tok.lit[i] == 92 { - peek := p.tok.lit[i + 1] - match peek { - `b` { - i++ - strwr.write_b(`\b`) - continue - } - `f` { - i++ - strwr.write_b(`\f`) - continue - } - `n` { - i++ - strwr.write_b(`\n`) - continue - } - `r` { - i++ - strwr.write_b(`\r`) - continue - } - `t` { - i++ - strwr.write_b(`\t`) - continue - } - `u` { - if i + 5 < p.tok.lit.len { - codepoint := p.tok.lit[i + 2..i + 6] - check_valid_hex(codepoint) ? - hex_val := strconv.parse_int(codepoint, 16, 0) - strwr.write_b(byte(hex_val)) - i += 5 - continue - } else { - return error('incomplete unicode escape.') - } - } - `\\` { - i++ - strwr.write_b(`\\`) - continue - } - `"` { - i++ - strwr.write_b(`\"`) - continue - } - `/` { - i++ - strwr.write_b(`/`) - continue - } - else { return error('invalid backslash escape.') } - } - if int(peek) == 85 { - return error('unicode endpoints must be in lowercase `u`.') - } - if int(peek) in [9, 229] { - return error('unicode endpoint not allowed.') - } - } - strwr.write_b(p.tok.lit[i]) - } - p.next() - defer { - unsafe { strwr.free() } - } - str := strwr.str() - return Any(str) -} - -// now returns string instead of int or float -fn (mut p Parser) decode_number() ?Any { - src := p.scanner.text - mut tl := p.tok.lit - mut is_fl := false - sep_by_dot := tl.to_lower().split('.') - if tl.starts_with('0x') && tl.all_after('0x').len <= 2 { - return error('hex numbers should not be less than or equal to two digits.') - } - if src[p.p_tok.pos + p.p_tok.len] == `0` && src[p.p_tok.pos + p.p_tok.len + 1].is_digit() { - return error('leading zeroes in integers are not allowed.') - } - if tl.starts_with('.') { - return error('decimals must start with a digit followed by a dot.') - } - if tl.ends_with('+') || tl.ends_with('-') { - return error('exponents must have a digit before the sign.') - } - if sep_by_dot.len > 1 { - // analyze json number structure - // -[digit][dot][digit][E/e][-/+][digit] - // float number - is_fl = true - last := sep_by_dot.last() - if last.starts_with('e') { - return error('exponents must have a digit before the exponent notation.') - } - } - if p.p_tok.kind == .minus && p.tok.pos == p.p_tok.pos + 1 { - tl = '-$tl' - } - p.next() - if p.convert_type { - return if is_fl { - Any(tl.f64()) - } else { - Any(tl.i64()) - } - } - return Any(tl) -} - fn (mut p Parser) decode_array() ?Any { mut items := []Any{} - p.next() + p.next_with_err() ? for p.tok.kind != .rsbr { - if p.tok.kind == .eof { - return error(eof_err) - } item := p.decode_value() ? items << item - if p.tok.kind == .comma && p.n_tok.kind !in [.rsbr, .comma] { - p.next() - continue - } - if p.tok.kind == .rsbr { + if p.tok.kind == .comma { + p.next_with_err() ? + if p.tok.kind == .rsbr || p.tok.kind == .rcbr { + return error(p.emit_error('invalid token `$p.tok.lit')) + } + } else if p.tok.kind == .rsbr { break + } else { + return error(p.emit_error("unknown token '$p.tok.lit' when decoding array.")) } - return error("unknown token '$p.tok.lit' when decoding arrays.") } - p.next() + p.next_with_err() ? return Any(items) } fn (mut p Parser) decode_object() ?Any { mut fields := map[string]Any{} - mut cur_key := '' - p.next() + p.next_with_err() ? for p.tok.kind != .rcbr { - is_key := p.tok.kind == .string && p.n_tok.kind == .colon - // todo - // if p.is_formfeed() { - // return error(formfeed_err) - // } - if p.tok.kind == .eof { - return error(eof_err) - } - if p.is_singlequote() { - return error('object keys must be in single quotes.') - } + is_key := p.tok.kind == .str_ && p.n_tok.kind == .colon if !is_key { - return error("invalid token `$p.tok.lit`, expected \'string\'") + return error(p.emit_error('invalid token `$p.tok.kind`, expecting `str_`')) } - cur_key = p.tok.lit - p.next() - p.next() + cur_key := p.tok.lit.bytestr() + p.next_with_err() ? + p.next_with_err() ? fields[cur_key] = p.decode_value() ? - if p.tok.kind == .comma && p.n_tok.kind !in [.rcbr, .comma] { - p.next() - continue - } else if p.tok.kind == .rcbr { - break + if p.tok.kind == .comma { + p.next_with_err() ? + if p.tok.kind != .str_ { + return error(p.emit_error("unknown token '$p.tok.lit' when decoding object.")) + } } - return error("unknown token '$p.tok.lit' when decoding object.") } - p.next() + p.next_with_err() ? return Any(fields) } diff --git a/vlib/x/json2/decoder_test.v b/vlib/x/json2/decoder_test.v index 5a613edfff..0a99202bbb 100644 --- a/vlib/x/json2/decoder_test.v +++ b/vlib/x/json2/decoder_test.v @@ -54,8 +54,16 @@ fn test_raw_decode_null() { fn test_raw_decode_invalid() { json2.raw_decode('1z') or { - assert err == '[json] invalid JSON. (0:0)' + assert err == '[x.json2] invalid token `z` (0:17)' return } assert false } + +fn test_raw_decode_string_with_dollarsign() { + str := json2.raw_decode(r'"Hello $world"') or { + assert false + json2.Any{} + } + assert str.str() == r'Hello $world' +} diff --git a/vlib/x/json2/encoder.v b/vlib/x/json2/encoder.v index 894f398e96..0476231a03 100644 --- a/vlib/x/json2/encoder.v +++ b/vlib/x/json2/encoder.v @@ -65,19 +65,11 @@ pub fn (f Any) str() string { } f32 { str_f32 := f.str() - return if str_f32.ends_with('.') { - str_f32 + '0' - } else { - str_f32 - } + return if str_f32.ends_with('.') { '${str_f32}0' } else { str_f32 } } f64 { str_f64 := f.str() - return if str_f64.ends_with('.') { - str_f64 + '0' - } else { - str_f64 - } + return if str_f64.ends_with('.') { '${str_f64}0' } else { str_f64 } } bool { return f.str() @@ -85,14 +77,11 @@ pub fn (f Any) str() string { map[string]Any { return f.str() } + []Any { + return f.str() + } Null { return 'null' } - else { - if f is []Any { - return f.str() - } - return '' - } } } diff --git a/vlib/x/json2/json2.v b/vlib/x/json2/json2.v index 622d40bdfa..03c8ebdfcf 100644 --- a/vlib/x/json2/json2.v +++ b/vlib/x/json2/json2.v @@ -48,7 +48,7 @@ pub fn (f Any) as_map() map[string]Any { } return mp } - return { + return map{ '0': f } } diff --git a/vlib/x/json2/json2_test.v b/vlib/x/json2/json2_test.v index 1d10aa21f2..e8893d2300 100644 --- a/vlib/x/json2/json2_test.v +++ b/vlib/x/json2/json2_test.v @@ -48,8 +48,9 @@ fn test_simple() { eprintln('Employee x: $s') assert s == '{"name":"Peter","age":28,"salary":95000.5,"title":2}' y := json2.decode(s) or { + println(err) assert false - Employee{} + return } eprintln('Employee y: $y') assert y.name == 'Peter' @@ -69,17 +70,17 @@ fn test_fast_raw_decode() { } fn test_character_unescape() { - // Need to test `\r`, `\b`, `\f` ?? - message := '{ - "newline":"new\\nline", - "tab":"\\ttab", - "backslash": "back\\\\slash", - "quotes": "\\"quotes\\"", - "slash":"\/dev\/null" - }' + message := r'{ + "newline": "new\nline", + "tab": "\ttab", + "backslash": "back\\slash", + "quotes": "\"quotes\"", + "slash":"\/dev\/null" +}' mut obj := json2.raw_decode(message) or { + println(err) assert false - json2.Any{} + return } lines := obj.as_map() eprintln('$lines') @@ -152,7 +153,7 @@ fn (mut u User) from_json(an json2.Any) { fn (u User) to_json() string { // TODO: derive from field - mut mp := { + mut mp := map{ 'age': json2.Any(u.age) } mp['nums'] = u.nums.map(json2.Any(it)) @@ -166,13 +167,15 @@ fn (u User) to_json() string { fn test_parse_user() { s := '{"age": 10, "nums": [1,2,3], "type": 1, "lastName": "Johnson", "IsRegistered": true, "pet_animals": {"name": "Bob", "animal": "Dog"}}' u2 := json2.decode(s) or { + println(err) assert false - User2{} + return } println(u2) u := json2.decode(s) or { + println(err) assert false - User{} + return } assert u.age == 10 assert u.last_name == 'Johnson' @@ -249,7 +252,7 @@ fn test_struct_in_struct() { */ fn test_encode_map() { expected := '{"one":1,"two":2,"three":3,"four":4}' - numbers := { + numbers := map{ 'one': json2.Any(1) 'two': json2.Any(2) 'three': json2.Any(3) diff --git a/vlib/x/json2/scanner.v b/vlib/x/json2/scanner.v new file mode 100644 index 0000000000..20aad798e8 --- /dev/null +++ b/vlib/x/json2/scanner.v @@ -0,0 +1,288 @@ +// Copyright (c) 2019-2021 Alexander Medvednikov. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module json2 + +import strconv + +struct Scanner { +mut: + text []byte + pos int + line int + col int +} + +enum TokenKind { + none_ + error + str_ + float + int_ + null + bool_ + eof + comma = 44 + colon = 58 + lsbr = 91 + rsbr = 93 + lcbr = 123 + rcbr = 125 +} + +struct Token { + lit []byte + kind TokenKind + line int + col int +} + +const ( + // list of characters commonly used in JSON. + char_list = [`{`, `}`, `[`, `]`, `,`, `:`] + // list of newlines to check when moving to a new position. + newlines = [`\r`, `\n`, byte(9), `\t`] + // list of escapable that needs to be escaped inside a JSON string. + // double quotes and forward slashes are excluded intentionally since + // they have their own separate checks for it in order to pass the + // JSON test suite (https://github.com/nst/JSONTestSuite/). + important_escapable_chars = [byte(9), 10, 0, `\b`, `\f`, `\n`, `\r`, `\t`] + // list of valid unicode escapes aside from \u{4-hex digits} + valid_unicode_escapes = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`] + // used for transforming escapes into valid unicode (eg. n => \n) + unicode_transform_escapes = map{ + 98: `\b` + 102: `\f` + 110: `\n` + 114: `\r` + 116: `\t` + 92: `\\` + 34: `"` + 47: `/` + } + exp_signs = [byte(`-`), `+`] +) + +// move_pos proceeds to the next position. +fn (mut s Scanner) move_pos() { + s.move(true, true) +} + +// move_pos_with_newlines is the same as move_pos but only enables newline checking. +fn (mut s Scanner) move_pos_with_newlines() { + s.move(false, true) +} + +fn (mut s Scanner) move(include_space bool, include_newlines bool) { + s.pos++ + if s.pos < s.text.len { + if include_newlines && s.text[s.pos] in json2.newlines { + s.line++ + s.col = 0 + if s.text[s.pos] == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` { + s.pos++ + } + for s.pos < s.text.len && s.text[s.pos] in json2.newlines { + s.move_pos() + } + } else if include_space && s.text[s.pos] == ` ` { + s.pos++ + s.col++ + for s.pos < s.text.len && s.text[s.pos] == ` ` { + s.move_pos() + } + } + } else { + s.col++ + } +} + +// error returns an error token. +fn (s Scanner) error(description string) Token { + return s.tokenize(description.bytes(), .error) +} + +// tokenize returns a token based on the given lit and kind. +fn (s Scanner) tokenize(lit []byte, kind TokenKind) Token { + return Token{ + lit: lit + kind: kind + col: s.col + line: s.line + } +} + +// text_scan scans and returns a string token. +[manualfree] +fn (mut s Scanner) text_scan() Token { + mut has_closed := false + mut chrs := []byte{} + for { + s.move(false, false) + if s.pos >= s.text.len { + break + } + ch := s.text[s.pos] + if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`) && ch == `"` { + has_closed = true + break + } else if (s.pos - 1 >= 0 && s.text[s.pos - 1] != `\\`) + && ch in json2.important_escapable_chars { + return s.error('character must be escaped with a backslash') + } else if s.pos == s.text.len - 1 && ch == `\\` { + return s.error('invalid backslash escape') + } else if s.pos + 1 < s.text.len && ch == `\\` { + peek := s.text[s.pos + 1] + if peek in json2.valid_unicode_escapes { + chrs << json2.unicode_transform_escapes[int(peek)] + s.move(false, false) + continue + } else if peek == `u` { + if s.pos + 5 < s.text.len { + s.move(false, false) + mut codepoint := []byte{} + codepoint_start := s.pos + for s.pos < s.text.len && s.pos < codepoint_start + 4 { + s.move(false, false) + if s.text[s.pos] == `"` { + break + } else if !s.text[s.pos].is_hex_digit() { + return s.error('`${s.text[s.pos].ascii_str()}` is not a hex digit') + } + codepoint << s.text[s.pos] + } + if codepoint.len != 4 { + return s.error('unicode escape must have 4 hex digits') + } + chrs << byte(strconv.parse_uint(codepoint.bytestr(), 16, 32)) + unsafe { codepoint.free() } + continue + } else { + return s.error('incomplete unicode escape') + } + } else if peek == `U` { + return s.error('unicode endpoints must be in lowercase `u`') + } else if peek == byte(229) { + return s.error('unicode endpoint not allowed') + } else { + return s.error('invalid backslash escape') + } + } + chrs << ch + } + tok := s.tokenize(chrs, .str_) + s.move_pos() + if !has_closed { + return s.error('missing double quotes in string closing') + } + return tok +} + +// num_scan scans and returns an int/float token. +fn (mut s Scanner) num_scan() Token { + // analyze json number structure + // -[digit][?[dot][digit]][?[E/e][?-/+][digit]] + mut is_fl := false + mut dot_index := -1 + mut digits := []byte{} + if s.text[s.pos] == `-` { + digits << `-` + if !s.text[s.pos + 1].is_digit() { + return s.invalid_token() + } + s.move_pos_with_newlines() + } + if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) { + return s.error('leading zeroes in a number are not allowed') + } + for s.pos < s.text.len && (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) { + digits << s.text[s.pos] + if s.text[s.pos] == `.` { + is_fl = true + dot_index = digits.len - 1 + } + s.move_pos_with_newlines() + } + if dot_index + 1 < s.text.len && digits[dot_index + 1..].len == 0 { + return s.error('invalid float') + } + if s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) { + digits << s.text[s.pos] + s.move_pos_with_newlines() + if s.pos < s.text.len && s.text[s.pos] in json2.exp_signs { + digits << s.text[s.pos] + s.move_pos_with_newlines() + } + mut exp_digits_count := 0 + for s.pos < s.text.len && s.text[s.pos].is_digit() { + digits << s.text[s.pos] + exp_digits_count++ + s.move_pos_with_newlines() + } + if exp_digits_count == 0 { + return s.error('invalid exponent') + } + } + kind := if is_fl { TokenKind.float } else { TokenKind.int_ } + return s.tokenize(digits, kind) +} + +// invalid_token returns an error token with the invalid token message. +fn (s Scanner) invalid_token() Token { + return s.error('invalid token `${s.text[s.pos].ascii_str()}`') +} + +// scan returns a token based on the scanner's current position. +[manualfree] +fn (mut s Scanner) scan() Token { + for s.pos < s.text.len && s.text[s.pos] == ` ` { + s.pos++ + } + if s.pos >= s.text.len { + return s.tokenize([]byte{}, .eof) + } else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) { + ident := s.text[s.pos..s.pos + 4].bytestr() + if ident == 'true' || ident == 'null' { + mut kind := TokenKind.null + if ident == 'true' { + kind = .bool_ + } + unsafe { ident.free() } + val := s.text[s.pos..s.pos + 4] + tok := s.tokenize(val, kind) + s.move_pos() + s.move_pos() + s.move_pos() + s.move_pos() + return tok + } + unsafe { ident.free() } + return s.invalid_token() + } else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` { + ident := s.text[s.pos..s.pos + 5].bytestr() + if ident == 'false' { + unsafe { ident.free() } + val := s.text[s.pos..s.pos + 5] + tok := s.tokenize(val, .bool_) + s.move_pos() + s.move_pos() + s.move_pos() + s.move_pos() + s.move_pos() + return tok + } + unsafe { ident.free() } + return s.invalid_token() + } else if s.text[s.pos] in json2.char_list { + chr := s.text[s.pos] + tok := s.tokenize([]byte{}, TokenKind(int(chr))) + s.move_pos() + return tok + } else if s.text[s.pos] == `"` { + return s.text_scan() + } else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` { + return s.num_scan() + } else { + return s.invalid_token() + } +} diff --git a/vlib/x/json2/scanner_test.v b/vlib/x/json2/scanner_test.v new file mode 100644 index 0000000000..5e1686b760 --- /dev/null +++ b/vlib/x/json2/scanner_test.v @@ -0,0 +1,320 @@ +module json2 + +fn test_str() { + mut sc := Scanner{ + text: '"test"'.bytes() + } + tok := sc.scan() + assert tok.kind == .str_ + assert tok.lit.len == 4 + assert tok.lit.bytestr() == 'test' +} + +fn test_str_valid_unicode_escape() { + mut sc := Scanner{ + text: r'"\u0048"'.bytes() + } + tok := sc.scan() + assert tok.kind == .str_ + assert tok.lit.len == 1 + assert tok.lit.bytestr() == 'H' +} + +fn test_str_invalid_escape() { + mut sc := Scanner{ + text: r'"\z"'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'invalid backslash escape' +} + +fn test_str_invalid_must_be_escape() { + for char in important_escapable_chars { + mut sc := Scanner{ + text: [byte(`"`), `t`, char, `"`] + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'character must be escaped with a backslash' + } +} + +fn test_str_invalid_unicode_escape() { + mut sc := Scanner{ + text: r'"\u010G"'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == '`G` is not a hex digit' +} + +fn test_str_invalid_unicode_escape_len() { + mut sc := Scanner{ + text: r'"\u001"'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'unicode escape must have 4 hex digits' +} + +fn test_str_invalid_uppercase_u() { + mut sc := Scanner{ + text: r'"\U0000"'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'unicode endpoints must be in lowercase `u`' +} + +fn test_str_missing_closing_bracket() { + mut sc := Scanner{ + text: '"incomplete'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'missing double quotes in string closing' +} + +fn test_int() { + mut sc := Scanner{ + text: '10'.bytes() + } + tok := sc.scan() + assert tok.kind == .int_ + assert tok.lit.len == 2 + assert tok.lit.bytestr() == '10' +} + +fn test_int_negative() { + mut sc := Scanner{ + text: '-10'.bytes() + } + tok := sc.scan() + assert tok.kind == .int_ + assert tok.lit.len == 3 + assert tok.lit.bytestr() == '-10' +} + +fn test_float() { + mut sc := Scanner{ + text: '123.400'.bytes() + } + tok := sc.scan() + assert tok.kind == .float + assert tok.lit.len == 7 + assert tok.lit.bytestr() == '123.400' +} + +fn test_float_negative() { + mut sc := Scanner{ + text: '-123.400'.bytes() + } + tok := sc.scan() + assert tok.kind == .float + assert tok.lit.len == 8 + assert tok.lit.bytestr() == '-123.400' +} + +fn test_int_exp() { + mut sc := Scanner{ + text: '1E22'.bytes() + } + tok := sc.scan() + assert tok.kind == .int_ + assert tok.lit.len == 4 + assert tok.lit.bytestr() == '1E22' +} + +fn test_int_exp_negative() { + mut sc := Scanner{ + text: '1E-2'.bytes() + } + tok := sc.scan() + assert tok.kind == .int_ + assert tok.lit.len == 4 + assert tok.lit.bytestr() == '1E-2' +} + +fn test_int_exp_positive() { + mut sc := Scanner{ + text: '1E+2'.bytes() + } + tok := sc.scan() + assert tok.kind == .int_ + assert tok.lit.len == 4 + assert tok.lit.bytestr() == '1E+2' +} + +fn test_float_exp() { + mut sc := Scanner{ + text: '123.456e78'.bytes() + } + tok := sc.scan() + assert tok.kind == .float + assert tok.lit.len == 10 + assert tok.lit.bytestr() == '123.456e78' +} + +fn test_float_exp_negative() { + mut sc := Scanner{ + text: '20.56e-5'.bytes() + } + tok := sc.scan() + assert tok.kind == .float + assert tok.lit.len == 8 + assert tok.lit.bytestr() == '20.56e-5' +} + +fn test_float_exp_positive() { + mut sc := Scanner{ + text: '20.56e+5'.bytes() + } + tok := sc.scan() + assert tok.kind == .float + assert tok.lit.len == 8 + assert tok.lit.bytestr() == '20.56e+5' +} + +fn test_number_with_space() { + mut sc := Scanner{ + text: ' 4'.bytes() + } + tok := sc.scan() + assert tok.kind == .int_ + assert tok.lit.len == 1 + assert tok.lit.bytestr() == '4' +} + +fn test_number_invalid_leading_zero() { + mut sc := Scanner{ + text: '0010'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'leading zeroes in a number are not allowed' +} + +fn test_number_invalid_leading_zero_negative() { + mut sc := Scanner{ + text: '-0010'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'leading zeroes in a number are not allowed' +} + +fn test_number_invalid_start_char() { + mut sc := Scanner{ + text: '+1'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'invalid token `+`' +} + +fn test_number_invalid_char() { + mut sc := Scanner{ + text: '122x'.bytes() + } + sc.scan() + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'invalid token `x`' +} + +fn test_number_invalid_char_float() { + mut sc := Scanner{ + text: '122x.1'.bytes() + } + sc.scan() + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'invalid token `x`' +} + +fn test_number_invalid_multiple_dot() { + mut sc := Scanner{ + text: '122.108.10'.bytes() + } + sc.scan() + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'invalid token `.`' +} + +fn test_number_invalid_exp() { + mut sc := Scanner{ + text: '0.3e'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'invalid exponent' +} + +fn test_number_invalid_exp_with_sign() { + mut sc := Scanner{ + text: '0.3e+'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'invalid exponent' +} + +fn test_number_invalid_zero_exp() { + mut sc := Scanner{ + text: '0e'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'invalid exponent' +} + +fn test_number_invalid_dot_exp() { + mut sc := Scanner{ + text: '0.e'.bytes() + } + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'invalid float' +} + +fn test_number_invalid_double_exp() { + mut sc := Scanner{ + text: '2eE'.bytes() + } + sc.scan() + tok := sc.scan() + assert tok.kind == .error + assert tok.lit.bytestr() == 'invalid token `E`' +} + +fn test_null() { + mut sc := Scanner{ + text: 'null'.bytes() + } + tok := sc.scan() + assert tok.kind == .null + assert tok.lit.len == 4 + assert tok.lit.bytestr() == 'null' +} + +fn test_bool_true() { + mut sc := Scanner{ + text: 'true'.bytes() + } + tok := sc.scan() + assert tok.kind == .bool_ + assert tok.lit.len == 4 + assert tok.lit.bytestr() == 'true' +} + +fn test_bool_false() { + mut sc := Scanner{ + text: 'false'.bytes() + } + tok := sc.scan() + assert tok.kind == .bool_ + assert tok.lit.len == 5 + assert tok.lit.bytestr() == 'false' +}