x.json2: create custom scanner for scanning JSON (#8716)

2023-08-10 21:13:21 +03:00 · 2021-02-26 14:36:02 +08:00
parent 7bee3dc489
commit 8dff168e01
8 changed files with 730 additions and 372 deletions
--- a/vlib/x/json2/any_test.v
+++ b/vlib/x/json2/any_test.v
@@ -1,16 +1,16 @@
 import x.json2

 const (
-	sample_data = {
-		'int': json2.Any(int(1))
-		'i64': json2.Any(i64(128))
-		'f32': json2.Any(f32(2.0))
-		'f64': json2.Any(f64(1.283))
+	sample_data = map{
+		'int':  json2.Any(int(1))
+		'i64':  json2.Any(i64(128))
+		'f32':  json2.Any(f32(2.0))
+		'f64':  json2.Any(f64(1.283))
 		'bool': json2.Any(false)
-		'str': json2.Any('test')
+		'str':  json2.Any('test')
 		'null': json2.Any(json2.null)
-		'arr': json2.Any([json2.Any('lol')])
-		'obj': json2.Any({
+		'arr':  json2.Any([json2.Any('lol')])
+		'obj':  json2.Any(map{
 			'foo': json2.Any(10)
 		})
 	}
@@ -126,6 +126,5 @@ fn test_str() {
 	assert sample_data['str'].str() == 'test'
 	assert sample_data['null'].str() == 'null'
 	assert sample_data['arr'].str() == '["lol"]'
-	assert sample_data.str() ==
-		'{"int":1,"i64":128,"f32":2.0,"f64":1.283,"bool":false,"str":"test","null":null,"arr":["lol"],"obj":{"foo":10}}'
+	assert sample_data.str() == '{"int":1,"i64":128,"f32":2.0,"f64":1.283,"bool":false,"str":"test","null":null,"arr":["lol"],"obj":{"foo":10}}'
 }
--- a/vlib/x/json2/decoder.v
+++ b/vlib/x/json2/decoder.v
@@ -3,42 +3,20 @@
 // that can be found in the LICENSE file.
 module json2

-import strings
-import strconv
-import v.scanner
-import v.token
-import v.util
-import v.pref
-
 // `Any` is a sum type that lists the possible types to be decoded and used.
+// In this file the decoder builds `bool`, `i64`, `f64` and `string` variants
+// from scalar tokens, `[]Any` from arrays and `map[string]Any` from objects.
-pub type Any = string | int | i64 | f32 | f64 | bool | Null | []Any | map[string]Any
+pub type Any = Null | []Any | bool | f32 | f64 | i64 | int | map[string]Any | string

 // `Null` struct is a simple representation of the `null` value in JSON.
+// It has a single field, `is_null`, which defaults to `true`.
 pub struct Null {
+	is_null bool = true
 }

-enum ParseMode {
-	array
-	bool
-	invalid
-	null
-	number
-	object
-	string
-}
-
-const (
-	formfeed_err = 'formfeed not allowed.'
-	eof_err      = 'reached eof. data not closed properly.'
-)
-
+// Parser holds the decoder state.
+// - scanner: token source; `next` stores the result of `scanner.scan()` in `n_tok`.
+// - p_tok, tok, n_tok: the token window inspected by the decode functions
+//   (presumably previous, current and lookahead token - confirm against `next`).
+// - n_level: nesting counter that `decode_value` compares against the limit of 500.
+// - convert_type: when true, scalars are converted to typed values; when false
+//   `decode_value` returns them as strings.
 struct Parser {
 mut:
-	scanner      &scanner.Scanner
-	p_tok        token.Token
-	tok          token.Token
-	n_tok        token.Token
-	mode         ParseMode = .invalid
+	scanner      &Scanner
+	p_tok        Token
+	tok          Token
+	n_tok        Token
 	n_level      int
 	convert_type bool = true
 }
@@ -49,131 +27,63 @@ fn (mut p Parser) next() {
 	p.n_tok = p.scanner.scan()
 }

-fn (p Parser) emit_error(msg string) string {
-	source := p.scanner.text
-	cur := p.tok
-	mut pp := util.imax(0, util.imin(source.len - 1, cur.pos))
-	if source.len > 0 {
-		for pp >= 0 {
-			if source[pp] !in [`\r`, `\n`] {
-				pp--
-				continue
-			}
-			break
-		}
+// next_with_err advances the parser by one token and returns an error when the
+// new current token has kind `.error`. The scanner stores the error description
+// in the token's `lit`, which is used as the error message here.
+fn (mut p Parser) next_with_err() ? {
+	p.next()
+	if p.tok.kind == .error {
+		return error(p.emit_error(p.tok.lit.bytestr()))
 	}
-	column := util.imax(0, cur.pos - pp + cur.len - 1)
-	line := cur.line_nr
-	return '[json] $msg ($line:$column)'
 }

-fn new_parser(srce string, convert_type bool) Parser {
-	mut src := srce
-	// from v/util/util.v
-	if src.len >= 3 {
-		c_text := src.str
+// emit_error formats `msg` as `[x.json2] msg (line:column)` using the position
+// of the current token. The column is the token's `col` plus the length of its
+// `lit`.
+// NOTE(review): for `.error` tokens `lit` holds the error description, so the
+// reported column includes the length of that message.
+fn (p Parser) emit_error(msg string) string {
+	line := p.tok.line
+	column := p.tok.col + p.tok.lit.len
+	return '[x.json2] $msg ($line:$column)'
+}
+
+// TODO: copied from v.util to avoid the entire module and its functions
+// from being imported. remove later once -skip-unused is enabled by default.
+//
+// skip_bom returns `file_content` without a leading UTF-8 byte order mark
+// (bytes 0xEF 0xBB 0xBF). Input shorter than three bytes, or input that does
+// not start with the mark, is returned unchanged.
+fn skip_bom(file_content string) string {
+	mut raw_text := file_content
+	// BOM check
+	if raw_text.len >= 3 {
 		unsafe {
+			c_text := raw_text.str
 			if c_text[0] == 0xEF && c_text[1] == 0xBB && c_text[2] == 0xBF {
 				// skip three BOM bytes
 				offset_from_begin := 3
-				src = tos(c_text[offset_from_begin], vstrlen(c_text) - offset_from_begin)
+				raw_text = tos(c_text[offset_from_begin], vstrlen(c_text) - offset_from_begin)
 			}
 		}
 	}
+	return raw_text
+}
+
+// new_parser creates a Parser for `srce`. A leading byte order mark is removed
+// with `skip_bom` and the remaining text is handed to a fresh Scanner as bytes.
+// `convert_type` is stored on the parser unchanged.
+fn new_parser(srce string, convert_type bool) Parser {
+	src := skip_bom(srce)
 	return Parser{
-		scanner: scanner.new_scanner(src, .parse_comments, &pref.Preferences{output_mode: .silent})
+		scanner: &Scanner{
+			text: src.bytes()
+		}
 		convert_type: convert_type
 	}
 }

-fn check_valid_hex(str string) ? {
-	if str.len != 4 {
-		return error('hex string must be 4 characters.')
-	}
-	for l in str {
-		if l.is_hex_digit() {
-			continue
-		}
-		return error('provided string is not a hex digit.')
-	}
-}
-
+// decode parses the scanner's input as a single JSON value.
+// It advances twice to fill the token window (the second advance reports a
+// scanner error in the first token), decodes one value, and returns an error
+// unless the token that follows the value is `.eof`.
 fn (mut p Parser) decode() ?Any {
-	p.detect_parse_mode()
-	if p.mode == .invalid {
-		return error(p.emit_error('invalid JSON.'))
-	}
-	fi := p.decode_value() or {
-		return error(p.emit_error(err))
-	}
+	p.next()
+	p.next_with_err() ?
+	fi := p.decode_value() ?
 	if p.tok.kind != .eof {
-		return error(p.emit_error('unknown token `$p.tok.kind`.'))
+		return error(p.emit_error('invalid token `$p.tok.kind`'))
 	}
 	return fi
 }

-fn (p Parser) is_formfeed() bool {
-	prev_tok_pos := p.p_tok.pos + p.p_tok.len - 2
-	if prev_tok_pos < p.scanner.text.len && p.scanner.text[prev_tok_pos] == 0x0c {
-		return true
-	}
-	return false
-}
-
-fn (p Parser) is_singlequote() bool {
-	src := p.scanner.text
-	prev_tok_pos := p.p_tok.pos + p.p_tok.len
-	return src[prev_tok_pos] == `\'`
-}
-
-fn (mut p Parser) detect_parse_mode() {
-	src := p.scanner.text
-	if src.len > 1 && src[0].is_digit() && !src[1].is_digit() {
-		p.mode = .invalid
-		return
-	}
-	p.tok = p.scanner.scan()
-	p.n_tok = p.scanner.scan()
-	if src.len == 1 && p.tok.kind == .string && p.n_tok.kind == .eof {
-		p.mode = .invalid
-		return
-	}
-	match p.tok.kind {
-		.lcbr {
-			p.mode = .object
-		}
-		.lsbr {
-			p.mode = .array
-		}
-		.number {
-			p.mode = .number
-		}
-		.key_true, .key_false {
-			p.mode = .bool
-		}
-		.string {
-			p.mode = .string
-		}
-		.name {
-			if p.tok.lit == 'null' {
-				p.mode = .null
-			}
-		}
-		.minus {
-			if p.n_tok.kind == .number {
-				p.mode = .number
-			}
-		}
-		else {}
-	}
-}
-
 fn (mut p Parser) decode_value() ?Any {
 	if p.n_level == 500 {
-		return error('reached maximum nesting level of 500.')
+		return error(p.emit_error('reached maximum nesting level of 500'))
 	}
-	if (p.tok.kind == .lsbr && p.n_tok.kind == .lcbr) ||
-		(p.p_tok.kind == p.tok.kind && p.tok.kind == .lsbr) {
+	if (p.tok.kind == .lsbr && p.n_tok.kind == .lcbr)
+		|| (p.p_tok.kind == p.tok.kind && p.tok.kind == .lsbr) {
 		p.n_level++
 	}
 	match p.tok.kind {
@@ -183,235 +93,76 @@ fn (mut p Parser) decode_value() ?Any {
 		.lcbr {
 			return p.decode_object()
 		}
-		.number {
-			return p.decode_number()
+		.int_, .float {
+			tl := p.tok.lit.bytestr()
+			kind := p.tok.kind
+			p.next_with_err() ?
+			if p.convert_type {
+				return if kind == .float { Any(tl.f64()) } else { Any(tl.i64()) }
+			}
+			return Any(tl)
 		}
-		.key_true {
-			p.next()
-			return if p.convert_type {
-				Any(true)
-			} else {
-				Any('true')
-			}
+		.bool_ {
+			lit := p.tok.lit.bytestr()
+			p.next_with_err() ?
+			return if p.convert_type { Any(lit.bool()) } else { Any(lit) }
 		}
-		.key_false {
-			p.next()
-			return if p.convert_type {
-				Any(false)
-			} else {
-				Any('false')
-			}
+		.null {
+			p.next_with_err() ?
+			return if p.convert_type { Any(null) } else { Any('null') }
 		}
-		.name {
-			if p.tok.lit != 'null' {
-				return error('unknown identifier `$p.tok.lit`')
-			}
-			p.next()
-			return if p.convert_type {
-				Any(Null{})
-			} else {
-				Any('null')
-			}
-		}
-		.string {
-			if p.is_singlequote() {
-				return error('strings must be in double-quotes.')
-			}
-			return p.decode_string()
+		.str_ {
+			str := p.tok.lit.bytestr()
+			p.next_with_err() ?
+			return Any(str)
 		}
 		else {
-			if p.tok.kind == .minus && p.n_tok.kind == .number && p.n_tok.pos == p.tok.pos + 1 {
-				p.next()
-				d_num := p.decode_number() ?
-				return d_num
-			}
-			return error("unknown token '$p.tok.lit' when decoding value")
+			return error(p.emit_error('invalid token `$p.tok.kind`'))
 		}
 	}
-	if p.is_formfeed() {
-		return error(formfeed_err)
-	}
 	return Any{}
 }

-fn (mut p Parser) decode_string() ?Any {
-	mut strwr := strings.new_builder(200)
-	for i := 0; i < p.tok.lit.len; i++ {
-		if ((i - 1 >= 0 && p.tok.lit[i - 1] != `/`) || i == 0) && int(p.tok.lit[i]) in [9, 10, 0] {
-			return error('character must be escaped with a backslash.')
-		}
-		if i == p.tok.lit.len - 1 && p.tok.lit[i] == 92 {
-			return error('invalid backslash escape.')
-		}
-		if i + 1 < p.tok.lit.len && p.tok.lit[i] == 92 {
-			peek := p.tok.lit[i + 1]
-			match peek {
-				`b` {
-					i++
-					strwr.write_b(`\b`)
-					continue
-				}
-				`f` {
-					i++
-					strwr.write_b(`\f`)
-					continue
-				}
-				`n` {
-					i++
-					strwr.write_b(`\n`)
-					continue
-				}
-				`r` {
-					i++
-					strwr.write_b(`\r`)
-					continue
-				}
-				`t` {
-					i++
-					strwr.write_b(`\t`)
-					continue
-				}
-				`u` {
-					if i + 5 < p.tok.lit.len {
-						codepoint := p.tok.lit[i + 2..i + 6]
-						check_valid_hex(codepoint) ?
-						hex_val := strconv.parse_int(codepoint, 16, 0)
-						strwr.write_b(byte(hex_val))
-						i += 5
-						continue
-					} else {
-						return error('incomplete unicode escape.')
-					}
-				}
-				`\\` {
-					i++
-					strwr.write_b(`\\`)
-					continue
-				}
-				`"` {
-					i++
-					strwr.write_b(`\"`)
-					continue
-				}
-				`/` {
-					i++
-					strwr.write_b(`/`)
-					continue
-				}
-				else { return error('invalid backslash escape.') }
-			}
-			if int(peek) == 85 {
-				return error('unicode endpoints must be in lowercase `u`.')
-			}
-			if int(peek) in [9, 229] {
-				return error('unicode endpoint not allowed.')
-			}
-		}
-		strwr.write_b(p.tok.lit[i])
-	}
-	p.next()
-	defer {
-		unsafe { strwr.free() }
-	}
-	str := strwr.str()
-	return Any(str)
-}
-
-// now returns string instead of int or float
-fn (mut p Parser) decode_number() ?Any {
-	src := p.scanner.text
-	mut tl := p.tok.lit
-	mut is_fl := false
-	sep_by_dot := tl.to_lower().split('.')
-	if tl.starts_with('0x') && tl.all_after('0x').len <= 2 {
-		return error('hex numbers should not be less than or equal to two digits.')
-	}
-	if src[p.p_tok.pos + p.p_tok.len] == `0` && src[p.p_tok.pos + p.p_tok.len + 1].is_digit() {
-		return error('leading zeroes in integers are not allowed.')
-	}
-	if tl.starts_with('.') {
-		return error('decimals must start with a digit followed by a dot.')
-	}
-	if tl.ends_with('+') || tl.ends_with('-') {
-		return error('exponents must have a digit before the sign.')
-	}
-	if sep_by_dot.len > 1 {
-		// analyze json number structure
-		// -[digit][dot][digit][E/e][-/+][digit]
-		// float number
-		is_fl = true
-		last := sep_by_dot.last()
-		if last.starts_with('e') {
-			return error('exponents must have a digit before the exponent notation.')
-		}
-	}
-	if p.p_tok.kind == .minus && p.tok.pos == p.p_tok.pos + 1 {
-		tl = '-$tl'
-	}
-	p.next()
-	if p.convert_type {
-		return if is_fl {
-			Any(tl.f64())
-		} else {
-			Any(tl.i64())
-		}
-	}
-	return Any(tl)
-}
-
 fn (mut p Parser) decode_array() ?Any {
 	mut items := []Any{}
-	p.next()
+	p.next_with_err() ?
 	for p.tok.kind != .rsbr {
-		if p.tok.kind == .eof {
-			return error(eof_err)
-		}
 		item := p.decode_value() ?
 		items << item
-		if p.tok.kind == .comma && p.n_tok.kind !in [.rsbr, .comma] {
-			p.next()
-			continue
-		}
-		if p.tok.kind == .rsbr {
+		if p.tok.kind == .comma {
+			p.next_with_err() ?
+			if p.tok.kind == .rsbr || p.tok.kind == .rcbr {
+				return error(p.emit_error('invalid token `$p.tok.lit'))
+			}
+		} else if p.tok.kind == .rsbr {
 			break
+		} else {
+			return error(p.emit_error("unknown token '$p.tok.lit' when decoding array."))
 		}
-		return error("unknown token '$p.tok.lit' when decoding arrays.")
 	}
-	p.next()
+	p.next_with_err() ?
 	return Any(items)
 }

 fn (mut p Parser) decode_object() ?Any {
 	mut fields := map[string]Any{}
-	mut cur_key := ''
-	p.next()
+	p.next_with_err() ?
 	for p.tok.kind != .rcbr {
-		is_key := p.tok.kind == .string && p.n_tok.kind == .colon
-		// todo
-		// if p.is_formfeed() {
-		// return error(formfeed_err)
-		// }
-		if p.tok.kind == .eof {
-			return error(eof_err)
-		}
-		if p.is_singlequote() {
-			return error('object keys must be in single quotes.')
-		}
+		is_key := p.tok.kind == .str_ && p.n_tok.kind == .colon
 		if !is_key {
-			return error("invalid token `$p.tok.lit`, expected \'string\'")
+			return error(p.emit_error('invalid token `$p.tok.kind`, expecting `str_`'))
 		}
-		cur_key = p.tok.lit
-		p.next()
-		p.next()
+		cur_key := p.tok.lit.bytestr()
+		p.next_with_err() ?
+		p.next_with_err() ?
 		fields[cur_key] = p.decode_value() ?
-		if p.tok.kind == .comma && p.n_tok.kind !in [.rcbr, .comma] {
-			p.next()
-			continue
-		} else if p.tok.kind == .rcbr {
-			break
+		if p.tok.kind == .comma {
+			p.next_with_err() ?
+			if p.tok.kind != .str_ {
+				return error(p.emit_error("unknown token '$p.tok.lit' when decoding object."))
+			}
 		}
-		return error("unknown token '$p.tok.lit' when decoding object.")
 	}
-	p.next()
+	p.next_with_err() ?
 	return Any(fields)
 }
--- a/vlib/x/json2/decoder_test.v
+++ b/vlib/x/json2/decoder_test.v
@@ -54,8 +54,16 @@ fn test_raw_decode_null() {

+// test_raw_decode_invalid checks that decoding `1z` fails with the scanner's
+// invalid-token message for `z` and the position `(0:17)`.
 fn test_raw_decode_invalid() {
 	json2.raw_decode('1z') or {
-		assert err == '[json] invalid JSON. (0:0)'
+		assert err == '[x.json2] invalid token `z` (0:17)'
 		return
 	}
 	assert false
 }
+
+// test_raw_decode_string_with_dollarsign checks that a `$` inside a JSON string
+// is decoded literally.
+fn test_raw_decode_string_with_dollarsign() {
+	str := json2.raw_decode(r'"Hello $world"') or {
+		assert false
+		json2.Any{}
+	}
+	assert str.str() == r'Hello $world'
+}
--- a/vlib/x/json2/encoder.v
+++ b/vlib/x/json2/encoder.v
@@ -65,19 +65,11 @@ pub fn (f Any) str() string {
 		}
 		f32 {
 			str_f32 := f.str()
-			return if str_f32.ends_with('.') {
-				str_f32 + '0'
-			} else {
-				str_f32
-			}
+			return if str_f32.ends_with('.') { '${str_f32}0' } else { str_f32 }
 		}
 		f64 {
 			str_f64 := f.str()
-			return if str_f64.ends_with('.') {
-				str_f64 + '0'
-			} else {
-				str_f64
-			}
+			return if str_f64.ends_with('.') { '${str_f64}0' } else { str_f64 }
 		}
 		bool {
 			return f.str()
@@ -85,14 +77,11 @@ pub fn (f Any) str() string {
 		map[string]Any {
 			return f.str()
 		}
+		[]Any {
+			return f.str()
+		}
 		Null {
 			return 'null'
 		}
-		else {
-			if f is []Any {
-				return f.str()
-			}
-			return ''
-		}
 	}
 }
--- a/vlib/x/json2/json2.v
+++ b/vlib/x/json2/json2.v
@@ -48,7 +48,7 @@ pub fn (f Any) as_map() map[string]Any {
 		}
 		return mp
 	}
-	return {
+	return map{
 		'0': f
 	}
 }
--- a/vlib/x/json2/json2_test.v
+++ b/vlib/x/json2/json2_test.v
@@ -48,8 +48,9 @@ fn test_simple() {
 	eprintln('Employee x: $s')
 	assert s == '{"name":"Peter","age":28,"salary":95000.5,"title":2}'
 	y := json2.decode<Employee>(s) or {
+		println(err)
 		assert false
-		Employee{}
+		return
 	}
 	eprintln('Employee y: $y')
 	assert y.name == 'Peter'
@@ -69,17 +70,17 @@ fn test_fast_raw_decode() {
 }

 fn test_character_unescape() {
-	// Need to test `\r`, `\b`, `\f` ??
-	message := '{
-		"newline":"new\\nline",
-		"tab":"\\ttab",
-		"backslash": "back\\\\slash",
-		"quotes": "\\"quotes\\"",
-		"slash":"\/dev\/null"
-	}'
+	message := r'{
+	"newline": "new\nline",
+	"tab": "\ttab",
+	"backslash": "back\\slash",
+	"quotes": "\"quotes\"",
+	"slash":"\/dev\/null"
+}'
 	mut obj := json2.raw_decode(message) or {
+		println(err)
 		assert false
-		json2.Any{}
+		return
 	}
 	lines := obj.as_map()
 	eprintln('$lines')
@@ -152,7 +153,7 @@ fn (mut u User) from_json(an json2.Any) {

 fn (u User) to_json() string {
 	// TODO: derive from field
-	mut mp := {
+	mut mp := map{
 		'age': json2.Any(u.age)
 	}
 	mp['nums'] = u.nums.map(json2.Any(it))
@@ -166,13 +167,15 @@ fn (u User) to_json() string {
 fn test_parse_user() {
 	s := '{"age": 10, "nums": [1,2,3], "type": 1, "lastName": "Johnson", "IsRegistered": true, "pet_animals": {"name": "Bob", "animal": "Dog"}}'
 	u2 := json2.decode<User2>(s) or {
+		println(err)
 		assert false
-		User2{}
+		return
 	}
 	println(u2)
 	u := json2.decode<User>(s) or {
+		println(err)
 		assert false
-		User{}
+		return
 	}
 	assert u.age == 10
 	assert u.last_name == 'Johnson'
@@ -249,7 +252,7 @@ fn test_struct_in_struct() {
 */
 fn test_encode_map() {
 	expected := '{"one":1,"two":2,"three":3,"four":4}'
-	numbers := {
+	numbers := map{
 		'one':   json2.Any(1)
 		'two':   json2.Any(2)
 		'three': json2.Any(3)
--- a/vlib/x/json2/scanner.v
+++ b/vlib/x/json2/scanner.v
@@ -0,0 +1,288 @@
+// Copyright (c) 2019-2021 Alexander Medvednikov. All rights reserved.
+// Use of this source code is governed by an MIT license
+// that can be found in the LICENSE file.
+module json2
+
+import strconv
+
+// Scanner walks over the bytes of a JSON text and produces tokens.
+// - text: the input bytes.
+// - pos:  index into `text` of the byte currently being examined.
+// - line: counter incremented by `move` when it meets a byte from `newlines`.
+// - col:  counter reset to 0 on such a byte and incremented by `move` when it
+//         skips a space or steps past the end of the input.
+struct Scanner {
+mut:
+	text []byte
+	pos  int
+	line int
+	col  int
+}
+
+// TokenKind enumerates the kinds of token the scanner can return.
+// The punctuation kinds are given the ASCII code of their character
+// (`,` = 44, `:` = 58, `[` = 91, `]` = 93, `{` = 123, `}` = 125) so that
+// `scan` can build the kind directly from the byte with `TokenKind(int(chr))`.
+enum TokenKind {
+	none_
+	error
+	str_
+	float
+	int_
+	null
+	bool_
+	eof
+	comma = 44
+	colon = 58
+	lsbr = 91
+	rsbr = 93
+	lcbr = 123
+	rcbr = 125
+}
+
+// Token is a single lexical unit returned by the scanner.
+// - lit:  the token's bytes: unescaped string content, the number text, the
+//         keyword, empty for punctuation and `.eof`, or the error description
+//         for `.error` tokens.
+// - kind: the token kind.
+// - line, col: the scanner's `line` and `col` at the time `tokenize` was called.
+struct Token {
+	lit  []byte
+	kind TokenKind
+	line int
+	col  int
+}
+
+const (
+	// structural characters of JSON; `scan` turns each of them into a token
+	// whose kind is the character's ASCII code.
+	char_list                 = [`{`, `}`, `[`, `]`, `,`, `:`]
+	// bytes that `move` treats as a line break when newline checking is enabled.
+	// NOTE(review): byte(9) and `\t` are the same byte (horizontal tab), so the
+	// tab is listed twice and also counts as a line break in `move`.
+	newlines                  = [`\r`, `\n`, byte(9), `\t`]
+	// bytes that must not appear unescaped inside a JSON string.
+	// double quotes and forward slashes are excluded intentionally since
+	// they have their own separate checks for it in order to pass the
+	// JSON test suite (https://github.com/nst/JSONTestSuite/).
+	// byte(9) and 10 duplicate `\t` and `\n` further down the list.
+	important_escapable_chars = [byte(9), 10, 0, `\b`, `\f`, `\n`, `\r`, `\t`]
+	// characters accepted directly after a backslash; the `\u` + 4 hex digits
+	// form is handled separately in `text_scan`.
+	valid_unicode_escapes     = [`b`, `f`, `n`, `r`, `t`, `\\`, `"`, `/`]
+	// maps the byte value of an escape character to the byte it stands for
+	// (eg. 110 (`n`) => `\n`).
+	unicode_transform_escapes = map{
+		98:  `\b`
+		102: `\f`
+		110: `\n`
+		114: `\r`
+		116: `\t`
+		92:  `\\`
+		34:  `"`
+		47:  `/`
+	}
+	// signs allowed directly after the `e`/`E` of an exponent.
+	exp_signs                 = [byte(`-`), `+`]
+)
+
+// move_pos proceeds to the next position and additionally skips a run of
+// spaces or line-break bytes found there (both checks of `move` enabled).
+fn (mut s Scanner) move_pos() {
+	s.move(true, true)
+}
+
+// move_pos_with_newlines is the same as move_pos but only enables newline
+// checking: a space at the new position is left in place.
+fn (mut s Scanner) move_pos_with_newlines() {
+	s.move(false, true)
+}
+
+// move advances `pos` by one byte and then optionally skips whitespace found
+// at the new position.
+// - include_newlines: if the new byte is in `newlines`, `line` is incremented,
+//   `col` is reset, a `\r\n` pair is stepped over as one, and the remaining
+//   run is skipped through `move_pos`.
+// - include_space: otherwise, if the new byte is a space, it and the spaces
+//   after it are skipped.
+// When the new position is past the end of the input only `col` is incremented.
+fn (mut s Scanner) move(include_space bool, include_newlines bool) {
+	s.pos++
+	if s.pos < s.text.len {
+		if include_newlines && s.text[s.pos] in json2.newlines {
+			s.line++
+			s.col = 0
+			if s.text[s.pos] == `\r` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n` {
+				s.pos++
+			}
+			// move_pos recurses into move, so further line breaks are counted there
+			for s.pos < s.text.len && s.text[s.pos] in json2.newlines {
+				s.move_pos()
+			}
+		} else if include_space && s.text[s.pos] == ` ` {
+			// NOTE(review): this step does not look at the byte after the space;
+			// a line break that directly follows is not skipped by this call
+			s.pos++
+			s.col++
+			for s.pos < s.text.len && s.text[s.pos] == ` ` {
+				s.move_pos()
+			}
+		}
+	} else {
+		s.col++
+	}
+}
+
+// error returns an error token whose `lit` holds the bytes of `description`
+// and whose position is the scanner's current line and column.
+fn (s Scanner) error(description string) Token {
+	return s.tokenize(description.bytes(), .error)
+}
+
+// tokenize returns a token based on the given lit and kind, stamped with the
+// scanner's current `line` and `col`.
+fn (s Scanner) tokenize(lit []byte, kind TokenKind) Token {
+	return Token{
+		lit: lit
+		kind: kind
+		col: s.col
+		line: s.line
+	}
+}
+
+// text_scan scans and returns a string token.
+// It is entered with `pos` on the opening double quote and collects the
+// unescaped content up to the closing quote. An `.error` token is returned for
+// an unescaped control character, an invalid or incomplete escape, or a
+// missing closing quote.
+//
+// Every escape sequence is consumed completely inside the backslash branch, so
+// a `"` or control character seen at the top of the loop is never part of an
+// escape. Checking the previous byte for a backslash instead would misread a
+// string that ends in an escaped backslash (`"a\\"`) as unterminated.
+//
+// `\uXXXX` escapes are encoded as UTF-8. Surrogate pairs are not combined;
+// each half is encoded on its own.
+[manualfree]
+fn (mut s Scanner) text_scan() Token {
+	mut has_closed := false
+	mut chrs := []byte{}
+	for {
+		s.move(false, false)
+		if s.pos >= s.text.len {
+			break
+		}
+		ch := s.text[s.pos]
+		if ch == `"` {
+			has_closed = true
+			break
+		} else if ch in json2.important_escapable_chars {
+			return s.error('character must be escaped with a backslash')
+		} else if s.pos == s.text.len - 1 && ch == `\\` {
+			return s.error('invalid backslash escape')
+		} else if s.pos + 1 < s.text.len && ch == `\\` {
+			peek := s.text[s.pos + 1]
+			if peek in json2.valid_unicode_escapes {
+				chrs << json2.unicode_transform_escapes[int(peek)]
+				// step onto the escape character so the next iteration starts after it
+				s.move(false, false)
+				continue
+			} else if peek == `u` {
+				if s.pos + 5 < s.text.len {
+					s.move(false, false)
+					mut codepoint := []byte{}
+					codepoint_start := s.pos
+					for s.pos < s.text.len && s.pos < codepoint_start + 4 {
+						s.move(false, false)
+						if s.text[s.pos] == `"` {
+							break
+						} else if !s.text[s.pos].is_hex_digit() {
+							return s.error('`${s.text[s.pos].ascii_str()}` is not a hex digit')
+						}
+						codepoint << s.text[s.pos]
+					}
+					if codepoint.len != 4 {
+						return s.error('unicode escape must have 4 hex digits')
+					}
+					// encode the code point as UTF-8 instead of truncating it to one byte
+					cp_str := utf32_to_str(u32(strconv.parse_uint(codepoint.bytestr(), 16, 32)))
+					chrs << cp_str.bytes()
+					unsafe { codepoint.free() }
+					continue
+				} else {
+					return s.error('incomplete unicode escape')
+				}
+			} else if peek == `U` {
+				return s.error('unicode endpoints must be in lowercase `u`')
+			} else if peek == byte(229) {
+				return s.error('unicode endpoint not allowed')
+			} else {
+				return s.error('invalid backslash escape')
+			}
+		}
+		chrs << ch
+	}
+	// the token is created before moving on so that it carries the position
+	// the scanner had at the end of the string
+	tok := s.tokenize(chrs, .str_)
+	s.move_pos()
+	if !has_closed {
+		return s.error('missing double quotes in string closing')
+	}
+	return tok
+}
+
+// num_scan scans and returns an int/float token.
+// It is entered with `pos` on a digit or on `-`. The token kind is `.float`
+// when the number contains a dot and `.int_` otherwise (an exponent alone does
+// not make it a float). An `.error` token is returned for a `-` that is not
+// followed by a digit, a leading zero, a dot without a following digit, or an
+// exponent without digits.
+// NOTE(review): the position is advanced with newline skipping enabled, so
+// digits separated only by a line break or tab are collected into one number.
+fn (mut s Scanner) num_scan() Token {
+	// analyze json number structure
+	// -[digit][?[dot][digit]][?[E/e][?-/+][digit]]
+	mut is_fl := false
+	mut dot_index := -1
+	mut digits := []byte{}
+	if s.text[s.pos] == `-` {
+		digits << `-`
+		// bounds check first: a lone `-` may be the last byte of the input
+		if s.pos + 1 >= s.text.len || !s.text[s.pos + 1].is_digit() {
+			return s.invalid_token()
+		}
+		s.move_pos_with_newlines()
+	}
+	if s.text[s.pos] == `0` && (s.pos + 1 < s.text.len && s.text[s.pos + 1].is_digit()) {
+		return s.error('leading zeroes in a number are not allowed')
+	}
+	for s.pos < s.text.len && (s.text[s.pos].is_digit() || (!is_fl && s.text[s.pos] == `.`)) {
+		digits << s.text[s.pos]
+		if s.text[s.pos] == `.` {
+			is_fl = true
+			dot_index = digits.len - 1
+		}
+		s.move_pos_with_newlines()
+	}
+	// a dot must be followed by at least one digit; test against the collected
+	// digits so that input ending right after the dot (`1.`) is rejected too
+	if is_fl && dot_index == digits.len - 1 {
+		return s.error('invalid float')
+	}
+	if s.pos < s.text.len && (s.text[s.pos] == `e` || s.text[s.pos] == `E`) {
+		digits << s.text[s.pos]
+		s.move_pos_with_newlines()
+		if s.pos < s.text.len && s.text[s.pos] in json2.exp_signs {
+			digits << s.text[s.pos]
+			s.move_pos_with_newlines()
+		}
+		mut exp_digits_count := 0
+		for s.pos < s.text.len && s.text[s.pos].is_digit() {
+			digits << s.text[s.pos]
+			exp_digits_count++
+			s.move_pos_with_newlines()
+		}
+		if exp_digits_count == 0 {
+			return s.error('invalid exponent')
+		}
+	}
+	kind := if is_fl { TokenKind.float } else { TokenKind.int_ }
+	return s.tokenize(digits, kind)
+}
+
+// invalid_token returns an error token with the invalid token message for the
+// byte at the current position. It indexes `text[pos]` without a bounds check,
+// so `pos` must be inside the input when it is called.
+fn (s Scanner) invalid_token() Token {
+	return s.error('invalid token `${s.text[s.pos].ascii_str()}`')
+}
+
+// scan returns a token based on the scanner's current position.
+// Whitespace in front of the token (space, tab, `\r`, `\n`) is skipped first,
+// so a text that starts with a line break, or a line break that follows
+// trailing spaces, does not end up as an invalid token. After that:
+// - end of input                    -> `.eof`
+// - `true` / `false` / `null`       -> `.bool_` / `.null`
+// - one of `{ } [ ] , :`            -> the kind with that ASCII code
+// - `"`                             -> `text_scan`
+// - a digit or `-`                  -> `num_scan`
+// - anything else                   -> `.error` (invalid token)
+[manualfree]
+fn (mut s Scanner) scan() Token {
+	for s.pos < s.text.len && (s.text[s.pos] == ` ` || s.text[s.pos] in json2.newlines) {
+		// count `\n` and a `\r` that is not part of `\r\n` as one line break each
+		if s.text[s.pos] == `\n`
+			|| (s.text[s.pos] == `\r` && !(s.pos + 1 < s.text.len && s.text[s.pos + 1] == `\n`)) {
+			s.line++
+			s.col = 0
+		}
+		s.pos++
+	}
+	if s.pos >= s.text.len {
+		return s.tokenize([]byte{}, .eof)
+	} else if s.pos + 3 < s.text.len && (s.text[s.pos] == `t` || s.text[s.pos] == `n`) {
+		ident := s.text[s.pos..s.pos + 4].bytestr()
+		if ident == 'true' || ident == 'null' {
+			mut kind := TokenKind.null
+			if ident == 'true' {
+				kind = .bool_
+			}
+			unsafe { ident.free() }
+			val := s.text[s.pos..s.pos + 4]
+			tok := s.tokenize(val, kind)
+			// one move per keyword byte; the last one also skips following whitespace
+			s.move_pos()
+			s.move_pos()
+			s.move_pos()
+			s.move_pos()
+			return tok
+		}
+		unsafe { ident.free() }
+		return s.invalid_token()
+	} else if s.pos + 4 < s.text.len && s.text[s.pos] == `f` {
+		ident := s.text[s.pos..s.pos + 5].bytestr()
+		if ident == 'false' {
+			unsafe { ident.free() }
+			val := s.text[s.pos..s.pos + 5]
+			tok := s.tokenize(val, .bool_)
+			s.move_pos()
+			s.move_pos()
+			s.move_pos()
+			s.move_pos()
+			s.move_pos()
+			return tok
+		}
+		unsafe { ident.free() }
+		return s.invalid_token()
+	} else if s.text[s.pos] in json2.char_list {
+		chr := s.text[s.pos]
+		// the punctuation kinds use the character's ASCII code as enum value
+		tok := s.tokenize([]byte{}, TokenKind(int(chr)))
+		s.move_pos()
+		return tok
+	} else if s.text[s.pos] == `"` {
+		return s.text_scan()
+	} else if s.text[s.pos].is_digit() || s.text[s.pos] == `-` {
+		return s.num_scan()
+	} else {
+		return s.invalid_token()
+	}
+}
--- a/vlib/x/json2/scanner_test.v
+++ b/vlib/x/json2/scanner_test.v
@@ -0,0 +1,320 @@
+module json2
+
+// test_str checks that a plain quoted string is scanned into a `.str_` token
+// holding the text without the quotes.
+fn test_str() {
+	mut sc := Scanner{
+		text: '"test"'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .str_
+	assert tok.lit.len == 4
+	assert tok.lit.bytestr() == 'test'
+}
+
+// test_str_valid_unicode_escape checks that the escape for U+0048 is decoded
+// to the single byte `H`.
+fn test_str_valid_unicode_escape() {
+	mut sc := Scanner{
+		text: r'"\u0048"'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .str_
+	assert tok.lit.len == 1
+	assert tok.lit.bytestr() == 'H'
+}
+
+// test_str_invalid_escape checks that a backslash followed by an unsupported
+// character yields an error token.
+fn test_str_invalid_escape() {
+	mut sc := Scanner{
+		text: r'"\z"'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'invalid backslash escape'
+}
+
+// test_str_invalid_must_be_escape checks that every byte listed in
+// `important_escapable_chars` is rejected when it appears unescaped in a string.
+fn test_str_invalid_must_be_escape() {
+	for char in important_escapable_chars {
+		mut sc := Scanner{
+			text: [byte(`"`), `t`, char, `"`]
+		}
+		tok := sc.scan()
+		assert tok.kind == .error
+		assert tok.lit.bytestr() == 'character must be escaped with a backslash'
+	}
+}
+
+// test_str_invalid_unicode_escape checks that a non-hex character inside a
+// unicode escape is reported by name.
+fn test_str_invalid_unicode_escape() {
+	mut sc := Scanner{
+		text: r'"\u010G"'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == '`G` is not a hex digit'
+}
+
+// test_str_invalid_unicode_escape_len checks that a unicode escape with only
+// three hex digits before the closing quote is rejected.
+fn test_str_invalid_unicode_escape_len() {
+	mut sc := Scanner{
+		text: r'"\u001"'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'unicode escape must have 4 hex digits'
+}
+
+// test_str_invalid_uppercase_u checks that an uppercase `U` after the
+// backslash is rejected with a dedicated message.
+fn test_str_invalid_uppercase_u() {
+	mut sc := Scanner{
+		text: r'"\U0000"'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'unicode endpoints must be in lowercase `u`'
+}
+
+// test_str_missing_closing_bracket checks that a string without its closing
+// double quote yields an error token (despite the name, no bracket is involved).
+fn test_str_missing_closing_bracket() {
+	mut sc := Scanner{
+		text: '"incomplete'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'missing double quotes in string closing'
+}
+
+// test_int checks that a run of digits is scanned into an `.int_` token.
+fn test_int() {
+	mut sc := Scanner{
+		text: '10'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .int_
+	assert tok.lit.len == 2
+	assert tok.lit.bytestr() == '10'
+}
+
+// test_int_negative checks that the minus sign is kept in the literal of an
+// `.int_` token.
+fn test_int_negative() {
+	mut sc := Scanner{
+		text: '-10'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .int_
+	assert tok.lit.len == 3
+	assert tok.lit.bytestr() == '-10'
+}
+
+// test_float checks that a number with a dot is scanned into a `.float` token
+// and that trailing zeroes are kept in the literal.
+fn test_float() {
+	mut sc := Scanner{
+		text: '123.400'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .float
+	assert tok.lit.len == 7
+	assert tok.lit.bytestr() == '123.400'
+}
+
+// test_float_negative checks that a negative number with a dot is scanned into
+// a `.float` token including its sign.
+fn test_float_negative() {
+	mut sc := Scanner{
+		text: '-123.400'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .float
+	assert tok.lit.len == 8
+	assert tok.lit.bytestr() == '-123.400'
+}
+
+// test_int_exp checks that a number with an exponent but no dot stays an
+// `.int_` token and keeps the exponent in its literal.
+fn test_int_exp() {
+	mut sc := Scanner{
+		text: '1E22'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .int_
+	assert tok.lit.len == 4
+	assert tok.lit.bytestr() == '1E22'
+}
+
+// test_int_exp_negative checks that a `-` sign is accepted in the exponent of
+// an `.int_` token.
+fn test_int_exp_negative() {
+	mut sc := Scanner{
+		text: '1E-2'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .int_
+	assert tok.lit.len == 4
+	assert tok.lit.bytestr() == '1E-2'
+}
+
+// test_int_exp_positive checks that a `+` sign is accepted in the exponent of
+// an `.int_` token.
+fn test_int_exp_positive() {
+	mut sc := Scanner{
+		text: '1E+2'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .int_
+	assert tok.lit.len == 4
+	assert tok.lit.bytestr() == '1E+2'
+}
+
+// test_float_exp checks that a number with a dot and a lowercase exponent is
+// scanned into a single `.float` token.
+fn test_float_exp() {
+	mut sc := Scanner{
+		text: '123.456e78'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .float
+	assert tok.lit.len == 10
+	assert tok.lit.bytestr() == '123.456e78'
+}
+
+// test_float_exp_negative checks that a `-` sign is accepted in the exponent
+// of a `.float` token.
+fn test_float_exp_negative() {
+	mut sc := Scanner{
+		text: '20.56e-5'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .float
+	assert tok.lit.len == 8
+	assert tok.lit.bytestr() == '20.56e-5'
+}
+
+// test_float_exp_positive checks that a `+` sign is accepted in the exponent
+// of a `.float` token.
+fn test_float_exp_positive() {
+	mut sc := Scanner{
+		text: '20.56e+5'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .float
+	assert tok.lit.len == 8
+	assert tok.lit.bytestr() == '20.56e+5'
+}
+
+// test_number_with_space checks that a leading space is skipped and not made
+// part of the number literal.
+fn test_number_with_space() {
+	mut sc := Scanner{
+		text: ' 4'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .int_
+	assert tok.lit.len == 1
+	assert tok.lit.bytestr() == '4'
+}
+
+// test_number_invalid_leading_zero checks that a number starting with `0`
+// followed by another digit is rejected.
+fn test_number_invalid_leading_zero() {
+	mut sc := Scanner{
+		text: '0010'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'leading zeroes in a number are not allowed'
+}
+
+// test_number_invalid_leading_zero_negative checks that the leading-zero rule
+// also applies after a minus sign.
+fn test_number_invalid_leading_zero_negative() {
+	mut sc := Scanner{
+		text: '-0010'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'leading zeroes in a number are not allowed'
+}
+
+// test_number_invalid_start_char checks that a leading `+` is reported as an
+// invalid token.
+fn test_number_invalid_start_char() {
+	mut sc := Scanner{
+		text: '+1'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'invalid token `+`'
+}
+
+// test_number_invalid_char checks that a letter directly after a number is
+// reported by the second scan; the first scan (discarded) consumes the digits.
+fn test_number_invalid_char() {
+	mut sc := Scanner{
+		text: '122x'.bytes()
+	}
+	sc.scan()
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'invalid token `x`'
+}
+
+// test_number_invalid_char_float checks that a letter in front of the dot
+// ends the number; the second scan reports the letter as invalid.
+fn test_number_invalid_char_float() {
+	mut sc := Scanner{
+		text: '122x.1'.bytes()
+	}
+	sc.scan()
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'invalid token `x`'
+}
+
+// test_number_invalid_multiple_dot checks that a second dot is not part of
+// the number; the second scan reports it as an invalid token.
+fn test_number_invalid_multiple_dot() {
+	mut sc := Scanner{
+		text: '122.108.10'.bytes()
+	}
+	sc.scan()
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'invalid token `.`'
+}
+
+// test_number_invalid_exp checks that an exponent marker without digits is
+// rejected.
+fn test_number_invalid_exp() {
+	mut sc := Scanner{
+		text: '0.3e'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'invalid exponent'
+}
+
+// test_number_invalid_exp_with_sign checks that an exponent consisting of a
+// sign only is rejected.
+fn test_number_invalid_exp_with_sign() {
+	mut sc := Scanner{
+		text: '0.3e+'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'invalid exponent'
+}
+
+// test_number_invalid_zero_exp checks that `0e` without exponent digits is
+// rejected.
+fn test_number_invalid_zero_exp() {
+	mut sc := Scanner{
+		text: '0e'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'invalid exponent'
+}
+
+// test_number_invalid_dot_exp checks that a dot without a following digit is
+// rejected before the exponent is looked at.
+fn test_number_invalid_dot_exp() {
+	mut sc := Scanner{
+		text: '0.e'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'invalid float'
+}
+
+// test_number_invalid_double_exp checks that a second exponent marker is
+// reported as an invalid token by the second scan; the result of the first
+// scan is discarded.
+fn test_number_invalid_double_exp() {
+	mut sc := Scanner{
+		text: '2eE'.bytes()
+	}
+	sc.scan()
+	tok := sc.scan()
+	assert tok.kind == .error
+	assert tok.lit.bytestr() == 'invalid token `E`'
+}
+
+// test_null checks that `null` is scanned into a `.null` token whose literal
+// is the keyword itself.
+fn test_null() {
+	mut sc := Scanner{
+		text: 'null'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .null
+	assert tok.lit.len == 4
+	assert tok.lit.bytestr() == 'null'
+}
+
+// test_bool_true checks that `true` is scanned into a `.bool_` token.
+fn test_bool_true() {
+	mut sc := Scanner{
+		text: 'true'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .bool_
+	assert tok.lit.len == 4
+	assert tok.lit.bytestr() == 'true'
+}
+
+// test_bool_false checks that `false` is scanned into a `.bool_` token.
+fn test_bool_false() {
+	mut sc := Scanner{
+		text: 'false'.bytes()
+	}
+	tok := sc.scan()
+	assert tok.kind == .bool_
+	assert tok.lit.len == 5
+	assert tok.lit.bytestr() == 'false'
+}