x.json2: create custom scanner for scanning JSON (#8716)

2023-08-10 21:13:21 +03:00 · 2021-02-26 14:36:02 +08:00
parent 7bee3dc489
commit 8dff168e01
8 changed files with 730 additions and 372 deletions
--- a/vlib/x/json2/decoder.v
+++ b/vlib/x/json2/decoder.v
@@ -3,42 +3,20 @@
 // that can be found in the LICENSE file.
 module json2

-import strings
-import strconv
-import v.scanner
-import v.token
-import v.util
-import v.pref
-
 // `Any` is a sum type that lists the possible types to be decoded and used.
-pub type Any = string | int | i64 | f32 | f64 | bool | Null | []Any | map[string]Any
+pub type Any = Null | []Any | bool | f32 | f64 | i64 | int | map[string]Any | string

 // `Null` struct is a simple representation of the `null` value in JSON.
 pub struct Null {
+	is_null bool = true
 }

-enum ParseMode {
-	array
-	bool
-	invalid
-	null
-	number
-	object
-	string
-}
-
-const (
-	formfeed_err = 'formfeed not allowed.'
-	eof_err      = 'reached eof. data not closed properly.'
-)
-
+// Parser keeps the scanner and the small window of tokens used while decoding.
 struct Parser {
 mut:
-	scanner      &scanner.Scanner
-	p_tok        token.Token
-	tok          token.Token
-	n_tok        token.Token
-	mode         ParseMode = .invalid
+	// source of tokens; `next` stores the result of `scanner.scan()` in `n_tok`
+	scanner      &Scanner
+	// NOTE(review): presumably the token that was current before the last `next` - confirm against `next`
+	p_tok        Token
+	// token currently being examined by the decode functions
+	tok          Token
+	// lookahead token, filled from `scanner.scan()`
+	n_tok        Token
+	// nesting counter; `decode_value` returns an error once it reaches 500
 	n_level      int
+	// when true, numbers, booleans and null are decoded into typed values;
+	// when false they are returned in their string form
 	convert_type bool = true
 }
@@ -49,131 +27,63 @@ fn (mut p Parser) next() {
 	p.n_tok = p.scanner.scan()
 }

-fn (p Parser) emit_error(msg string) string {
-	source := p.scanner.text
-	cur := p.tok
-	mut pp := util.imax(0, util.imin(source.len - 1, cur.pos))
-	if source.len > 0 {
-		for pp >= 0 {
-			if source[pp] !in [`\r`, `\n`] {
-				pp--
-				continue
-			}
-			break
-		}
+// next_with_err advances the parser with `next` and then inspects `p.tok`:
+// when its kind is `.error`, the token's literal is turned into a string and
+// returned as an error formatted by `emit_error`.
+fn (mut p Parser) next_with_err() ? {
+	p.next()
+	if p.tok.kind == .error {
+		return error(p.emit_error(p.tok.lit.bytestr()))
 	}
-	column := util.imax(0, cur.pos - pp + cur.len - 1)
-	line := cur.line_nr
-	return '[json] $msg ($line:$column)'
 }

-fn new_parser(srce string, convert_type bool) Parser {
-	mut src := srce
-	// from v/util/util.v
-	if src.len >= 3 {
-		c_text := src.str
+// emit_error formats `msg` as an `[x.json2]` error message, followed by the
+// current token's line and a column computed as `col + lit.len`.
+fn (p Parser) emit_error(msg string) string {
+	cur := p.tok
+	line, column := cur.line, cur.col + cur.lit.len
+	return '[x.json2] $msg ($line:$column)'
+}
+
+// TODO: copied from v.util to avoid the entire module and its functions
+// from being imported. remove later once -skip-unused is enabled by default.
+// skip_bom returns `file_content` without its first three bytes when they are
+// the UTF-8 byte order mark (0xEF 0xBB 0xBF). Content shorter than three bytes,
+// or content that does not start with the mark, is returned unchanged.
+fn skip_bom(file_content string) string {
+	mut raw_text := file_content
+	// BOM check
+	if raw_text.len >= 3 {
 		unsafe {
+			c_text := raw_text.str
 			if c_text[0] == 0xEF && c_text[1] == 0xBB && c_text[2] == 0xBF {
 				// skip three BOM bytes
 				offset_from_begin := 3
-				src = tos(c_text[offset_from_begin], vstrlen(c_text) - offset_from_begin)
+				raw_text = tos(c_text[offset_from_begin], vstrlen(c_text) - offset_from_begin)
 			}
 		}
 	}
+	return raw_text
+}
+
+// new_parser creates a `Parser` for `srce`. The text is first passed through
+// `skip_bom`, its bytes become the scanner's `text`, and `convert_type` is
+// stored on the parser as given.
+fn new_parser(srce string, convert_type bool) Parser {
+	src := skip_bom(srce)
 	return Parser{
-		scanner: scanner.new_scanner(src, .parse_comments, &pref.Preferences{output_mode: .silent})
+		scanner: &Scanner{
+			text: src.bytes()
+		}
 		convert_type: convert_type
 	}
 }

-fn check_valid_hex(str string) ? {
-	if str.len != 4 {
-		return error('hex string must be 4 characters.')
-	}
-	for l in str {
-		if l.is_hex_digit() {
-			continue
-		}
-		return error('provided string is not a hex digit.')
-	}
-}
-
+// decode decodes a single value from the input and returns it. Errors from
+// advancing the parser or from `decode_value` are propagated, and an error is
+// returned when the token left after the value is not `.eof`.
 fn (mut p Parser) decode() ?Any {
-	p.detect_parse_mode()
-	if p.mode == .invalid {
-		return error(p.emit_error('invalid JSON.'))
-	}
-	fi := p.decode_value() or {
-		return error(p.emit_error(err))
-	}
+	// NOTE(review): two advances before the first value - presumably the first
+	// fills the lookahead `n_tok` and the second moves it into `p.tok`; confirm against `next`
+	p.next()
+	p.next_with_err() ?
+	fi := p.decode_value() ?
 	if p.tok.kind != .eof {
-		return error(p.emit_error('unknown token `$p.tok.kind`.'))
+		return error(p.emit_error('invalid token `$p.tok.kind`'))
 	}
 	return fi
 }

-fn (p Parser) is_formfeed() bool {
-	prev_tok_pos := p.p_tok.pos + p.p_tok.len - 2
-	if prev_tok_pos < p.scanner.text.len && p.scanner.text[prev_tok_pos] == 0x0c {
-		return true
-	}
-	return false
-}
-
-fn (p Parser) is_singlequote() bool {
-	src := p.scanner.text
-	prev_tok_pos := p.p_tok.pos + p.p_tok.len
-	return src[prev_tok_pos] == `\'`
-}
-
-fn (mut p Parser) detect_parse_mode() {
-	src := p.scanner.text
-	if src.len > 1 && src[0].is_digit() && !src[1].is_digit() {
-		p.mode = .invalid
-		return
-	}
-	p.tok = p.scanner.scan()
-	p.n_tok = p.scanner.scan()
-	if src.len == 1 && p.tok.kind == .string && p.n_tok.kind == .eof {
-		p.mode = .invalid
-		return
-	}
-	match p.tok.kind {
-		.lcbr {
-			p.mode = .object
-		}
-		.lsbr {
-			p.mode = .array
-		}
-		.number {
-			p.mode = .number
-		}
-		.key_true, .key_false {
-			p.mode = .bool
-		}
-		.string {
-			p.mode = .string
-		}
-		.name {
-			if p.tok.lit == 'null' {
-				p.mode = .null
-			}
-		}
-		.minus {
-			if p.n_tok.kind == .number {
-				p.mode = .number
-			}
-		}
-		else {}
-	}
-}
-
 fn (mut p Parser) decode_value() ?Any {
 	if p.n_level == 500 {
-		return error('reached maximum nesting level of 500.')
+		return error(p.emit_error('reached maximum nesting level of 500'))
 	}
-	if (p.tok.kind == .lsbr && p.n_tok.kind == .lcbr) ||
-		(p.p_tok.kind == p.tok.kind && p.tok.kind == .lsbr) {
+	if (p.tok.kind == .lsbr && p.n_tok.kind == .lcbr)
+		|| (p.p_tok.kind == p.tok.kind && p.tok.kind == .lsbr) {
 		p.n_level++
 	}
 	match p.tok.kind {
@@ -183,235 +93,76 @@ fn (mut p Parser) decode_value() ?Any {
 		.lcbr {
 			return p.decode_object()
 		}
-		.number {
-			return p.decode_number()
+		.int_, .float {
+			tl := p.tok.lit.bytestr()
+			kind := p.tok.kind
+			p.next_with_err() ?
+			if p.convert_type {
+				return if kind == .float { Any(tl.f64()) } else { Any(tl.i64()) }
+			}
+			return Any(tl)
 		}
-		.key_true {
-			p.next()
-			return if p.convert_type {
-				Any(true)
-			} else {
-				Any('true')
-			}
+		.bool_ {
+			lit := p.tok.lit.bytestr()
+			p.next_with_err() ?
+			return if p.convert_type { Any(lit.bool()) } else { Any(lit) }
 		}
-		.key_false {
-			p.next()
-			return if p.convert_type {
-				Any(false)
-			} else {
-				Any('false')
-			}
+		.null {
+			p.next_with_err() ?
+			return if p.convert_type { Any(null) } else { Any('null') }
 		}
-		.name {
-			if p.tok.lit != 'null' {
-				return error('unknown identifier `$p.tok.lit`')
-			}
-			p.next()
-			return if p.convert_type {
-				Any(Null{})
-			} else {
-				Any('null')
-			}
-		}
-		.string {
-			if p.is_singlequote() {
-				return error('strings must be in double-quotes.')
-			}
-			return p.decode_string()
+		.str_ {
+			str := p.tok.lit.bytestr()
+			p.next_with_err() ?
+			return Any(str)
 		}
 		else {
-			if p.tok.kind == .minus && p.n_tok.kind == .number && p.n_tok.pos == p.tok.pos + 1 {
-				p.next()
-				d_num := p.decode_number() ?
-				return d_num
-			}
-			return error("unknown token '$p.tok.lit' when decoding value")
+			return error(p.emit_error('invalid token `$p.tok.kind`'))
 		}
 	}
-	if p.is_formfeed() {
-		return error(formfeed_err)
-	}
 	return Any{}
 }

-fn (mut p Parser) decode_string() ?Any {
-	mut strwr := strings.new_builder(200)
-	for i := 0; i < p.tok.lit.len; i++ {
-		if ((i - 1 >= 0 && p.tok.lit[i - 1] != `/`) || i == 0) && int(p.tok.lit[i]) in [9, 10, 0] {
-			return error('character must be escaped with a backslash.')
-		}
-		if i == p.tok.lit.len - 1 && p.tok.lit[i] == 92 {
-			return error('invalid backslash escape.')
-		}
-		if i + 1 < p.tok.lit.len && p.tok.lit[i] == 92 {
-			peek := p.tok.lit[i + 1]
-			match peek {
-				`b` {
-					i++
-					strwr.write_b(`\b`)
-					continue
-				}
-				`f` {
-					i++
-					strwr.write_b(`\f`)
-					continue
-				}
-				`n` {
-					i++
-					strwr.write_b(`\n`)
-					continue
-				}
-				`r` {
-					i++
-					strwr.write_b(`\r`)
-					continue
-				}
-				`t` {
-					i++
-					strwr.write_b(`\t`)
-					continue
-				}
-				`u` {
-					if i + 5 < p.tok.lit.len {
-						codepoint := p.tok.lit[i + 2..i + 6]
-						check_valid_hex(codepoint) ?
-						hex_val := strconv.parse_int(codepoint, 16, 0)
-						strwr.write_b(byte(hex_val))
-						i += 5
-						continue
-					} else {
-						return error('incomplete unicode escape.')
-					}
-				}
-				`\\` {
-					i++
-					strwr.write_b(`\\`)
-					continue
-				}
-				`"` {
-					i++
-					strwr.write_b(`\"`)
-					continue
-				}
-				`/` {
-					i++
-					strwr.write_b(`/`)
-					continue
-				}
-				else { return error('invalid backslash escape.') }
-			}
-			if int(peek) == 85 {
-				return error('unicode endpoints must be in lowercase `u`.')
-			}
-			if int(peek) in [9, 229] {
-				return error('unicode endpoint not allowed.')
-			}
-		}
-		strwr.write_b(p.tok.lit[i])
-	}
-	p.next()
-	defer {
-		unsafe { strwr.free() }
-	}
-	str := strwr.str()
-	return Any(str)
-}
-
-// now returns string instead of int or float
-fn (mut p Parser) decode_number() ?Any {
-	src := p.scanner.text
-	mut tl := p.tok.lit
-	mut is_fl := false
-	sep_by_dot := tl.to_lower().split('.')
-	if tl.starts_with('0x') && tl.all_after('0x').len <= 2 {
-		return error('hex numbers should not be less than or equal to two digits.')
-	}
-	if src[p.p_tok.pos + p.p_tok.len] == `0` && src[p.p_tok.pos + p.p_tok.len + 1].is_digit() {
-		return error('leading zeroes in integers are not allowed.')
-	}
-	if tl.starts_with('.') {
-		return error('decimals must start with a digit followed by a dot.')
-	}
-	if tl.ends_with('+') || tl.ends_with('-') {
-		return error('exponents must have a digit before the sign.')
-	}
-	if sep_by_dot.len > 1 {
-		// analyze json number structure
-		// -[digit][dot][digit][E/e][-/+][digit]
-		// float number
-		is_fl = true
-		last := sep_by_dot.last()
-		if last.starts_with('e') {
-			return error('exponents must have a digit before the exponent notation.')
-		}
-	}
-	if p.p_tok.kind == .minus && p.tok.pos == p.p_tok.pos + 1 {
-		tl = '-$tl'
-	}
-	p.next()
-	if p.convert_type {
-		return if is_fl {
-			Any(tl.f64())
-		} else {
-			Any(tl.i64())
-		}
-	}
-	return Any(tl)
-}
-
 fn (mut p Parser) decode_array() ?Any {
 	mut items := []Any{}
-	p.next()
+	p.next_with_err() ?
 	for p.tok.kind != .rsbr {
-		if p.tok.kind == .eof {
-			return error(eof_err)
-		}
 		item := p.decode_value() ?
 		items << item
-		if p.tok.kind == .comma && p.n_tok.kind !in [.rsbr, .comma] {
-			p.next()
-			continue
-		}
-		if p.tok.kind == .rsbr {
+		if p.tok.kind == .comma {
+			p.next_with_err() ?
+			if p.tok.kind == .rsbr || p.tok.kind == .rcbr {
+				return error(p.emit_error('invalid token `$p.tok.lit'))
+			}
+		} else if p.tok.kind == .rsbr {
 			break
+		} else {
+			return error(p.emit_error("unknown token '$p.tok.lit' when decoding array."))
 		}
-		return error("unknown token '$p.tok.lit' when decoding arrays.")
 	}
-	p.next()
+	p.next_with_err() ?
 	return Any(items)
 }

 fn (mut p Parser) decode_object() ?Any {
 	mut fields := map[string]Any{}
-	mut cur_key := ''
-	p.next()
+	p.next_with_err() ?
 	for p.tok.kind != .rcbr {
-		is_key := p.tok.kind == .string && p.n_tok.kind == .colon
-		// todo
-		// if p.is_formfeed() {
-		// return error(formfeed_err)
-		// }
-		if p.tok.kind == .eof {
-			return error(eof_err)
-		}
-		if p.is_singlequote() {
-			return error('object keys must be in single quotes.')
-		}
+		is_key := p.tok.kind == .str_ && p.n_tok.kind == .colon
 		if !is_key {
-			return error("invalid token `$p.tok.lit`, expected \'string\'")
+			return error(p.emit_error('invalid token `$p.tok.kind`, expecting `str_`'))
 		}
-		cur_key = p.tok.lit
-		p.next()
-		p.next()
+		cur_key := p.tok.lit.bytestr()
+		p.next_with_err() ?
+		p.next_with_err() ?
 		fields[cur_key] = p.decode_value() ?
-		if p.tok.kind == .comma && p.n_tok.kind !in [.rcbr, .comma] {
-			p.next()
-			continue
-		} else if p.tok.kind == .rcbr {
-			break
+		if p.tok.kind == .comma {
+			p.next_with_err() ?
+			if p.tok.kind != .str_ {
+				return error(p.emit_error("unknown token '$p.tok.lit' when decoding object."))
+			}
 		}
-		return error("unknown token '$p.tok.lit' when decoding object.")
 	}
-	p.next()
+	p.next_with_err() ?
 	return Any(fields)
 }