toml: add value decoding (#12521)

2023-08-10 21:13:21 +03:00 · 2021-11-20 18:48:44 +01:00
parent 4b9e8e243c
commit f1dd0e3355
8 changed files with 206 additions and 64 deletions
--- a/vlib/toml/decoder/decoder.v
+++ b/vlib/toml/decoder/decoder.v
@@ -0,0 +1,148 @@
+// Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
+// Use of this source code is governed by an MIT license
+// that can be found in the LICENSE file.
+module decoder
+
+import toml.ast
+import toml.ast.walker
+import toml.token
+import toml.scanner
+import strconv
+
+// Decoder decodes special sequences in a tree of TOML `ast.Value`s.
+pub struct Decoder {
+	scanner &scanner.Scanner // used by `excerpt` to quote the surrounding input text in error messages
+}
+
+// decode decodes certain `ast.Value`s and all their children.
+// It hands the decoder and the tree rooted at `n` to `walker.walk_and_modify`,
+// which presumably calls `modify` on every visited value - confirm against `toml.ast.walker`.
+// Any error raised during the walk is propagated to the caller.
+pub fn (d Decoder) decode(mut n ast.Value) ? {
+	walker.walk_and_modify(d, mut n) ?
+}
+
+// modify decodes `value` in place when it holds an `ast.Quoted`.
+// Every other kind of `ast.Value` is left untouched.
+// Errors from `decode_quoted` are propagated to the caller.
+fn (d Decoder) modify(mut value ast.Value) ? {
+	if value is ast.Quoted {
+		// Take a reference to the variant so the decoded text is written back into the tree.
+		mut quoted := &(value as ast.Quoted)
+		d.decode_quoted(mut quoted) ?
+	}
+}
+
+// excerpt returns a string of the token's surroundings.
+// It forwards `tp.pos` to the scanner's `excerpt` together with the value 10,
+// presumably the amount of context to include - confirm against `scanner.Scanner.excerpt`.
+fn (d Decoder) excerpt(tp token.Position) string {
+	return d.scanner.excerpt(tp.pos, 10)
+}
+
+// decode_quoted decodes `q` in place by running `decode_quoted_escapes` on it.
+// Any error from that step is propagated to the caller.
+fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
+	d.decode_quoted_escapes(mut q) ?
+}
+
+// decode_quoted_escapes replaces the escape sequences in `q.text` with the
+// characters they represent. It returns an error if a Unicode escape is
+// truncated or cannot be decoded.
+// Delimiters in TOML have significant meaning:
+// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
+// "/""" delimits *basic* strings
+// Literal strings have no escape sequences, so their text is left untouched.
+// Allowed escapes in *basic* strings are:
+// \b         - backspace       (U+0008)
+// \t         - tab             (U+0009)
+// \n         - linefeed        (U+000A)
+// \f         - form feed       (U+000C)
+// \r         - carriage return (U+000D)
+// \"         - quote           (U+0022)
+// \\         - backslash       (U+005C)
+// \uXXXX     - Unicode         (U+XXXX)
+// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
+// Any other backslash sequence is copied to the output unchanged.
+fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
+	// See https://toml.io/en/v1.0.0#string for more info on string types.
+	is_basic := q.quote == `\"`
+	if !is_basic {
+		// Nothing may be rewritten in a literal string, not even `\"`.
+		return
+	}
+
+	// Setup a scanner in stack memory for easier navigation.
+	mut s := scanner.new_simple(q.text) ?
+
+	// TODO use string builder
+	mut decoded_s := ''
+	for {
+		ch := s.next()
+		if ch == scanner.end_of_text {
+			break
+		}
+		ch_byte := byte(ch)
+
+		if ch == `\\` {
+			ch_next := byte(s.at())
+
+			// Single character escapes each map to exactly one output byte.
+			mut simple := ''
+			if ch_next == `\\` {
+				simple = '\\'
+			} else if ch_next == `"` {
+				simple = '"'
+			} else if ch_next == `b` {
+				simple = byte(0x08).ascii_str()
+			} else if ch_next == `t` {
+				simple = byte(0x09).ascii_str()
+			} else if ch_next == `n` {
+				simple = byte(0x0a).ascii_str()
+			} else if ch_next == `f` {
+				simple = byte(0x0c).ascii_str()
+			} else if ch_next == `r` {
+				simple = byte(0x0d).ascii_str()
+			}
+			if simple.len > 0 {
+				decoded_s += simple
+				// Consume the character that followed the backslash.
+				s.next()
+				continue
+			}
+
+			// Decode unicode escapes
+			if ch_next == `u` || ch_next == `U` {
+				// Short type (\uXXXX) has 4 hex digits, long type (\UXXXXXXXX) has 8.
+				hex_digits_len := if ch_next == `U` { 8 } else { 4 }
+				// The scanner is positioned at the `u`/`U`, so the sequence
+				// is that character plus the hex digits and nothing more.
+				sequence_len := 1 + hex_digits_len
+				pos := s.state().pos
+				if pos + sequence_len > s.text.len {
+					st := s.state()
+					return error(@MOD + '.' + @STRUCT + '.' + @FN +
+						' escaped Unicode is invalid. Expected $hex_digits_len hexadecimal digits ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
+				}
+				decoded := d.decode_unicode_escape(s.text[pos..pos + sequence_len]) or {
+					st := s.state()
+					return error(@MOD + '.' + @STRUCT + '.' + @FN +
+						' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
+				}
+				decoded_s += decoded
+				// Skip only the escape itself so the text following it is kept.
+				s.skip_n(sequence_len)
+				continue
+			}
+		}
+		decoded_s += ch_byte.ascii_str()
+	}
+	q.text = decoded_s
+}
+
+// decode_unicode_escape returns the character encoded by `esc_unicode`,
+// or an error if `esc_unicode` is not a valid Unicode escape sequence.
+// `esc_unicode` is expected to be prefixed with either `u` or `U`, followed
+// by at least 4 (`u`) or 8 (`U`) hexadecimal digits. Anything after those
+// digits is ignored.
+fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string {
+	is_long_esc_type := esc_unicode.starts_with('U')
+	hex_digits_len := if is_long_esc_type { 8 } else { 4 }
+
+	// Slicing past the end of a string panics, so malformed input
+	// is reported as an error before any slicing takes place.
+	if esc_unicode.len < 1 + hex_digits_len {
+		return error('unicode escape "$esc_unicode" is too short, expected $hex_digits_len hexadecimal digits')
+	}
+
+	// Drop the `u`/`U` prefix and keep only the hex digits.
+	sequence := esc_unicode[1..1 + hex_digits_len]
+
+	mut unicode_point := sequence
+	if unicode_point.len < 8 {
+		// Left-pad short sequences with zeros so both types are parsed as 8 digits.
+		unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
+	}
+	rn := rune(strconv.parse_int(unicode_point, 16, 0) ?)
+	return '$rn'
+}