From f1dd0e33552f21a91cc85fcac0c8559ee1b32d66 Mon Sep 17 00:00:00 2001 From: Larpon Date: Sat, 20 Nov 2021 18:48:44 +0100 Subject: [PATCH] toml: add value decoding (#12521) --- vlib/toml/ast/types.v | 3 +- vlib/toml/ast/walker/walker.v | 32 ++++- vlib/toml/checker/checker.v | 2 +- vlib/toml/decoder/decoder.v | 148 ++++++++++++++++++++ vlib/toml/parser/parser.v | 19 ++- vlib/toml/scanner/scanner.v | 7 +- vlib/toml/tests/burntsushi.toml-test_test.v | 55 +------- vlib/toml/tests/strings_test.v | 4 +- 8 files changed, 206 insertions(+), 64 deletions(-) create mode 100644 vlib/toml/decoder/decoder.v diff --git a/vlib/toml/ast/types.v b/vlib/toml/ast/types.v index f3412194f6..d207b0c250 100644 --- a/vlib/toml/ast/types.v +++ b/vlib/toml/ast/types.v @@ -97,8 +97,9 @@ pub fn (n Null) str() string { // Quoted is the data representation of a TOML quoted type (`"quoted-key" = "I'm a quoted value"`). // Quoted types can appear both as keys and values in TOML documents. pub struct Quoted { +pub mut: + text string pub: - text string pos token.Position is_multiline bool quote byte diff --git a/vlib/toml/ast/walker/walker.v b/vlib/toml/ast/walker/walker.v index 8fc5bbe349..a51e714ce1 100644 --- a/vlib/toml/ast/walker/walker.v +++ b/vlib/toml/ast/walker/walker.v @@ -2,11 +2,16 @@ module walker import toml.ast -// Visitor defines a visit method which is invoked by the walker in each Value node it encounters. +// Visitor defines a visit method which is invoked by the walker on each Value node it encounters. pub interface Visitor { visit(value &ast.Value) ? } +// Modifier defines a modify method which is invoked by the walker on each Value node it encounters. +pub interface Modifier { + modify(mut value ast.Value) ? +} + pub type InspectorFn = fn (value &ast.Value, data voidptr) ? struct Inspector { @@ -31,7 +36,32 @@ pub fn walk(visitor Visitor, value &ast.Value) ? { for _, val in value_map { walk(visitor, &val) ? 
} + } + if value is []ast.Value { + value_array := value as []ast.Value + for val in value_array { + walk(visitor, &val) ? + } } else { visitor.visit(value) ? } } + +// walk_and_modify traverses the AST using the given modifier and lets the modifier +// modify the contents. +pub fn walk_and_modify(modifier Modifier, mut value ast.Value) ? { + if value is map[string]ast.Value { + mut value_map := value as map[string]ast.Value + for _, mut val in value_map { + walk_and_modify(modifier, mut &val) ? + } + } + if value is []ast.Value { + mut value_array := value as []ast.Value + for mut val in value_array { + walk_and_modify(modifier, mut &val) ? + } + } else { + modifier.modify(mut value) ? + } +} diff --git a/vlib/toml/checker/checker.v b/vlib/toml/checker/checker.v index a437c8fee2..1892886e1a 100644 --- a/vlib/toml/checker/checker.v +++ b/vlib/toml/checker/checker.v @@ -400,7 +400,7 @@ fn (c Checker) check_quoted_escapes(q ast.Quoted) ? { is_basic := q.quote == `\"` for { ch := s.next() - if ch == -1 { + if ch == scanner.end_of_text { break } ch_byte := byte(ch) diff --git a/vlib/toml/decoder/decoder.v b/vlib/toml/decoder/decoder.v new file mode 100644 index 0000000000..3eb792f2eb --- /dev/null +++ b/vlib/toml/decoder/decoder.v @@ -0,0 +1,148 @@ +// Copyright (c) 2021 Lars Pontoppidan. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module decoder + +import toml.ast +import toml.ast.walker +import toml.token +import toml.scanner +import strconv + +// Decoder decodes special sequences in a tree of TOML `ast.Value`'s. +pub struct Decoder { + scanner &scanner.Scanner +} + +// decode decodes certain `ast.Value`'s and all its children. +pub fn (d Decoder) decode(mut n ast.Value) ? { + walker.walk_and_modify(d, mut n) ? +} + +fn (d Decoder) modify(mut value ast.Value) ? { + match value { + ast.Quoted { + mut v := &(value as ast.Quoted) + d.decode_quoted(mut v) ? 
+ } + else {} + } +} + +// excerpt returns a string of the token's surroundings +fn (d Decoder) excerpt(tp token.Position) string { + return d.scanner.excerpt(tp.pos, 10) +} + +// decode_quoted decodes the escape sequences in `q` and returns an error if decoding fails. +fn (d Decoder) decode_quoted(mut q ast.Quoted) ? { + d.decode_quoted_escapes(mut q) ? +} + +// decode_quoted_escapes decodes escape sequences in `q.text` and returns an error if an escaped Unicode sequence is invalid. +// Delimiters in TOML have significant meaning: +// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get) +// "/""" delimits *basic* strings +// Allowed escapes in *basic* strings are: +// \b - backspace (U+0008) +// \t - tab (U+0009) +// \n - linefeed (U+000A) +// \f - form feed (U+000C) +// \r - carriage return (U+000D) +// \" - quote (U+0022) +// \\ - backslash (U+005C) +// \uXXXX - Unicode (U+XXXX) +// \UXXXXXXXX - Unicode (U+XXXXXXXX) +fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? { + // Setup a scanner in stack memory for easier navigation. + mut s := scanner.new_simple(q.text) ? + + q.text = q.text.replace('\\"', '"') + + // TODO use string builder + mut decoded_s := '' + // See https://toml.io/en/v1.0.0#string for more info on string types. + is_basic := q.quote == `\"` + if !is_basic { + return + } + for { + ch := s.next() + if ch == scanner.end_of_text { + break + } + ch_byte := byte(ch) + + if ch == `\\` { + ch_next := byte(s.at()) + + if ch_next == `\\` { + decoded_s += ch_next.ascii_str() + s.next() + continue + } + + if ch_next == `"` { + decoded_s += '"' + s.next() + continue + } + + if ch_next == `n` { + decoded_s += '\n' + s.next() + continue + } + + escape := ch_byte.ascii_str() + ch_next.ascii_str() + // Decode unicode escapes + if escape.to_lower() == '\\u' { + // Long type Unicode (\UXXXXXXXX) is a maximum of 10 chars: '\' + 'U' + 8 hex characters + // we pass in 10 characters from the `u`/`U` which is the longest possible sequence + // of 9 chars plus one extra. 
+ mut decoded := '' + if s.remaining() >= 10 { + pos := s.state().pos + decoded = d.decode_unicode_escape(s.text[pos..pos + 11]) or { + st := s.state() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...') + } + decoded_s += decoded + s.skip_n(s.text[pos..pos + 11].len) + continue + } else { + pos := s.state().pos + decoded = d.decode_unicode_escape(s.text[pos..]) or { + st := s.state() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...') + } + decoded_s += decoded + s.skip_n(s.text[pos..].len) + continue + } + } + } + decoded_s += ch_byte.ascii_str() + } + q.text = decoded_s +} + +// decode_unicode_escape returns an error if `esc_unicode` is not +// a valid Unicode escape sequence. `esc_unicode` is expected to be +// prefixed with either `u` or `U`. +fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string { + is_long_esc_type := esc_unicode.starts_with('U') + mut sequence := esc_unicode[1..] + hex_digits_len := if is_long_esc_type { 8 } else { 4 } + + sequence = sequence[..hex_digits_len] + + mut unicode_point := sequence + if unicode_point.len < 8 { + unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point + } + rn := rune(strconv.parse_int(unicode_point, 16, 0) ?) + return '$rn' +} diff --git a/vlib/toml/parser/parser.v b/vlib/toml/parser/parser.v index 51bec38469..05bfe32747 100644 --- a/vlib/toml/parser/parser.v +++ b/vlib/toml/parser/parser.v @@ -5,6 +5,7 @@ module parser import toml.ast import toml.checker +import toml.decoder import toml.util import toml.token import toml.scanner @@ -69,10 +70,12 @@ mut: // Config is used to configure a Parser instance. // `run_checks` is used to en- or disable running of the strict `checker.Checker` type checks. 
+// `decode_values` is used to en- or disable decoding of values with the `decoder.Decoder`. pub struct Config { pub: - scanner &scanner.Scanner - run_checks bool = true + scanner &scanner.Scanner + run_checks bool = true + decode_values bool = true } // new_parser returns a new, stack allocated, `Parser`. @@ -104,12 +107,24 @@ fn (mut p Parser) run_checker() ? { } } +// run_decoder decodes values in the parsed `ast.Value` nodes in the +// generated AST. +fn (mut p Parser) run_decoder() ? { + if p.config.decode_values { + dcoder := decoder.Decoder{ + scanner: p.scanner + } + dcoder.decode(mut p.root_map) ? + } +} + // parse starts parsing the input and returns the root // of the generated AST. pub fn (mut p Parser) parse() ?&ast.Root { p.init() ? p.root_table() ? p.run_checker() ? + p.run_decoder() ? p.ast_root.table = p.root_map return p.ast_root } diff --git a/vlib/toml/scanner/scanner.v b/vlib/toml/scanner/scanner.v index ce35869dc5..e576417d9c 100644 --- a/vlib/toml/scanner/scanner.v +++ b/vlib/toml/scanner/scanner.v @@ -9,9 +9,10 @@ import toml.input import toml.token import toml.util -pub const digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`] - -const end_of_text = -1 +pub const ( + digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`] + end_of_text = -1 +) // Scanner contains the necessary fields for the state of the scan process. // the task the scanner does is also refered to as "lexing" or "tokenizing". 
diff --git a/vlib/toml/tests/burntsushi.toml-test_test.v b/vlib/toml/tests/burntsushi.toml-test_test.v index 73831231d4..2113db8605 100644 --- a/vlib/toml/tests/burntsushi.toml-test_test.v +++ b/vlib/toml/tests/burntsushi.toml-test_test.v @@ -19,7 +19,6 @@ const ( valid_value_exceptions = [ // String 'string/escapes.toml', - 'string/escape-tricky.toml', 'string/multiline.toml', // Integer 'integer/long.toml', @@ -199,13 +198,7 @@ fn test_burnt_sushi_tomltest() { fn to_burntsushi(value ast.Value) string { match value { ast.Quoted { - mut json_text := '' - if value.quote == `"` { - json_text = toml_to_json_escapes(value) or { '' } - } else { - json_text = json2.Any(value.text).json_str() - } - + json_text := json2.Any(value.text).json_str() return '{ "type": "string", "value": "$json_text" }' } ast.DateTime { @@ -271,49 +264,3 @@ fn to_burntsushi(value ast.Value) string { } return '' } - -// toml_to_json_escapes is a utility function for normalizing -// TOML basic string to JSON string -fn toml_to_json_escapes(q ast.Quoted) ?string { - mut s := scanner.new_simple(q.text) ? - mut r := '' - for { - ch := s.next() - if ch == scanner.end_of_text { - break - } - ch_byte := byte(ch) - - if ch == `"` { - if byte(s.peek(-1)) != `\\` { - r += '\\' - } - } - - if ch == `\\` { - next_ch := byte(s.at()) - - escape := ch_byte.ascii_str() + next_ch.ascii_str() - if escape.to_lower() == '\\u' { - mut b := s.next() - mut unicode_point := '' - for { - b = s.next() - if b != ` ` && b != scanner.end_of_text { - unicode_point += byte(b).ascii_str() - } else { - break - } - } - if unicode_point.len < 8 { - unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point - } - rn := rune(strconv.parse_int(unicode_point, 16, 0) ?) 
- r += '$rn' - continue - } - } - r += ch_byte.ascii_str() - } - return r -} diff --git a/vlib/toml/tests/strings_test.v b/vlib/toml/tests/strings_test.v index 961c6e9cc1..a96b2e6574 100644 --- a/vlib/toml/tests/strings_test.v +++ b/vlib/toml/tests/strings_test.v @@ -72,9 +72,9 @@ fn test_unicode_escapes() { mut toml_doc := toml.parse(toml_unicode_escapes) or { panic(err) } mut value := toml_doc.value('short') - assert value.string() == r'\u03B4' + assert value.string() == '\u03B4' // <- This escape is handled by V value = toml_doc.value('long') - assert value.string() == r'\U000003B4' + assert value.string() == 'δ' // <- for the long escape we compare with the unicode point } fn test_literal_strings() {