From e3d3727c0c3b9c85a66ddfc56f91c74ee1f0df28 Mon Sep 17 00:00:00 2001 From: Larpon Date: Wed, 29 Sep 2021 18:28:09 +0200 Subject: [PATCH] toml: fix 7 escape tests (#12017) --- vlib/toml/checker/checker.v | 68 +++++++++++++++++++-- vlib/toml/scanner/scanner.v | 44 +++++++++---- vlib/toml/tests/burntsushi.toml-test_test.v | 11 ++-- 3 files changed, 98 insertions(+), 25 deletions(-) diff --git a/vlib/toml/checker/checker.v b/vlib/toml/checker/checker.v index 109d76464b..b8868a35bf 100644 --- a/vlib/toml/checker/checker.v +++ b/vlib/toml/checker/checker.v @@ -9,6 +9,8 @@ import toml.ast.walker import toml.token import toml.scanner +pub const allowed_basic_escape_chars = [`u`, `U`, `b`, `t`, `n`, `f`, `r`, `"`, `\\`] + // Checker checks a tree of TOML `ast.Value`'s for common errors. pub struct Checker { scanner &scanner.Scanner @@ -172,12 +174,68 @@ fn (c Checker) check_boolean(b ast.Bool) ? { ' boolean values like "$lit" can only be `true` or `false` literals, not `$lit` in ...${c.excerpt(b.pos)}...') } -fn (c Checker) check_quoted(b ast.Quoted) ? { - lit := b.text - quote := b.quote.ascii_str() +fn (c Checker) check_quoted(q ast.Quoted) ? { + lit := q.text + quote := q.quote.ascii_str() triple_quote := quote + quote + quote - if b.is_multiline && lit.ends_with(triple_quote) { + if q.is_multiline && lit.ends_with(triple_quote) { return error(@MOD + '.' + @STRUCT + '.' + @FN + - ' string values like "$lit" is has unbalanced quote literals `b.quote` in ...${c.excerpt(b.pos)}...') + ' string values like "$lit" have unbalanced quote literals `$quote` in ...${c.excerpt(q.pos)}...') + } + c.check_quoted_escapes(q) ? +} + +// check_quoted_escapes returns an error for any disallowed escape sequences. 
+// Delimiters in TOML have significant meaning: +// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get) +// "/""" delimits *basic* strings +// Allowed escapes in *basic* strings are: +// \b - backspace (U+0008) +// \t - tab (U+0009) +// \n - linefeed (U+000A) +// \f - form feed (U+000C) +// \r - carriage return (U+000D) +// \" - quote (U+0022) +// \\ - backslash (U+005C) +// \uXXXX - unicode (U+XXXX) +// \UXXXXXXXX - unicode (U+XXXXXXXX) +fn (c Checker) check_quoted_escapes(q ast.Quoted) ? { + // Set up a scanner in stack memory for easier navigation. + mut s := scanner.new_simple(q.text) ? + + is_basic := q.quote == `\"` + for { + ch := s.next() + if ch == -1 { + break + } + ch_byte := byte(ch) + if ch == `\\` { + next_ch := byte(s.at()) + + if next_ch == `\\` { + s.next() + continue + } + escape := ch_byte.ascii_str() + next_ch.ascii_str() + if is_basic { + if q.is_multiline { + if next_ch == byte(32) && s.peek(1) == byte(92) { + st := s.state() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' can not escape whitespaces before escapes in multi-line strings (`\\ \\`) at `$escape` ($st.line_nr,$st.col) in ...${c.excerpt(q.pos)}...') + } + if next_ch in [`\t`, `\n`, ` `] { + s.next() + continue + } + } + if next_ch !in checker.allowed_basic_escape_chars { + st := s.state() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' unknown basic string escape character `$next_ch.ascii_str()` in `$escape` ($st.line_nr,$st.col) in ...${c.excerpt(q.pos)}...') + } + } + } } } diff --git a/vlib/toml/scanner/scanner.v b/vlib/toml/scanner/scanner.v index 7df0f3cf29..a596014d32 100644 --- a/vlib/toml/scanner/scanner.v +++ b/vlib/toml/scanner/scanner.v @@ -25,6 +25,16 @@ mut: mode Mode // sub-mode of the scanner } +// State is a read-only copy of the scanner's internal state. +// See also `Scanner.state()`. 
+pub struct State { +pub: + col int // current column number (x coordinate) + line_nr int = 1 // current line number (y coordinate) + pos int // current flat/index position in the `text` field + mode Mode // sub-mode of the scanner +} + enum Mode { normal inside_string @@ -426,6 +436,8 @@ fn (mut s Scanner) extract_multiline_string() ?string { } c := s.at() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `$c.ascii_str()` / $c (quote type: $quote/$quote.ascii_str())') + if c == `\n` { s.inc_line_number() lit += c.ascii_str() @@ -443,8 +455,6 @@ fn (mut s Scanner) extract_multiline_string() ?string { } } - util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `$c.ascii_str()` / $c') - if c == quote { if s.peek(1) == quote && s.peek(2) == quote { if s.peek(3) == -1 { @@ -469,14 +479,16 @@ fn (mut s Scanner) extract_multiline_string() ?string { return lit } -// handle_escapes +// handle_escapes returns any escape character sequence. +// For escape sequence validation see `Checker.check_quoted_escapes`. fn (mut s Scanner) handle_escapes(quote byte, is_multiline bool) (string, int) { c := s.at() mut lit := c.ascii_str() - if s.peek(1) == byte(92) { - lit += lit - util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `$lit`') - return lit, 1 + if s.peek(1) == `u` && byte(s.peek(2)).is_hex_digit() && byte(s.peek(3)).is_hex_digit() + && byte(s.peek(4)).is_hex_digit() && byte(s.peek(5)).is_hex_digit() { + lit += s.text[s.pos + 1..s.pos + 6] //.ascii_str() + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped unicode `$lit`') + return lit, 4 } else if s.peek(1) == quote { if (!is_multiline && s.peek(2) == `\n`) || (is_multiline && s.peek(2) == quote && s.peek(3) == quote && s.peek(4) == `\n`) { @@ -486,13 +498,9 @@ fn (mut s Scanner) handle_escapes(quote byte, is_multiline bool) (string, int) { lit += quote.ascii_str() util.printdbg(@MOD + '.' + @STRUCT + '.' 
+ @FN, 'gulp escaped `$lit`') return lit, 1 - } else if s.peek(1) == `u` && byte(s.peek(2)).is_hex_digit() && byte(s.peek(3)).is_hex_digit() - && byte(s.peek(4)).is_hex_digit() && byte(s.peek(5)).is_hex_digit() { - lit += s.text[s.pos + 1..s.pos + 6] //.ascii_str() - util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `$lit`') - return lit, 4 } - return '', 0 + lit += byte(s.peek(1)).ascii_str() + return lit, 1 } // extract_number collects and returns a string containing @@ -542,3 +550,13 @@ pub fn (s Scanner) excerpt(pos int, margin int) string { end := if pos + margin < s.text.len { pos + margin } else { s.text.len } return s.text[start..end].replace('\n', r'\n') } + +// state returns a read-only view of the scanner's internal state. +pub fn (s Scanner) state() State { + return State{ + col: s.col + line_nr: s.line_nr + pos: s.pos + mode: s.mode + } +} diff --git a/vlib/toml/tests/burntsushi.toml-test_test.v b/vlib/toml/tests/burntsushi.toml-test_test.v index 69a8eff371..ccb7dcfd5d 100644 --- a/vlib/toml/tests/burntsushi.toml-test_test.v +++ b/vlib/toml/tests/burntsushi.toml-test_test.v @@ -15,19 +15,12 @@ const ( invalid_exceptions = [ // String 'string/basic-multiline-out-of-range-unicode-escape-1.toml', - 'string/basic-byte-escapes.toml', - 'string/multiline-escape-space.toml', 'string/bad-codepoint.toml', 'string/basic-multiline-out-of-range-unicode-escape-2.toml', - 'string/bad-slash-escape.toml', 'string/basic-out-of-range-unicode-escape-1.toml', 'string/basic-out-of-range-unicode-escape-2.toml', 'string/bad-uni-esc.toml', - 'string/bad-escape.toml', - 'string/basic-multiline-unknown-escape.toml', 'string/missing-quotes.toml', - 'string/bad-byte-escape.toml', - 'string/basic-unknown-escape.toml', // Integer 'integer/capital-bin.toml', 'integer/invalid-bin.toml', @@ -155,6 +148,10 @@ fn test_burnt_sushi_tomltest() { if relative !in invalid_exceptions { println('OK [$i/$invalid_test_files.len] "$invalid_test_file"...') if toml_doc := 
toml.parse_file(invalid_test_file) { + content_that_should_have_failed := os.read_file(invalid_test_file) or { + panic(err) + } + println(' This TOML should have failed:\n${'-'.repeat(40)}\n$content_that_should_have_failed\n${'-'.repeat(40)}') assert false } else { println(' $err.msg')