From 9a3967bd7d0b1583dcddd7df48b1ab2fc8a181f9 Mon Sep 17 00:00:00 2001 From: Larpon Date: Tue, 26 Oct 2021 15:58:05 +0200 Subject: [PATCH] toml: improve comment support (#12305) --- vlib/toml/ast/ast.v | 3 +- vlib/toml/checker/checker.v | 40 +++++++++++++++++++-- vlib/toml/parser/parser.v | 5 ++- vlib/toml/scanner/scanner.v | 16 +++------ vlib/toml/tests/burntsushi.toml-test_test.v | 3 -- 5 files changed, 49 insertions(+), 18 deletions(-) diff --git a/vlib/toml/ast/ast.v b/vlib/toml/ast/ast.v index 4c35ac4f93..c3f9c3fe5d 100644 --- a/vlib/toml/ast/ast.v +++ b/vlib/toml/ast/ast.v @@ -11,7 +11,8 @@ pub struct Root { pub: input input.Config // User input configuration pub mut: - table Value + comments []Comment + table Value // errors []errors.Error // all the checker errors in the file } diff --git a/vlib/toml/checker/checker.v b/vlib/toml/checker/checker.v index 97fea91462..96af385922 100644 --- a/vlib/toml/checker/checker.v +++ b/vlib/toml/checker/checker.v @@ -5,9 +5,10 @@ module checker import toml.ast import toml.ast.walker -// import toml.util +import toml.util import toml.token import toml.scanner +import encoding.utf8 pub const allowed_basic_escape_chars = [`u`, `U`, `b`, `t`, `n`, `f`, `r`, `"`, `\\`] @@ -255,9 +256,10 @@ fn (c Checker) check_quoted(q ast.Quoted) ? { triple_quote := quote + quote + quote if q.is_multiline && lit.ends_with(triple_quote) { return error(@MOD + '.' + @STRUCT + '.' + @FN + - ' string values like "$lit" is has unbalanced quote literals `q.quote` in ...${c.excerpt(q.pos)}...') + ' string values like "$lit" has unbalanced quote literals `q.quote` in ...${c.excerpt(q.pos)}...') } c.check_quoted_escapes(q) ? + c.check_utf8_validity(q) ? } // check_quoted_escapes returns an error for any disallowed escape sequences. @@ -314,3 +316,37 @@ fn (c Checker) check_quoted_escapes(q ast.Quoted) ? { } } } + +// check_utf8_validity returns an error if the text of `q` is not valid UTF-8. +fn (c Checker) check_utf8_validity(q ast.Quoted) ? 
{ + lit := q.text + if !utf8.validate_str(lit) { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' the string value "$lit" is not valid UTF-8 in ...${c.excerpt(q.pos)}...') + } +} + +pub fn (c Checker) check_comment(cmt ast.Comment) ? { + lit := cmt.text + // Setup a scanner in stack memory for easier navigation. + mut s := scanner.new_simple(lit) ? + for { + ch := s.next() + if ch == -1 { + break + } + ch_byte := byte(ch) + // Check for control characters (allow TAB) + if util.is_illegal_ascii_control_character(ch_byte) { + st := s.state() + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' control character `$ch_byte.hex()` is not allowed ($st.line_nr,$st.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(st.pos, 10)}...') + } + } + + // Check for bad UTF-8 encoding + if !utf8.validate_str(lit) { + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' comment "$lit" is not valid UTF-8 in ...${c.excerpt(cmt.pos)}...') + } +} diff --git a/vlib/toml/parser/parser.v b/vlib/toml/parser/parser.v index 6f9a24137c..0494ae8f0b 100644 --- a/vlib/toml/parser/parser.v +++ b/vlib/toml/parser/parser.v @@ -59,6 +59,9 @@ fn (mut p Parser) run_checker() ? { scanner: p.scanner } chckr.check(p.root_map) ? + for comment in p.ast_root.comments { + chckr.check_comment(comment) ? + } } } @@ -240,8 +243,8 @@ pub fn (mut p Parser) root_table() ? { util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing token "$p.tok.kind" "$p.tok.lit"') match p.tok.kind { .hash { - // TODO table.comments << p.comment() c := p.comment() + p.ast_root.comments << c util.printdbg(@MOD + '.' + @STRUCT + '.' 
+ @FN, 'skipping comment "$c.text"') } //.whitespace, .tab, .nl { diff --git a/vlib/toml/scanner/scanner.v b/vlib/toml/scanner/scanner.v index b35b887160..4196c68386 100644 --- a/vlib/toml/scanner/scanner.v +++ b/vlib/toml/scanner/scanner.v @@ -174,9 +174,7 @@ pub fn (mut s Scanner) scan() ?token.Token { return s.new_token(.quoted, ident_string, ident_string.len) } `#` { - start := s.pos //+ 1 - s.ignore_line() ? - hash := s.text[start..s.pos] + hash := s.ignore_line() ? util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified comment hash "$hash" ($hash.len)') return s.new_token(.hash, hash, hash.len + 1) } @@ -318,18 +316,14 @@ fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token { // ignore_line forwards the scanner to the end of the current line. [direct_array_access; inline] -fn (mut s Scanner) ignore_line() ? { - util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' ignoring until EOL') +fn (mut s Scanner) ignore_line() ?string { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' ignoring until EOL...') + start := s.pos for c := s.at(); c != -1 && c != `\n`; c = s.at() { - // Check for control characters (allow TAB) - if util.is_illegal_ascii_control_character(c) { - return error(@MOD + '.' + @STRUCT + '.' + @FN + - ' control character `$c.hex()` is not allowed ($s.line_nr,$s.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...') - } s.next() util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping "${byte(c).ascii_str()}"') - continue } + return s.text[start..s.pos] } // inc_line_number increases the internal line number. 
diff --git a/vlib/toml/tests/burntsushi.toml-test_test.v b/vlib/toml/tests/burntsushi.toml-test_test.v index 4763824fc9..fbc44bf949 100644 --- a/vlib/toml/tests/burntsushi.toml-test_test.v +++ b/vlib/toml/tests/burntsushi.toml-test_test.v @@ -19,9 +19,6 @@ const ( 'string/basic-out-of-range-unicode-escape-1.toml', 'string/basic-out-of-range-unicode-escape-2.toml', 'string/bad-uni-esc.toml', - // Encoding - 'encoding/bad-utf8-in-comment.toml', - 'encoding/bad-utf8-in-string.toml', // Table 'table/rrbrace.toml', 'table/duplicate-table-array2.toml',