From 9a3967bd7d0b1583dcddd7df48b1ab2fc8a181f9 Mon Sep 17 00:00:00 2001
From: Larpon <Larpon@users.noreply.github.com>
Date: Tue, 26 Oct 2021 15:58:05 +0200
Subject: [PATCH] toml: improve comment support (#12305)

---
 vlib/toml/ast/ast.v                         |  3 +-
 vlib/toml/checker/checker.v                 | 40 +++++++++++++++++++--
 vlib/toml/parser/parser.v                   |  5 ++-
 vlib/toml/scanner/scanner.v                 | 16 +++------
 vlib/toml/tests/burntsushi.toml-test_test.v |  3 --
 5 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/vlib/toml/ast/ast.v b/vlib/toml/ast/ast.v
index 4c35ac4f93..c3f9c3fe5d 100644
--- a/vlib/toml/ast/ast.v
+++ b/vlib/toml/ast/ast.v
@@ -11,7 +11,8 @@ pub struct Root {
 pub:
 	input input.Config // User input configuration
 pub mut:
-	table Value
+	comments []Comment
+	table    Value
 	// errors           []errors.Error    // all the checker errors in the file
 }
 
diff --git a/vlib/toml/checker/checker.v b/vlib/toml/checker/checker.v
index 97fea91462..96af385922 100644
--- a/vlib/toml/checker/checker.v
+++ b/vlib/toml/checker/checker.v
@@ -5,9 +5,10 @@ module checker
 
 import toml.ast
 import toml.ast.walker
-// import toml.util
+import toml.util
 import toml.token
 import toml.scanner
+import encoding.utf8
 
 pub const allowed_basic_escape_chars = [`u`, `U`, `b`, `t`, `n`, `f`, `r`, `"`, `\\`]
 
@@ -255,9 +256,10 @@ fn (c Checker) check_quoted(q ast.Quoted) ? {
 	triple_quote := quote + quote + quote
 	if q.is_multiline && lit.ends_with(triple_quote) {
 		return error(@MOD + '.' + @STRUCT + '.' + @FN +
-			' string values like "$lit" is has unbalanced quote literals `q.quote` in ...${c.excerpt(q.pos)}...')
+			' string values like "$lit" has unbalanced quote literals `q.quote` in ...${c.excerpt(q.pos)}...')
 	}
 	c.check_quoted_escapes(q) ?
+	c.check_utf8_validity(q) ?
 }
 
 // check_quoted_escapes returns an error for any disallowed escape sequences.
@@ -314,3 +316,37 @@ fn (c Checker) check_quoted_escapes(q ast.Quoted) ? {
 		}
 	}
 }
+
+// check_utf8_validity returns an error if the text of `q` is not valid UTF-8.
+fn (c Checker) check_utf8_validity(q ast.Quoted) ? {
+	lit := q.text
+	if !utf8.validate_str(lit) {
+		return error(@MOD + '.' + @STRUCT + '.' + @FN +
+			' the string value "$lit" is not valid UTF-8 in ...${c.excerpt(q.pos)}...')
+	}
+}
+
+pub fn (c Checker) check_comment(cmt ast.Comment) ? {
+	lit := cmt.text
+	// Set up a scanner in stack memory for easier navigation.
+	mut s := scanner.new_simple(lit) ?
+	for {
+		ch := s.next()
+		if ch == -1 {
+			break
+		}
+		ch_byte := byte(ch)
+		// Check for control characters (allow TAB)
+		if util.is_illegal_ascii_control_character(ch_byte) {
+			st := s.state()
+			return error(@MOD + '.' + @STRUCT + '.' + @FN +
+				' control character `$ch_byte.hex()` is not allowed ($st.line_nr,$st.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(st.pos, 10)}...')
+		}
+	}
+
+	// Check for bad UTF-8 encoding
+	if !utf8.validate_str(lit) {
+		return error(@MOD + '.' + @STRUCT + '.' + @FN +
+			' comment "$lit" is not valid UTF-8 in ...${c.excerpt(cmt.pos)}...')
+	}
+}
diff --git a/vlib/toml/parser/parser.v b/vlib/toml/parser/parser.v
index 6f9a24137c..0494ae8f0b 100644
--- a/vlib/toml/parser/parser.v
+++ b/vlib/toml/parser/parser.v
@@ -59,6 +59,9 @@ fn (mut p Parser) run_checker() ? {
 			scanner: p.scanner
 		}
 		chckr.check(p.root_map) ?
+		for comment in p.ast_root.comments {
+			chckr.check_comment(comment) ?
+		}
 	}
 }
 
@@ -240,8 +243,8 @@ pub fn (mut p Parser) root_table() ? {
 		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'parsing token "$p.tok.kind" "$p.tok.lit"')
 		match p.tok.kind {
 			.hash {
-				// TODO table.comments << p.comment()
 				c := p.comment()
+				p.ast_root.comments << c
 				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping comment "$c.text"')
 			}
 			//.whitespace, .tab, .nl {
diff --git a/vlib/toml/scanner/scanner.v b/vlib/toml/scanner/scanner.v
index b35b887160..4196c68386 100644
--- a/vlib/toml/scanner/scanner.v
+++ b/vlib/toml/scanner/scanner.v
@@ -174,9 +174,7 @@ pub fn (mut s Scanner) scan() ?token.Token {
 				return s.new_token(.quoted, ident_string, ident_string.len)
 			}
 			`#` {
-				start := s.pos //+ 1
-				s.ignore_line() ?
-				hash := s.text[start..s.pos]
+				hash := s.ignore_line() ?
 				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified comment hash "$hash" ($hash.len)')
 				return s.new_token(.hash, hash, hash.len + 1)
 			}
@@ -318,18 +316,14 @@ fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token {
 
 // ignore_line forwards the scanner to the end of the current line.
 [direct_array_access; inline]
-fn (mut s Scanner) ignore_line() ? {
-	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' ignoring until EOL')
+fn (mut s Scanner) ignore_line() ?string {
+	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' ignoring until EOL...')
+	start := s.pos
 	for c := s.at(); c != -1 && c != `\n`; c = s.at() {
-		// Check for control characters (allow TAB)
-		if util.is_illegal_ascii_control_character(c) {
-			return error(@MOD + '.' + @STRUCT + '.' + @FN +
-				' control character `$c.hex()` is not allowed ($s.line_nr,$s.col) "${byte(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
-		}
 		s.next()
 		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping "${byte(c).ascii_str()}"')
-		continue
 	}
+	return s.text[start..s.pos]
 }
 
 // inc_line_number increases the internal line number.
diff --git a/vlib/toml/tests/burntsushi.toml-test_test.v b/vlib/toml/tests/burntsushi.toml-test_test.v
index 4763824fc9..fbc44bf949 100644
--- a/vlib/toml/tests/burntsushi.toml-test_test.v
+++ b/vlib/toml/tests/burntsushi.toml-test_test.v
@@ -19,9 +19,6 @@ const (
 		'string/basic-out-of-range-unicode-escape-1.toml',
 		'string/basic-out-of-range-unicode-escape-2.toml',
 		'string/bad-uni-esc.toml',
-		// Encoding
-		'encoding/bad-utf8-in-comment.toml',
-		'encoding/bad-utf8-in-string.toml',
 		// Table
 		'table/rrbrace.toml',
 		'table/duplicate-table-array2.toml',