From a987440e2f7f197891411caccf458647ab9d9266 Mon Sep 17 00:00:00 2001 From: Larpon Date: Thu, 28 Oct 2021 15:38:49 +0200 Subject: [PATCH] toml: add UTF header support, add BOM tests (#12326) --- vlib/toml/scanner/scanner.v | 54 +++++++++++++++--- .../tests/testdata/toml_with_utf16_bom.toml | Bin 0 -> 334 bytes .../tests/testdata/toml_with_utf32_bom.toml | Bin 0 -> 336 bytes .../tests/testdata/toml_with_utf8_bom.toml | 33 +++++++++++ vlib/toml/tests/toml_bom_test.v | 49 ++++++++++++++++ 5 files changed, 129 insertions(+), 7 deletions(-) create mode 100644 vlib/toml/tests/testdata/toml_with_utf16_bom.toml create mode 100644 vlib/toml/tests/testdata/toml_with_utf32_bom.toml create mode 100644 vlib/toml/tests/testdata/toml_with_utf8_bom.toml create mode 100644 vlib/toml/tests/toml_bom_test.v diff --git a/vlib/toml/scanner/scanner.v b/vlib/toml/scanner/scanner.v index bbdf235982..494ce0e08f 100644 --- a/vlib/toml/scanner/scanner.v +++ b/vlib/toml/scanner/scanner.v @@ -19,9 +19,10 @@ pub: config Config text string // the input TOML text mut: - col int // current column number (x coordinate) - line_nr int = 1 // current line number (y coordinate) - pos int // current flat/index position in the `text` field + col int // current column number (x coordinate) + line_nr int = 1 // current line number (y coordinate) + pos int // current flat/index position in the `text` field + header_len int // Length, how many bytes of header was found } // State is a read-only copy of the scanner's internal state. @@ -73,6 +74,8 @@ pub fn new_simple(toml_input string) ?Scanner { // scan returns the next token from the input. [direct_array_access] pub fn (mut s Scanner) scan() ?token.Token { + s.validate_and_skip_headers() ? + for { c := s.next() byte_c := byte(c) @@ -290,19 +293,23 @@ pub fn (mut s Scanner) reset() { s.pos = 0 s.col = 0 s.line_nr = 1 + s.header_len = 0 } // new_token returns a new `token.Token`. [inline] fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token { - // line_offset := 1 // println('new_token($lit)') + mut col := s.col - len + 1 + if s.line_nr == 1 { + col -= s.header_len + } return token.Token{ kind: kind lit: lit - col: mathutil.max(1, s.col - len + 1) - line_nr: s.line_nr + 1 //+ line_offset - pos: s.pos - len + 1 + col: mathutil.max(1, col) + line_nr: s.line_nr + 1 + pos: s.pos - s.header_len - len + 1 len: len } } @@ -605,3 +612,36 @@ pub fn (s Scanner) state() State { pos: s.pos } } + +fn (mut s Scanner) validate_and_skip_headers() ? { + // UTF-16 / UTF-32 headers (BE/LE) + s.check_utf16_or_32_bom() ? + + // NICE-TO-HAVE-TODO Check other types of (UTF-?) headers and yield an error. TOML is UTF-8 only. + + // Skip optional UTF-8 heaser, if any. + if s.at() == 0xEF && s.peek(1) == 0xBB && s.peek(2) == 0xBF { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping UTF-8 byte order mark (BOM)') + s.header_len = 3 + s.skip_n(s.header_len) + } + + // Check after we've skipped UTF-8 BOM + s.check_utf16_or_32_bom() ? +} + +fn (mut s Scanner) check_utf16_or_32_bom() ? { + if (s.at() == 0xFF && s.peek(1) == 0xFE && s.peek(2) == 0x00 && s.peek(3) == 0x00) + || (s.at() == 0x00 && s.peek(1) == 0x00 && s.peek(2) == 0xFE && s.peek(3) == 0xFF) { + s.header_len = 4 + s.skip_n(s.header_len) + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' UTF-32 is not a valid TOML encoding at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...') + } + if (s.at() == 0xFE && s.peek(1) == 0xFF) || (s.at() == 0xFF && s.peek(1) == 0xFE) { + s.header_len = 2 + s.skip_n(s.header_len) + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' UTF-16 is not a valid TOML encoding at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...') + } +} diff --git a/vlib/toml/tests/testdata/toml_with_utf16_bom.toml b/vlib/toml/tests/testdata/toml_with_utf16_bom.toml new file mode 100644 index 0000000000000000000000000000000000000000..cc0d8230f03a6dddb995a713e7962bbab74216d0 GIT binary patch literal 334 zcmYk2QAIf5oEDh8ATCy$JeLH>KYW{+rHw9m<4vD@`!|t2ZhVGyApsXZekLyoTbeE0f9# yJ}#bph77!2dfXbX&F10(BW}z_;$C#`FF4UrJ9YPMcqD&gB2! literal 0 HcmV?d00001 diff --git a/vlib/toml/tests/testdata/toml_with_utf32_bom.toml b/vlib/toml/tests/testdata/toml_with_utf32_bom.toml new file mode 100644 index 0000000000000000000000000000000000000000..d21d75487172201383e0f861e8b858589927909e GIT binary patch literal 336 zcmYk2T}y&N5QU#>f5oELh986_bR+0hQ5S|6(S`ZGkf5)j8}{q9XGeorb~t-x&Yamf zpP)lx&L?XQqDx!`_skQIqKX|WzS&aKkY$e6Wi^SVOyboJGr9~J_?l3Z$$V|TGVZyd z<;kJj$Rr-TZFPSxQ+}-8@