diff --git a/vlib/toml/scanner/scanner.v b/vlib/toml/scanner/scanner.v index bbdf235982..494ce0e08f 100644 --- a/vlib/toml/scanner/scanner.v +++ b/vlib/toml/scanner/scanner.v @@ -19,9 +19,10 @@ pub: config Config text string // the input TOML text mut: - col int // current column number (x coordinate) - line_nr int = 1 // current line number (y coordinate) - pos int // current flat/index position in the `text` field + col int // current column number (x coordinate) + line_nr int = 1 // current line number (y coordinate) + pos int // current flat/index position in the `text` field + header_len int // Length, how many bytes of header were found } // State is a read-only copy of the scanner's internal state. @@ -73,6 +74,8 @@ pub fn new_simple(toml_input string) ?Scanner { // scan returns the next token from the input. [direct_array_access] pub fn (mut s Scanner) scan() ?token.Token { + s.validate_and_skip_headers() ? + for { c := s.next() byte_c := byte(c) @@ -290,19 +293,23 @@ pub fn (mut s Scanner) reset() { s.pos = 0 s.col = 0 s.line_nr = 1 + s.header_len = 0 } // new_token returns a new `token.Token`. [inline] fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token { - // line_offset := 1 // println('new_token($lit)') + mut col := s.col - len + 1 + if s.line_nr == 1 { + col -= s.header_len + } return token.Token{ kind: kind lit: lit - col: mathutil.max(1, s.col - len + 1) - line_nr: s.line_nr + 1 //+ line_offset - pos: s.pos - len + 1 + col: mathutil.max(1, col) + line_nr: s.line_nr + 1 + pos: s.pos - s.header_len - len + 1 len: len } } @@ -605,3 +612,36 @@ pub fn (s Scanner) state() State { pos: s.pos } } + +fn (mut s Scanner) validate_and_skip_headers() ? { + // UTF-16 / UTF-32 headers (BE/LE) + s.check_utf16_or_32_bom() ? + + // NICE-TO-HAVE-TODO Check other types of (UTF-?) headers and yield an error. TOML is UTF-8 only. + + // Skip optional UTF-8 header, if any. 
+ if s.at() == 0xEF && s.peek(1) == 0xBB && s.peek(2) == 0xBF { + util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping UTF-8 byte order mark (BOM)') + s.header_len = 3 + s.skip_n(s.header_len) + } + + // Check after we've skipped UTF-8 BOM + s.check_utf16_or_32_bom() ? +} + +fn (mut s Scanner) check_utf16_or_32_bom() ? { + if (s.at() == 0xFF && s.peek(1) == 0xFE && s.peek(2) == 0x00 && s.peek(3) == 0x00) + || (s.at() == 0x00 && s.peek(1) == 0x00 && s.peek(2) == 0xFE && s.peek(3) == 0xFF) { + s.header_len = 4 + s.skip_n(s.header_len) + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' UTF-32 is not a valid TOML encoding at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...') + } + if (s.at() == 0xFE && s.peek(1) == 0xFF) || (s.at() == 0xFF && s.peek(1) == 0xFE) { + s.header_len = 2 + s.skip_n(s.header_len) + return error(@MOD + '.' + @STRUCT + '.' + @FN + + ' UTF-16 is not a valid TOML encoding at $s.pos ($s.line_nr,$s.col) near ...${s.excerpt(s.pos, 5)}...') + } +} diff --git a/vlib/toml/tests/testdata/toml_with_utf16_bom.toml b/vlib/toml/tests/testdata/toml_with_utf16_bom.toml new file mode 100644 index 0000000000..cc0d8230f0 Binary files /dev/null and b/vlib/toml/tests/testdata/toml_with_utf16_bom.toml differ diff --git a/vlib/toml/tests/testdata/toml_with_utf32_bom.toml b/vlib/toml/tests/testdata/toml_with_utf32_bom.toml new file mode 100644 index 0000000000..d21d754871 Binary files /dev/null and b/vlib/toml/tests/testdata/toml_with_utf32_bom.toml differ diff --git a/vlib/toml/tests/testdata/toml_with_utf8_bom.toml b/vlib/toml/tests/testdata/toml_with_utf8_bom.toml new file mode 100644 index 0000000000..23b51361f8 --- /dev/null +++ b/vlib/toml/tests/testdata/toml_with_utf8_bom.toml @@ -0,0 +1,33 @@ +# This is a TOML document with an UTF-8 BOM header. 
+ +title = "TOML Example" + +[owner] +name = "Tom Preston-Werner" +dob = 1979-05-27T07:32:00-08:00 # First class dates + +[database] +server = "192.168.1.1" +ports = [ 8000, 8001, 8002 ] +connection_max = 5000 +enabled = true + +[servers] + + # Indentation (tabs and/or spaces) is allowed but not required + [servers.alpha] + ip = "10.0.0.1" + dc = "eqdc10" + + [servers.beta] + ip = "10.0.0.2" + dc = "eqdc10" + +[clients] +data = [ ["gamma", "delta"], [1, 2] ] + +# Line breaks are OK when inside arrays +hosts = [ + "alpha", + "omega" +] diff --git a/vlib/toml/tests/toml_bom_test.v b/vlib/toml/tests/toml_bom_test.v new file mode 100644 index 0000000000..7ee81e0e9c --- /dev/null +++ b/vlib/toml/tests/toml_bom_test.v @@ -0,0 +1,49 @@ +import os +import toml +import toml.ast + +const empty_toml_document = toml.Doc{ + ast: &ast.Root(0) +} + +const ( + toml_text_with_utf8_bom = os.read_file(os.real_path(os.join_path(os.dir(@FILE), 'testdata', + 'toml_with_utf8_bom' + '.toml'))) or { panic(err) } + toml_text_with_utf16_bom = os.read_file(os.real_path(os.join_path(os.dir(@FILE), 'testdata', + 'toml_with_utf16_bom' + '.toml'))) or { panic(err) } + toml_text_with_utf32_bom = os.read_file(os.real_path(os.join_path(os.dir(@FILE), 'testdata', + 'toml_with_utf32_bom' + '.toml'))) or { panic(err) } +) + +fn test_toml_with_bom() { + toml_doc := toml.parse(toml_text_with_utf8_bom) or { panic(err) } + toml_json := toml_doc.to_json() + + title := toml_doc.value('title') + assert title == toml.Any('TOML Example') + assert title as string == 'TOML Example' + + owner := toml_doc.value('owner') as map[string]toml.Any + any_name := owner.value('name') or { panic(err) } + assert any_name.string() == 'Tom Preston-Werner' + + database := toml_doc.value('database') as map[string]toml.Any + db_serv := database['server'] or { + panic('could not access "server" index in "database" variable') + } + assert db_serv as string == '192.168.1.1' + + // Re-cycle bad_toml_doc + mut bad_toml_doc := 
empty_toml_document + bad_toml_doc = toml.parse(toml_text_with_utf16_bom) or { + println(' $err.msg') + assert true + empty_toml_document + } + + bad_toml_doc = toml.parse(toml_text_with_utf32_bom) or { + println(' $err.msg') + assert true + empty_toml_document + } +}