// Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module decoder

import toml.ast
import toml.ast.walker
import toml.token
import toml.scanner
import strconv

// Decoder decodes special sequences in a tree of TOML `ast.Value`'s.
pub struct Decoder {
	scanner &scanner.Scanner
}

// decode decodes certain `ast.Value`'s and all its children.
// Any error raised while walking the tree is propagated to the caller.
pub fn (d Decoder) decode(mut n ast.Value) ? {
	walker.walk_and_modify(d, mut n) ?
}

// modify decodes `value` in place when it is an `ast.Quoted`.
// All other kinds of values are left untouched.
// NOTE(review): presumably this is the callback `walker.walk_and_modify`
// invokes for every node - confirm against the walker module.
fn (d Decoder) modify(mut value ast.Value) ? {
	match value {
		ast.Quoted {
			mut v := &(value as ast.Quoted)
			d.decode_quoted(mut v) ?
		}
		else {}
	}
}

// excerpt returns a string of the token's surroundings
fn (d Decoder) excerpt(tp token.Position) string {
	return d.scanner.excerpt(tp.pos, 10)
}

// decode_quoted returns an error if `q` is not a valid quoted TOML string.
fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
	d.decode_quoted_escapes(mut q) ?
}

// decode_quoted_escapes replaces the escape sequences in `q.text` with the
// characters they represent. An error is returned if a Unicode escape
// sequence can not be decoded.
// Delimiters in TOML have significant meaning:
// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get)
// "/""" delimits *basic* strings
// Only *basic* strings are decoded, *literal* strings are left untouched.
// Allowed escapes in *basic* strings are:
// \b         - backspace       (U+0008)
// \t         - tab             (U+0009)
// \n         - linefeed        (U+000A)
// \f         - form feed       (U+000C)
// \r         - carriage return (U+000D)
// \"         - quote           (U+0022)
// \\         - backslash       (U+005C)
// \uXXXX     - Unicode         (U+XXXX)
// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
// Any other character following a backslash is kept as-is.
fn (d Decoder) decode_quoted_escapes(mut q ast.Quoted) ? {
	// See https://toml.io/en/v1.0.0#string for more info on string types.
	is_basic := q.quote == `\"`
	if !is_basic {
		// Literal strings must not be altered in any way.
		return
	}

	// Setup a scanner in stack memory for easier navigation.
	mut s := scanner.new_simple(q.text) ?

	// TODO use string builder
	mut decoded_s := ''
	for {
		ch := s.next()
		if ch == scanner.end_of_text {
			break
		}
		ch_byte := byte(ch)

		if ch == `\\` {
			// `s.at()` peeks at the character following the backslash,
			// `s.next()` consumes it once it has been decoded.
			ch_next := byte(s.at())

			if ch_next == `\\` {
				decoded_s += ch_next.ascii_str()
				s.next()
				continue
			}

			if ch_next == `"` {
				decoded_s += '"'
				s.next()
				continue
			}

			if ch_next == `n` {
				decoded_s += '\n'
				s.next()
				continue
			}

			if ch_next == `t` {
				decoded_s += '\t'
				s.next()
				continue
			}

			if ch_next == `r` {
				decoded_s += '\r'
				s.next()
				continue
			}

			if ch_next == `b` {
				// Backspace (U+0008)
				decoded_s += byte(0x08).ascii_str()
				s.next()
				continue
			}

			if ch_next == `f` {
				// Form feed (U+000C)
				decoded_s += byte(0x0C).ascii_str()
				s.next()
				continue
			}

			// Decode unicode escapes
			if ch_next == `u` || ch_next == `U` {
				// The short type (\uXXXX) has 4 hex digits,
				// the long type (\UXXXXXXXX) has 8 hex digits.
				hex_digits_len := if ch_next == `U` { 8 } else { 4 }
				// `pos` is the position of the `u`/`U`.
				pos := s.state().pos
				// Pass on the `u`/`U` and the hex digits only. The end is
				// clamped so a truncated sequence never slices out of bounds,
				// `decode_unicode_escape` reports it as an error instead.
				mut end := pos + 1 + hex_digits_len
				if end > s.text.len {
					end = s.text.len
				}
				decoded := d.decode_unicode_escape(s.text[pos..end]) or {
					st := s.state()
					return error(@MOD + '.' + @STRUCT + '.' + @FN +
						' escaped Unicode is invalid. $err.msg.capitalize() ($st.line_nr,$st.col) in ...${d.excerpt(q.pos)}...')
				}
				decoded_s += decoded
				// Skip exactly the `u`/`U` and its hex digits so that the text
				// following the escape sequence is preserved.
				s.skip_n(1 + hex_digits_len)
				continue
			}
		}
		decoded_s += ch_byte.ascii_str()
	}
	q.text = decoded_s
}

// decode_unicode_escape returns the character that `esc_unicode` encodes.
// `esc_unicode` is expected to be prefixed with either `u` (followed by
// 4 hex digits) or `U` (followed by 8 hex digits). Characters after the
// expected digits are ignored.
// An error is returned if `esc_unicode` is too short to hold the expected
// digits, or if the digits can not be parsed as a hexadecimal number.
fn (d Decoder) decode_unicode_escape(esc_unicode string) ?string {
	is_long_esc_type := esc_unicode.starts_with('U')
	hex_digits_len := if is_long_esc_type { 8 } else { 4 }
	// Guard the slice below, a short sequence is an error - not a panic.
	if esc_unicode.len < 1 + hex_digits_len {
		return error('expected $hex_digits_len hexadecimal digits in the escape sequence')
	}
	sequence := esc_unicode[1..1 + hex_digits_len]
	mut unicode_point := sequence
	if unicode_point.len < 8 {
		// Left-pad the short type with zeros to the width of the long type.
		unicode_point = '0'.repeat(8 - unicode_point.len) + unicode_point
	}
	rn := rune(strconv.parse_int(unicode_point, 16, 0) ?)
	return '$rn'
}