2021-11-20 20:48:44 +03:00
|
|
|
// Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
|
|
|
|
// Use of this source code is governed by an MIT license
|
|
|
|
// that can be found in the LICENSE file.
|
|
|
|
module decoder
|
|
|
|
|
|
|
|
import toml.ast
|
|
|
|
import toml.ast.walker
|
|
|
|
import toml.token
|
|
|
|
import toml.scanner
|
|
|
|
import strconv
|
|
|
|
|
2021-11-23 12:02:43 +03:00
|
|
|
// utf8_max is the largest value (inclusive) of the Unicode scalar value ranges.
const utf8_max = 0x10FFFF
|
|
|
|
|
2021-11-20 20:48:44 +03:00
|
|
|
// Decoder decodes special sequences in a tree of TOML `ast.Value`s.
pub struct Decoder {
	// scanner is used by `excerpt` to show the text surrounding a token position.
	// It defaults to `nil`.
	scanner &scanner.Scanner = unsafe { nil }
}
|
|
|
|
|
|
|
|
// decode walks `n` and all of its children, decoding every value
// that `modify` knows how to handle. Errors raised during the walk
// are propagated to the caller.
pub fn (d Decoder) decode(mut n ast.Value) ? {
	walker.walk_and_modify(d, mut n)?
}
|
|
|
|
|
|
|
|
// modify decodes `value` in place when it is a quoted string, a number
// or a date-time. Every other kind of value is left untouched.
fn (d Decoder) modify(mut value ast.Value) ? {
	match value {
		ast.Quoted {
			mut quoted := &(value as ast.Quoted)
			d.decode_quoted(mut quoted)?
		}
		ast.Number {
			mut number := &(value as ast.Number)
			d.decode_number(mut number)?
		}
		ast.DateTime {
			mut date_time := &(value as ast.DateTime)
			d.decode_date_time(mut date_time)?
		}
		else {}
	}
}
|
|
|
|
|
|
|
|
// excerpt returns a string of the surroundings of the token position `tp`,
// as produced by the decoder's scanner.
fn (d Decoder) excerpt(tp token.Pos) string {
	// Second argument forwarded to the scanner's `excerpt`.
	margin := 10
	return d.scanner.excerpt(tp.pos, margin)
}
|
|
|
|
|
|
|
|
// decode_quoted decodes the escape sequences of the quoted TOML string `q`
// in place by delegating to `decode_quoted_escapes`, forwarding any error.
fn (d Decoder) decode_quoted(mut q ast.Quoted) ? {
	decode_quoted_escapes(mut q)?
}
|
|
|
|
|
2021-11-23 17:23:16 +03:00
|
|
|
// decode_number normalizes the text of `n`:
// the signed spellings `-nan` and `+nan` are both rewritten to plain `nan`.
// Any other number text is left as it is.
fn (d Decoder) decode_number(mut n ast.Number) ? {
	if n.text in ['-nan', '+nan'] {
		n.text = 'nan'
	}
}
|
|
|
|
|
2021-11-20 20:48:44 +03:00
|
|
|
// decode_quoted_escapes decodes the escape sequences of a *basic* TOML string
// in place, storing the decoded result in `q.text`.
// Delimiters in TOML have significant meaning:
// '/''' delimits *literal* strings (WYSIWYG / What-you-see-is-what-you-get),
// these are left untouched.
// "/""" delimits *basic* strings
// Allowed escapes in *basic* strings are:
// \b         - backspace       (U+0008)
// \t         - tab             (U+0009)
// \n         - linefeed        (U+000A)
// \f         - form feed       (U+000C)
// \r         - carriage return (U+000D)
// \"         - quote           (U+0022)
// \\         - backslash       (U+005C)
// \uXXXX     - Unicode         (U+XXXX)
// \UXXXXXXXX - Unicode         (U+XXXXXXXX)
// In multi-line strings a backslash that is followed by whitespace is dropped
// together with all whitespace up to the next non-whitespace character.
// Unicode escapes that are malformed, or that do not name a Unicode scalar
// value, are kept verbatim in the output. Any other unknown escape is also
// copied verbatim.
// An error is only returned if the helper scanner could not be created.
pub fn decode_quoted_escapes(mut q ast.Quoted) ? {
	// See https://toml.io/en/v1.0.0#string for more info on string types.
	is_basic := q.quote == `\"`
	if !is_basic {
		return
	}

	// Scan the raw text with a simple scanner for easier navigation.
	mut s := scanner.new_simple_text(q.text)?

	mut eat_whitespace := false
	// TODO use string builder
	mut decoded_s := ''

	for {
		ch := s.next()
		if ch == scanner.end_of_text {
			break
		}
		ch_byte := u8(ch)

		if eat_whitespace && ch_byte.is_space() {
			continue
		}
		eat_whitespace = false

		if ch == `\\` {
			// `at` only looks at the character after the backslash, it is
			// consumed explicitly with `s.next()` once the escape is accepted.
			ch_next := s.at()
			ch_next_byte := u8(ch_next)

			if ch_next == `\\` {
				decoded_s += ch_next_byte.ascii_str()
				s.next()
				continue
			}

			// A "line ending backslash": drop it and trim the whitespace that follows.
			if q.is_multiline && ch_next_byte.is_space() {
				eat_whitespace = true
				continue
			}

			if ch_next == `"` {
				decoded_s += '"'
				s.next()
				continue
			}

			if ch_next == `n` {
				decoded_s += '\n'
				s.next()
				continue
			}

			if ch_next == `t` {
				decoded_s += '\t'
				s.next()
				continue
			}

			if ch_next == `b` {
				decoded_s += '\b'
				s.next()
				continue
			}

			if ch_next == `r` {
				decoded_s += '\r'
				s.next()
				continue
			}

			if ch_next == `f` {
				decoded_s += '\f'
				s.next()
				continue
			}

			// Decode unicode escapes
			if ch_next == `u` || ch_next == `U` {
				// `\uXXXX` takes exactly 4 hex digits, `\UXXXXXXXX` exactly 8.
				hex_digits_len := if ch_next == `U` { 8 } else { 4 }
				// The sequence handed to `decode_unicode_escape` starts at the `u`/`U`.
				pos := s.state().pos
				end := pos + 1 + hex_digits_len
				// Never slice past the end of the text.
				mut is_valid := end <= s.text.len
				if is_valid {
					for i in 1 .. hex_digits_len + 1 {
						if !u8(s.peek(i)).is_hex_digit() {
							is_valid = false
							break
						}
					}
				}
				if is_valid {
					mut decoded := ''
					mut unicode_val := 0
					mut sequence_length := 0
					decoded, unicode_val, sequence_length = decode_unicode_escape(s.text[pos..end]) or {
						// Keep the escape verbatim: emit only the backslash here,
						// the following characters are copied by the next iterations.
						decoded_s += ch_byte.ascii_str()
						continue
					}
					// Check if the Unicode value is actually in the valid Unicode scalar value ranges.
					is_scalar := (unicode_val >= 0x0000 && unicode_val <= 0xD7FF)
						|| (unicode_val >= 0xE000 && unicode_val <= decoder.utf8_max)
					if !is_scalar {
						decoded_s += ch_byte.ascii_str()
						continue
					}
					decoded_s += decoded
					// Skip the `u`/`U` plus the hex digits that were decoded.
					s.skip_n(sequence_length + 1)
					continue
				}
			}
		}
		decoded_s += ch_byte.ascii_str()
	}
	q.text = decoded_s
}
|
|
|
|
|
2021-11-24 15:49:23 +03:00
|
|
|
// decode_unicode_escape decodes the Unicode escape sequence `esc_unicode`.
// The sequence is expected to be prefixed with either `u` (followed by 4 hex digits)
// or `U` (followed by 8 hex digits). Characters after the digits are ignored.
// decode_unicode_escape returns the decoded rune as a string,
// its integer value and the number of hex digits that were decoded.
// An error is returned if `esc_unicode` is too short to hold the expected
// digits, or if the digits can not be parsed as a hexadecimal number.
fn decode_unicode_escape(esc_unicode string) ?(string, int, int) {
	is_long_esc_type := esc_unicode.starts_with('U')
	hex_digits_len := if is_long_esc_type { 8 } else { 4 }
	// Slicing past the end of a string is a runtime error, so truncated
	// (or empty) input is rejected up front instead.
	if esc_unicode.len < hex_digits_len + 1 {
		return error('unicode escape sequence `${esc_unicode}` is too short, expected ${hex_digits_len} hex digits after the prefix')
	}
	// Drop the `u`/`U` prefix and anything after the digits.
	hex_digits := esc_unicode[1..hex_digits_len + 1]
	i64_val := strconv.parse_int(hex_digits, 16, 0)?
	rn := rune(i64_val)
	return '${rn}', int(i64_val), hex_digits_len
}
|
2021-11-24 15:49:23 +03:00
|
|
|
|
|
|
|
// decode_date_time decodes the `dt ast.DateTime`.
// When the text has a fractional part (after the first `.`) of at most one
// character, that part is right-padded with zeros to four characters.
// A `Z` suffix and a `+`/`-` offset found after the `.` are preserved.
// Texts without a `.`, or with a longer fractional part, are left untouched.
fn (d Decoder) decode_date_time(mut dt ast.DateTime) ? {
	if !dt.text.contains('.') {
		return
	}
	date_and_time := dt.text.all_before('.')
	tail := dt.text.all_after('.')

	// Split an eventual offset from the fractional part.
	mut fraction := tail
	mut utc_offset := ''
	if tail.contains('+') {
		utc_offset = '+' + tail.all_after('+')
		fraction = tail.all_before('+')
	} else if tail.contains('-') {
		utc_offset = '-' + tail.all_after('-')
		fraction = tail.all_before('-')
	}

	zulu := if tail.contains('Z') { 'Z' } else { '' }
	if zulu != '' {
		fraction = fraction.replace('Z', '')
	}

	// Only expand fractional parts that are a single char (or empty).
	if fraction.len > 1 {
		return
	}
	padded := fraction + '0'.repeat(4 - fraction.len) + zulu
	dt.text = date_and_time + '.' + padded + utc_offset
}
|