1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00
v/vlib/toml/scanner/scanner.v

691 lines
22 KiB
V
Raw Normal View History

2021-09-24 21:13:52 +03:00
// Copyright (c) 2021 Lars Pontoppidan. All rights reserved.
// Use of this source code is governed by an MIT license
// that can be found in the LICENSE file.
module scanner
import toml.input
import toml.token
import toml.util
2021-11-20 20:48:44 +03:00
// digit_extras lists the non-digit characters that `extract_number`
// accepts as part of a number literal.
pub const digit_extras = [`_`, `.`, `x`, `o`, `b`, `e`, `E`]

// end_of_text is the sentinel value returned by `next`, `at` and `peek`
// when there is no character to return.
pub const end_of_text = u32(~0)
2021-09-24 21:13:52 +03:00
// Scanner holds the fields describing the state of a scan process.
// The task the scanner performs is also referred to as "lexing" or "tokenizing".
// The Scanner methods are based on much of the work in `vlib/strings/textscanner`.
pub struct Scanner {
pub:
	// the configuration the scanner was created with
	config Config
	// the input TOML text
	text string
mut:
	// current column number (x coordinate)
	col int
	// current line number (y coordinate)
	line_nr int = 1
	// current flat/index position in the `text` field
	pos int
	// length, in bytes, of the header that was found
	header_len int
	// Quirks
	// indicates if the scanner is on the *left* side of an assignment
	is_left_of_assign bool = true
}
2021-09-29 19:28:09 +03:00
// State is a read-only copy of the scanner's internal state.
// See also `Scanner.state()`.
pub struct State {
pub:
	// current column number (x coordinate)
	col int
	// current line number (y coordinate)
	line_nr int = 1
	// current flat/index position in the `text` field
	pos int
}
// Config is used to configure a Scanner instance.
// Only one of the fields `text` and `file_path` (of `input`) is allowed
// to be set at time of configuration.
pub struct Config {
pub:
	// where the TOML text is read from
	input input.Config
	// if true, generate tokens for `\n`, ` `, `\t`, `\r` etc.
	tokenize_formatting bool = true
}
// new_scanner returns a new *heap* allocated `Scanner` instance, based on the file
// in config.input.file_path, or based on the text in config.input.text.
// An error from reading the input is propagated to the caller.
pub fn new_scanner(config Config) !&Scanner {
	toml_text := config.input.read_input()!
	return &Scanner{
		config: config
		text: toml_text
	}
}
// new_simple returns a new *stack* allocated `Scanner` instance.
// An error from reading the configured input is propagated to the caller.
pub fn new_simple(config Config) !Scanner {
	toml_text := config.input.read_input()!
	return Scanner{
		config: config
		text: toml_text
	}
}
// new_simple_text returns a new *stack* allocated `Scanner` instance
// ready for parsing TOML in `text`.
pub fn new_simple_text(text string) !Scanner {
	config := Config{
		input: input.Config{
			text: text
		}
	}
	toml_text := config.input.read_input()!
	return Scanner{
		config: config
		text: toml_text
	}
}
// new_simple_file returns a new *stack* allocated `Scanner` instance
// ready for parsing TOML in file read from `path`.
pub fn new_simple_file(path string) !Scanner {
	config := Config{
		input: input.Config{
			file_path: path
		}
	}
	toml_text := config.input.read_input()!
	return Scanner{
		config: config
		text: toml_text
	}
}
2021-09-24 21:13:52 +03:00
// scan returns the next token from the input.
// Headers (byte order marks) are validated/skipped first, then characters are
// consumed until a token can be produced:
// - `.eof` when the end of the text is reached,
// - `.number` for numbers, including `(+/-)nan` and `(+/-)inf` on the right side of an assignment,
// - `.boolean` for `true`/`false` on the right side of an assignment, `.bare` for other keys,
// - formatting tokens (`.whitespace`, `.tab`, `.cr`, `.nl`) when `config.tokenize_formatting` is set,
// - `.quoted`, `.hash` and the single character punctuation tokens.
// scan returns an error if header validation fails, if a NULL byte is met,
// if one of the extract helpers fails or if a character can not be scanned.
[direct_array_access]
pub fn (mut s Scanner) scan() !token.Token {
	s.validate_and_skip_headers()!

	for {
		c := s.next()
		byte_c := u8(c)
		if c == scanner.end_of_text {
			s.inc_line_number()
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'reached EOF')
			return s.new_token(.eof, '', 1)
		}

		ascii := byte_c.ascii_str()
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'current char "${ascii}"')

		if byte_c == u8(0x0) {
			// Build the message *before* resetting the scanner, otherwise the
			// reported location and excerpt would always be the start of the text.
			msg := @MOD + '.' + @STRUCT + '.' + @FN +
				' NULL control character `${c.hex()}` is not allowed at (${s.line_nr},${s.col}) "${ascii}" near ...${s.excerpt(s.pos, 5)}...'
			s.reset()
			return error(msg)
		}

		is_sign := c == `+` || c == `-`

		// (+/-)nan & (+/-)inf
		peek_1 := s.peek(1)
		peek_2 := s.peek(2)
		is_nan := c == `n` && s.at() == `a` && peek_1 == `n`
		is_inf := !is_nan && c == `i` && s.at() == `n` && peek_1 == `f`
		is_signed_nan := is_sign && s.at() == `n` && peek_1 == `a` && peek_2 == `n`
		is_signed_inf := !is_signed_nan && is_sign && s.at() == `i` && peek_1 == `n`
			&& peek_2 == `f`
		if !s.is_left_of_assign && (is_nan || is_inf || is_signed_nan || is_signed_inf) {
			num := s.extract_nan_or_inf_number()!
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified a special number "${num}" (${num.len})')
			return s.new_token(.number, num, num.len)
		}

		// A sign only starts a number if it is followed by a digit and
		// the character before the sign is not a digit.
		is_signed_number := is_sign && u8(s.at()).is_digit() && !u8(s.peek(-1)).is_digit()
		is_digit := byte_c.is_digit()
		if is_digit || is_signed_number {
			num := s.extract_number()!
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified a number "${num}" (${num.len})')
			return s.new_token(.number, num, num.len)
		}

		if util.is_key_char(byte_c) {
			key := s.extract_key()
			if !s.is_left_of_assign && (key == 'true' || key == 'false') {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified a boolean "${key}" (${key.len})')
				return s.new_token(.boolean, key, key.len)
			}
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified a bare key "${key}" (${key.len})')
			return s.new_token(.bare, key, key.len)
		}

		match rune(c) {
			` `, `\t`, `\n`, `\r` {
				if c == `\n` {
					s.inc_line_number()
					util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'incremented line nr to ${s.line_nr}')
				}
				// Date-Time in RFC 3339 is allowed to have a space between the date and time in supplement to the 'T'
				// so we allow space characters to slip through to the parser if the space is between two digits...
				if c == ` ` && u8(s.peek(-1)).is_digit() && u8(s.at()).is_digit() {
					util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified, what could be, a space between a RFC 3339 date and time ("${ascii}") (${ascii.len})')
					return s.new_token(token.Kind.whitespace, ascii, ascii.len)
				}
				if s.config.tokenize_formatting {
					mut kind := token.Kind.whitespace
					if c == `\t` {
						kind = token.Kind.tab
					} else if c == `\r` {
						kind = token.Kind.cr
					} else if c == `\n` {
						kind = token.Kind.nl
					}
					util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified formatting character ("${ascii}") (${ascii.len})')
					return s.new_token(kind, ascii, ascii.len)
				} else {
					util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping " ", "\\t" or "\\n" ("${ascii}") (${ascii.len})')
				}
				continue
			}
			`-` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified minus "${ascii}" (${ascii.len})')
				return s.new_token(.minus, ascii, ascii.len)
			}
			`_` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified underscore "${ascii}" (${ascii.len})')
				return s.new_token(.underscore, ascii, ascii.len)
			}
			`+` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified plus "${ascii}" (${ascii.len})')
				return s.new_token(.plus, ascii, ascii.len)
			}
			`=` {
				// Everything that follows on this line is on the right side of the assignment.
				s.is_left_of_assign = false
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified assignment "${ascii}" (${ascii.len})')
				return s.new_token(.assign, ascii, ascii.len)
			}
			`"`, `'` { // ... some string "/'
				ident_string := s.extract_string()!
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified quoted string `${ident_string}`')
				return s.new_token(.quoted, ident_string, ident_string.len)
			}
			`#` {
				hash := s.ignore_line()!
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified comment hash "${hash}" (${hash.len})')
				return s.new_token(.hash, hash, hash.len + 1)
			}
			`{` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified left curly bracket "${ascii}" (${ascii.len})')
				return s.new_token(.lcbr, ascii, ascii.len)
			}
			`}` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified right curly bracket "${ascii}" (${ascii.len})')
				return s.new_token(.rcbr, ascii, ascii.len)
			}
			`[` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified left square bracket "${ascii}" (${ascii.len})')
				return s.new_token(.lsbr, ascii, ascii.len)
			}
			`]` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified right square bracket "${ascii}" (${ascii.len})')
				return s.new_token(.rsbr, ascii, ascii.len)
			}
			`:` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified colon "${ascii}" (${ascii.len})')
				return s.new_token(.colon, ascii, ascii.len)
			}
			`,` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified comma "${ascii}" (${ascii.len})')
				return s.new_token(.comma, ascii, ascii.len)
			}
			`.` {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified period "${ascii}" (${ascii.len})')
				return s.new_token(.period, ascii, ascii.len)
			}
			else {
				return error(@MOD + '.' + @STRUCT + '.' + @FN +
					' could not scan character `${ascii}` / ${c} at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
			}
		}
	}
	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'unknown character code at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
	return s.new_token(.unknown, '', 0)
}
// free releases the memory held by the scanner's `text` field.
[unsafe]
pub fn (mut s Scanner) free() {
	unsafe { s.text.free() }
}
// remaining returns the number of bytes between the current
// position and the end of the text input.
[inline]
pub fn (s &Scanner) remaining() int {
	total := s.text.len
	return total - s.pos
}
// next returns the character code at the current position of the input text
// and advances the position and column by one.
// next returns `end_of_text`, and leaves the state untouched, if the position
// is at or beyond the end of the text.
[direct_array_access; inline]
pub fn (mut s Scanner) next() u32 {
	if s.pos >= s.text.len {
		return scanner.end_of_text
	}
	ch := s.text[s.pos]
	s.pos++
	s.col++
	return ch
}
// skip skips one character ahead.
// Nothing happens if the position after the skip would be at or beyond
// the end of the text.
[inline]
pub fn (mut s Scanner) skip() {
	if s.pos + 1 >= s.text.len {
		return
	}
	s.pos++
	s.col++
}
// skip_n skips ahead `n` characters.
// If the skip goes out of bounds from the length of `Scanner.text`,
// the scanner position is clamped to the length of the text.
// The column is set to the resulting position.
[inline]
pub fn (mut s Scanner) skip_n(n int) {
	target := s.pos + n
	s.pos = if target > s.text.len { s.text.len } else { target }
	s.col = s.pos
}
// at returns the *current* character code from the input text.
// at returns `end_of_text` if the position is at or beyond the end of the text.
// Unlike `next()`, `at()` does not change the state of the scanner.
[direct_array_access; inline]
pub fn (s &Scanner) at() u32 {
	if s.pos >= s.text.len {
		return scanner.end_of_text
	}
	return s.text[s.pos]
}
// at_crlf returns `true` if the scanner is at a `\r` character
// and the character after it is a `\n`.
fn (s Scanner) at_crlf() bool {
	if s.at() != `\r` {
		return false
	}
	return s.peek(1) == `\n`
}
2021-09-24 21:13:52 +03:00
// peek returns the character code from the input text at position + `n`.
// peek returns `end_of_text` if position + `n` is outside of the text, that is
// at or beyond the end *or* before the start. The function uses
// `[direct_array_access]`, so a negative index must never reach `s.text[...]`.
// Peeking back (`n <= 0`) is needed for spaces between date and time in
// RFC 3339 format: in that case the character at position + `n` - 1 is
// returned when that index is valid. Since `next()` has already advanced
// the position past the character it returned, `peek(-1)` called right after
// `next()` yields the character *before* the one `next()` returned.
[direct_array_access; inline]
pub fn (s &Scanner) peek(n int) u32 {
	idx := s.pos + n
	if idx < 0 || idx >= s.text.len {
		return scanner.end_of_text
	}
	// Allow peeking back
	if n < 1 && idx >= 1 {
		return s.text[idx - 1]
	}
	return s.text[idx]
}
// reset puts the position, column, line number and header length
// of the scanner back to their initial values.
// The `is_left_of_assign` flag is left as it is.
pub fn (mut s Scanner) reset() {
	s.header_len = 0
	s.line_nr = 1
	s.col = 0
	s.pos = 0
}
// new_token returns a new `token.Token` of `kind` with the literal `lit`.
// The token column and position are derived from the scanner's current
// column/position minus `len`; the header length is subtracted from the
// position, and from the column when the scanner is on line 1.
// The column is never reported as less than 1.
[inline]
fn (mut s Scanner) new_token(kind token.Kind, lit string, len int) token.Token {
	header_shift := if s.line_nr == 1 { s.header_len } else { 0 }
	raw_col := s.col - len + 1 - header_shift
	return token.Token{
		kind: kind
		lit: lit
		col: if raw_col < 1 { 1 } else { raw_col }
		line_nr: s.line_nr + 1
		pos: s.pos - s.header_len - len + 1
		len: len
	}
}
// ignore_line forwards the scanner to the end of the current line and
// returns the text that was skipped.
// The scan stops in front of a `\n`, in front of a `\r\n` pair or at the
// end of the text; the line ending itself is not consumed.
[direct_array_access; inline]
fn (mut s Scanner) ignore_line() !string {
	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, ' ignoring until EOL...')
	start := s.pos
	for {
		c := s.at()
		if c == scanner.end_of_text || c == `\n` {
			break
		}
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping "${u8(c).ascii_str()} / ${c}"')
		if s.at_crlf() {
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'letting `\\r\\n` slip through')
			break
		}
		s.next()
	}
	return s.text[start..s.pos]
}
// inc_line_number increases the internal line number by one, resets the
// column to 0 and marks the scanner as being on the left side of an
// assignment again.
[inline]
fn (mut s Scanner) inc_line_number() {
	s.line_nr++
	s.col = 0
	s.is_left_of_assign = true
}
// extract_key parses and returns a TOML key as a string.
// The scanner is rewound one character first, since the first character
// of the key has already been consumed by the caller. The key ends at the
// first character that is not a key character, a digit, `_` or `-`.
[direct_array_access; inline]
fn (mut s Scanner) extract_key() string {
	s.pos--
	s.col--
	start := s.pos
	for s.pos < s.text.len {
		c := u8(s.at())
		is_part_of_key := util.is_key_char(c) || c.is_digit() || c in [`_`, `-`]
		if !is_part_of_key {
			break
		}
		s.pos++
		s.col++
	}
	return s.text[start..s.pos]
}
// extract_string collects and returns a string containing
// any bytes recognized as a TOML string.
// TOML strings are everything found between two double or single quotation marks (`"`/`'`).
// The returned string includes the surrounding quotes.
// If the opening quote is followed by two more quotes of the same kind the
// work is handed over to `extract_multiline_string`.
// extract_string returns an error if the string is not terminated before
// the end of the line/text or if it contains an illegal control character.
[direct_array_access; inline]
fn (mut s Scanner) extract_string() !string {
	// extract_string is called when the scanner has already reached
	// a byte that is the start of a string so we rewind it to start at the correct
	// position.
	s.pos--
	s.col--
	quote := u8(s.at())
	start := s.pos
	mut lit := quote.ascii_str()

	// `peek` is bounds checked and returns `end_of_text` past the end of the
	// text. Indexing `s.text` directly here would read out of bounds
	// (the function uses `[direct_array_access]`) when the opening quote is
	// one of the last two bytes of the input.
	is_multiline := s.peek(1) == quote && s.peek(2) == quote
	// Check for escaped multiline quote
	if is_multiline {
		mls := s.extract_multiline_string()!
		return mls
	}

	for {
		s.pos++
		s.col++

		if s.pos >= s.text.len {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' unfinished single-line string literal `${quote.ascii_str()}` started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		c := u8(s.at())
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `${c.ascii_str()}` / ${c} (quote type: ${quote}/${quote.ascii_str()})')

		// Check for escaped chars
		if c == u8(92) {
			esc, skip := s.handle_escapes(quote, is_multiline)
			lit += esc
			if skip > 0 {
				s.pos += skip
				s.col += skip
				continue
			}
		}
		// Check for control characters (allow TAB)
		if util.is_illegal_ascii_control_character(c) {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' control character `${c.hex()}` is not allowed at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		if c == quote {
			// Step past the closing quote before returning.
			s.pos++
			s.col++
			return lit + quote.ascii_str()
		}

		lit += c.ascii_str()

		// Don't eat multiple lines in single-line mode
		if lit.contains('\n') {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' unfinished single-line string literal `${quote.ascii_str()}` started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}
	}
	return lit
}
// extract_multiline_string collects and returns a string containing
// any bytes recognized as a TOML multi-line string.
// TOML multi-line strings are everything found between two sets of three
// double or single quotation marks (`"""`/`'''`).
// The returned string includes the surrounding quotes.
// An error is returned if the text ends before the string is terminated
// or if the string contains an illegal control character.
[direct_array_access; inline]
fn (mut s Scanner) extract_multiline_string() !string {
	// extract_multiline_string is called from extract_string so we know the 3 first
	// characters is the quotes
	quote := u8(s.at())
	start := s.pos
	quote_str := quote.ascii_str()
	triple_quote := quote_str + quote_str + quote_str
	mut lit := triple_quote

	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'multi-line `${quote.ascii_str()}${s.text[s.pos + 1].ascii_str()}${s.text[s.pos + 2].ascii_str()}` string started at pos ${start} (${s.line_nr},${s.col}) (quote type: ${quote.ascii_str()} / ${quote})')

	// Step over the two remaining opening quotes, the loop steps once more.
	s.pos += 2
	s.col += 2

	for {
		s.pos++
		s.col++

		if s.pos >= s.text.len {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' unfinished multi-line string literal (${quote.ascii_str()}${quote.ascii_str()}${quote.ascii_str()}) started at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		c := u8(s.at())
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `${c.ascii_str()}` / ${c} (quote type: ${quote}/${quote.ascii_str()})')

		if c == `\n` {
			s.inc_line_number()
			lit += c.ascii_str()
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'c: `\\n` / ${c}')
			continue
		}
		// Check for escaped chars
		if c == u8(92) {
			esc, skip := s.handle_escapes(quote, true)
			lit += esc
			if skip > 0 {
				s.pos += skip
				s.col += skip
				continue
			}
		}
		// Check for control characters (allow TAB)
		if util.is_illegal_ascii_control_character(c) {
			return error(@MOD + '.' + @STRUCT + '.' + @FN +
				' control character `${c.hex()}` is not allowed at ${start} (${s.line_nr},${s.col}) "${u8(s.at()).ascii_str()}" near ...${s.excerpt(s.pos, 5)}...')
		}

		// Three quotes terminate the string unless a fourth quote follows.
		// `peek` returns `end_of_text` at the end of the input, which never
		// equals a quote byte, so the end of the text also terminates.
		if c == quote && s.peek(1) == quote && s.peek(2) == quote && s.peek(3) != quote {
			s.pos += 3
			s.col += 3
			lit += triple_quote
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'returning at ${c.ascii_str()} `${lit}`')
			return lit
		}
		lit += c.ascii_str()
	}
	return lit
}
2021-09-29 19:28:09 +03:00
// handle_escapes returns any escape character sequence found at the current
// position, together with the number of extra bytes the caller should skip.
// A skip count of 0 with an empty string means nothing was gulped.
// `quote` is the quote type of the surrounding string, a `'` marks a literal string.
// For escape sequence validation see `Checker.check_quoted_escapes`.
fn (mut s Scanner) handle_escapes(quote u8, is_multiline bool) (string, int) {
	c := u8(s.at())
	mut lit := c.ascii_str()
	if quote == `'` {
		// Literal string
		if s.peek(1) == quote {
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'ignore escape `${lit}${u8(s.peek(1)).ascii_str()}` in literal string')
			return '', 0
		}
	} else {
		is_unicode_escape := s.peek(1) == `u` && u8(s.peek(2)).is_hex_digit()
			&& u8(s.peek(3)).is_hex_digit() && u8(s.peek(4)).is_hex_digit()
			&& u8(s.peek(5)).is_hex_digit()
		if is_unicode_escape {
			// Gulp the `u` and the four hex digits
			lit += s.text[s.pos + 1..s.pos + 6]
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped unicode `${lit}`')
			return lit, 5
		} else if s.peek(1) == quote {
			ends_single_line := !is_multiline && s.peek(2) == `\n`
			ends_multi_line := is_multiline && s.peek(2) == quote && s.peek(3) == quote
				&& s.peek(4) == `\n`
			if ends_single_line || ends_multi_line {
				util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'ignore special case escaped `${lit}` at end of string')
				return '', 0
			}
			lit += quote.ascii_str()
			util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `${lit}`')
			return lit, 1
		}
	}
	// Any other sequence: gulp the current character and the one following it.
	lit += u8(s.peek(1)).ascii_str()
	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'gulp escaped `${lit}`')
	return lit, 1
}
// extract_number collects and returns a string containing
// any bytes recognized as a TOML number except for "(+/-)nan" and "(+/-)inf".
// After the first digit or sign, hexadecimal digits and the characters in
// `digit_extras` are accepted. A `.` ends the number when the scanner is on
// the left side of an assignment.
// An error is returned if the first character is neither a digit nor a sign.
[direct_array_access; inline]
fn (mut s Scanner) extract_number() !string {
	// extract_number is called when the scanner has already reached
	// a byte that is a number or +/- - so we rewind it to start at the correct
	// position to get the complete number. Even if it's only one digit
	s.pos--
	s.col--
	start := s.pos

	first := s.at()
	if !u8(first).is_digit() && first !in [`+`, `-`] {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' ${u8(first).ascii_str()} is not a number at ${s.excerpt(s.pos, 10)}')
	}
	s.pos++
	s.col++
	for s.pos < s.text.len {
		// Handle signed exponent notation. I.e.: 3e2, 3E2, 3e-2, 3E+2, 3e0, 3.1e2, 3.1E2, -1E-1
		if s.at() in [`e`, `E`] && s.peek(1) in [`+`, `-`] && u8(s.peek(2)).is_digit() {
			s.pos += 2
			s.col += 2
		}
		ch := s.at()
		is_number_char := u8(ch).is_hex_digit() || ch in scanner.digit_extras
		if !is_number_char || (ch == `.` && s.is_left_of_assign) {
			break
		}
		s.pos++
		s.col++
	}
	num := s.text[start..s.pos]
	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified number "${num}" in range [${start} .. ${s.pos}]')
	return num
}
// extract_nan_or_inf_number collects and returns a string containing
// any bytes recognized as infinity or not-a-number TOML numbers.
// After the first character (`+`, `-`, `n` or `i`) every following
// `n`, `a`, `i` or `f` is collected.
// An error is returned if the first character is not one of `+`, `-`, `n` or `i`.
[direct_array_access; inline]
fn (mut s Scanner) extract_nan_or_inf_number() !string {
	// extract_nan_or_inf_number is called when the scanner has already identified that
	// +/- or 'nan'/'inf' bytes is up but we rewind it to start at the correct position
	s.pos--
	s.col--
	start := s.pos

	first := s.at()
	if first !in [`+`, `-`, `n`, `i`] {
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' ${u8(first).ascii_str()} is not a number at ${s.excerpt(s.pos, 10)}')
	}
	s.pos++
	s.col++
	for s.pos < s.text.len {
		if s.at() !in [`n`, `a`, `i`, `f`] {
			break
		}
		s.pos++
		s.col++
	}
	num := s.text[start..s.pos]
	util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'identified special number "${num}" in range [${start} .. ${s.pos}]')
	return num
}
2021-09-24 21:13:52 +03:00
// excerpt returns a string excerpt of the input text centered
// at `pos`. The `margin` argument defines how many characters
// on each side of `pos` is returned.
// Newlines in the excerpt are replaced with a literal `\n`.
// The range is clamped to the text, so a `pos` outside of the text or a
// negative `margin` results in a shorter (possibly empty) excerpt instead
// of an out of range slice.
pub fn (s Scanner) excerpt(pos int, margin int) string {
	mut end := if pos + margin < s.text.len { pos + margin } else { s.text.len }
	if end < 0 {
		end = 0
	}
	mut start := if pos > 0 && pos >= margin { pos - margin } else { 0 }
	if start > end {
		start = end
	}
	return s.text[start..end].replace('\n', r'\n')
}
2021-09-29 19:28:09 +03:00
// state returns a read-only copy of the scanner's current
// column, line number and position.
pub fn (s Scanner) state() State {
	snapshot := State{
		pos: s.pos
		line_nr: s.line_nr
		col: s.col
	}
	return snapshot
}
// validate_and_skip_headers returns an error if the text at the current
// position starts with a UTF-16 or UTF-32 byte order mark (BOM), and skips
// an optional UTF-8 BOM.
fn (mut s Scanner) validate_and_skip_headers() ! {
	// UTF-16 / UTF-32 headers (BE/LE)
	s.check_utf16_or_32_bom()!

	// NICE-TO-HAVE-TODO Check other types of (UTF-?) headers and yield an error. TOML is UTF-8 only.

	// Skip optional UTF-8 header, if any.
	has_utf8_bom := s.at() == 0xEF && s.peek(1) == 0xBB && s.peek(2) == 0xBF
	if has_utf8_bom {
		util.printdbg(@MOD + '.' + @STRUCT + '.' + @FN, 'skipping UTF-8 byte order mark (BOM)')
		s.header_len = 3
		s.skip_n(s.header_len)
	}

	// Check after we've skipped UTF-8 BOM
	s.check_utf16_or_32_bom()!
}
// check_utf16_or_32_bom returns an error if the bytes at the current position
// form a UTF-32 or UTF-16 byte order mark (BOM).
// Before the error is returned the header length is recorded and the
// header bytes are skipped.
fn (mut s Scanner) check_utf16_or_32_bom() ! {
	b0 := s.at()
	b1 := s.peek(1)

	// The UTF-32 check has to come first: `FF FE 00 00` also starts with `FF FE`.
	is_utf32_le := b0 == 0xFF && b1 == 0xFE && s.peek(2) == 0x00 && s.peek(3) == 0x00
	is_utf32_be := b0 == 0x00 && b1 == 0x00 && s.peek(2) == 0xFE && s.peek(3) == 0xFF
	if is_utf32_le || is_utf32_be {
		s.header_len = 4
		s.skip_n(s.header_len)
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' UTF-32 is not a valid TOML encoding at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
	}

	is_utf16_be := b0 == 0xFE && b1 == 0xFF
	is_utf16_le := b0 == 0xFF && b1 == 0xFE
	if is_utf16_be || is_utf16_le {
		s.header_len = 2
		s.skip_n(s.header_len)
		return error(@MOD + '.' + @STRUCT + '.' + @FN +
			' UTF-16 is not a valid TOML encoding at ${s.pos} (${s.line_nr},${s.col}) near ...${s.excerpt(s.pos, 5)}...')
	}
}