From ab642cac43c0d20bc2ce796bae0522b194bbb3fc Mon Sep 17 00:00:00 2001 From: jeffmikels Date: Tue, 11 Jan 2022 15:36:18 -0500 Subject: [PATCH] scanner, cgen: improve support for escape codes in backticks/runes (#13127) --- vlib/v/checker/check_types.v | 12 +- vlib/v/checker/tests/hex_literal_overflow.out | 18 --- .../tests/import_mod_sub_as_sub_err.out | 10 +- .../checker/tests/minus_op_wrong_type_err.out | 2 +- .../mut_array_get_element_address_err.out | 14 +-- .../v/checker/tests/string_escape_x_err_a.out | 2 +- .../v/checker/tests/string_escape_x_err_b.out | 2 +- vlib/v/checker/tests/unknown_function.out | 2 +- vlib/v/gen/c/cgen.v | 15 ++- vlib/v/gen/c/str.v | 4 +- vlib/v/scanner/scanner.v | 119 +++++++++++++++++- vlib/v/scanner/scanner_test.v | 97 ++++++++++++++ 12 files changed, 249 insertions(+), 48 deletions(-) diff --git a/vlib/v/checker/check_types.v b/vlib/v/checker/check_types.v index fb97c47d3d..55587d5df2 100644 --- a/vlib/v/checker/check_types.v +++ b/vlib/v/checker/check_types.v @@ -617,8 +617,10 @@ pub fn (mut c Checker) string_inter_lit(mut node ast.StringInterLiteral) ast.Typ return ast.string_type } -const hex_lit_overflow_message = 'hex character literal overflows string' +const unicode_lit_overflow_message = 'unicode character exceeds max allowed value of 0x10ffff, consider using a unicode literal (\\u####)' +// unicode character literals are limited to a maximum value of 0x10ffff +// https://stackoverflow.com/questions/52203351/why-unicode-is-restricted-to-0x10ffff pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type { mut idx := 0 for idx < node.val.len { @@ -631,7 +633,7 @@ pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type { start_idx := idx idx++ next_ch := node.val[idx] or { return ast.string_type } - if next_ch == `x` { + if next_ch == `u` { idx++ mut ch := node.val[idx] or { return ast.string_type } mut hex_char_count := 0 @@ -647,13 +649,13 @@ pub fn (mut c Checker) string_lit(mut node ast.StringLiteral) ast.Type { first_digit := node.val[idx - 5] - 48 second_digit := node.val[idx - 4] - 48 if first_digit > 1 { - c.error(checker.hex_lit_overflow_message, end_pos) + c.error(checker.unicode_lit_overflow_message, end_pos) } else if first_digit == 1 && second_digit > 0 { - c.error(checker.hex_lit_overflow_message, end_pos) + c.error(checker.unicode_lit_overflow_message, end_pos) } } else { - c.error(checker.hex_lit_overflow_message, end_pos) + c.error(checker.unicode_lit_overflow_message, end_pos) } } idx++ diff --git a/vlib/v/checker/tests/hex_literal_overflow.out b/vlib/v/checker/tests/hex_literal_overflow.out index 9475150399..e69de29bb2 100644 --- a/vlib/v/checker/tests/hex_literal_overflow.out +++ b/vlib/v/checker/tests/hex_literal_overflow.out @@ -1,18 +0,0 @@ -vlib/v/checker/tests/hex_literal_overflow.vv:1:7: error: hex character literal overflows string - 1 | a := '\x11ffff' - | ~~~~~~~~ - 2 | b := '\x20ffff' - 3 | c := '\x10fffff' -vlib/v/checker/tests/hex_literal_overflow.vv:2:7: error: hex character literal overflows string - 1 | a := '\x11ffff' - 2 | b := '\x20ffff' - | ~~~~~~~~ - 3 | c := '\x10fffff' - 4 | println(a) -vlib/v/checker/tests/hex_literal_overflow.vv:3:7: error: hex character literal overflows string - 1 | a := '\x11ffff' - 2 | b := '\x20ffff' - 3 | c := '\x10fffff' - | ~~~~~~~~~ - 4 | println(a) - 5 | println(b) diff --git a/vlib/v/checker/tests/import_mod_sub_as_sub_err.out b/vlib/v/checker/tests/import_mod_sub_as_sub_err.out index 7cf934d4a9..7cb2cb7eef 100644 --- a/vlib/v/checker/tests/import_mod_sub_as_sub_err.out +++ b/vlib/v/checker/tests/import_mod_sub_as_sub_err.out @@ -1,5 +1,5 @@ -vlib/v/checker/tests/import_mod_sub_as_sub_err.vv:1:25: error: import alias `encoding.utf8 as utf8` is redundant - 1 | import encoding.utf8 as utf8 - | ~~~~ - 2 | - 3 | fn main() { +vlib/v/checker/tests/import_mod_sub_as_sub_err.vv:1:25: error: import alias `encoding.utf8 as utf8` is redundant + 1 | import encoding.utf8 as utf8 + | ~~~~ + 2 | + 3 | fn main() { diff --git a/vlib/v/checker/tests/minus_op_wrong_type_err.out b/vlib/v/checker/tests/minus_op_wrong_type_err.out index b6895aa7d1..cad020716d 100644 --- a/vlib/v/checker/tests/minus_op_wrong_type_err.out +++ b/vlib/v/checker/tests/minus_op_wrong_type_err.out @@ -1,5 +1,5 @@ vlib/v/checker/tests/minus_op_wrong_type_err.vv:10:10: error: mismatched types `Aaa` and `int literal` - 8 | + 8 | 9 | fn main() { 10 | println(Aaa{} - 10) | ~~~~~~~~~~ diff --git a/vlib/v/checker/tests/mut_array_get_element_address_err.out b/vlib/v/checker/tests/mut_array_get_element_address_err.out index a3a5f4821b..a98d17b88a 100644 --- a/vlib/v/checker/tests/mut_array_get_element_address_err.out +++ b/vlib/v/checker/tests/mut_array_get_element_address_err.out @@ -1,7 +1,7 @@ -vlib/v/checker/tests/mut_array_get_element_address_err.vv:3:20: error: cannot take the address of mutable array elements outside unsafe blocks - 1 | fn main() { - 2 | mut arr_int := [int(23), 45, 7, 8] - 3 | ele := &arr_int[1] - | ~~~ - 4 | println(ele) - 5 | } +vlib/v/checker/tests/mut_array_get_element_address_err.vv:3:20: error: cannot take the address of mutable array elements outside unsafe blocks + 1 | fn main() { + 2 | mut arr_int := [int(23), 45, 7, 8] + 3 | ele := &arr_int[1] + | ~~~ + 4 | println(ele) + 5 | } diff --git a/vlib/v/checker/tests/string_escape_x_err_a.out b/vlib/v/checker/tests/string_escape_x_err_a.out index f441565e7e..3329b0588c 100644 --- a/vlib/v/checker/tests/string_escape_x_err_a.out +++ b/vlib/v/checker/tests/string_escape_x_err_a.out @@ -1,4 +1,4 @@ -vlib/v/checker/tests/string_escape_x_err_a.vv:2:15: error: `\x` used with no following hex digits +vlib/v/checker/tests/string_escape_x_err_a.vv:2:15: error: `\x` used without two following hex digits 1 | fn main() { 2 | println('\x') | ^ diff --git a/vlib/v/checker/tests/string_escape_x_err_b.out b/vlib/v/checker/tests/string_escape_x_err_b.out index da90fd10d7..47bc6b7767 100644 --- a/vlib/v/checker/tests/string_escape_x_err_b.out +++ b/vlib/v/checker/tests/string_escape_x_err_b.out @@ -1,4 +1,4 @@ -vlib/v/checker/tests/string_escape_x_err_b.vv:2:15: error: `\x` used with no following hex digits +vlib/v/checker/tests/string_escape_x_err_b.vv:2:15: error: `\x` used without two following hex digits 1 | fn main() { 2 | println('\xhh') | ^ diff --git a/vlib/v/checker/tests/unknown_function.out b/vlib/v/checker/tests/unknown_function.out index 955d354fd1..b3f1be6747 100644 --- a/vlib/v/checker/tests/unknown_function.out +++ b/vlib/v/checker/tests/unknown_function.out @@ -1,5 +1,5 @@ vlib/v/checker/tests/unknown_function.vv:4:15: error: unknown function: math.max_i64 - 2 | + 2 | 3 | fn main() { 4 | println(math.max_i64()) | ~~~~~~~~~ diff --git a/vlib/v/gen/c/cgen.v b/vlib/v/gen/c/cgen.v index c410d79f9c..501c6f71e7 100644 --- a/vlib/v/gen/c/cgen.v +++ b/vlib/v/gen/c/cgen.v @@ -2510,6 +2510,19 @@ fn (mut g Gen) expr_with_cast(expr ast.Expr, got_type_raw ast.Type, expected_typ g.expr(expr) } +fn cescape_nonascii(original string) string { + mut b := strings.new_builder(original.len) + for c in original { + if c < 32 || c > 126 { + b.write_string('\\${c:03o}') + continue + } + b.write_b(c) + } + res := b.str() + return res +} + // cestring returns a V string, properly escaped for embeddeding in a C string literal. fn cestring(s string) string { return s.replace('\\', '\\\\').replace('"', "'") @@ -2517,7 +2530,7 @@ fn cestring(s string) string { // ctoslit returns a '_SLIT("$s")' call, where s is properly escaped. fn ctoslit(s string) string { - return '_SLIT("' + cestring(s) + '")' + return '_SLIT("' + cescape_nonascii(cestring(s)) + '")' } fn (mut g Gen) gen_attrs(attrs []ast.Attr) { diff --git a/vlib/v/gen/c/str.v b/vlib/v/gen/c/str.v index cdbb52436c..604e14d1d4 100644 --- a/vlib/v/gen/c/str.v +++ b/vlib/v/gen/c/str.v @@ -6,7 +6,7 @@ import v.ast import v.util fn (mut g Gen) string_literal(node ast.StringLiteral) { - escaped_val := util.smart_quote(node.val, node.is_raw) + escaped_val := cescape_nonascii(util.smart_quote(node.val, node.is_raw)) if node.language == .c { g.write('"$escaped_val"') } else { @@ -25,7 +25,7 @@ fn (mut g Gen) string_inter_literal_sb_optimized(call_expr ast.CallExpr) { is_nl := call_expr.name == 'writeln' // println('optimize sb $call_expr.name') for i, val in node.vals { - escaped_val := util.smart_quote(val, false) + escaped_val := cescape_nonascii(util.smart_quote(val, false)) // if val == '' { // break // continue diff --git a/vlib/v/scanner/scanner.v b/vlib/v/scanner/scanner.v index d6d9c527ee..76c5a73673 100644 --- a/vlib/v/scanner/scanner.v +++ b/vlib/v/scanner/scanner.v @@ -1174,6 +1174,7 @@ fn (mut s Scanner) ident_string() string { } s.is_inside_string = false mut u_escapes_pos := []int{} // pos list of \uXXXX + mut h_escapes_pos := []int{} // pos list of \xXX mut backslash_count := if start_char == scanner.backslash { 1 } else { 0 } for { s.pos++ @@ -1221,8 +1222,12 @@ fn (mut s Scanner) ident_string() string { // Escape `\x` `\u` if backslash_count % 2 == 1 && !is_raw && !is_cstr { // Escape `\x` - if c == `x` && (s.text[s.pos + 1] == s.quote || !s.text[s.pos + 1].is_hex_digit()) { - s.error(r'`\x` used with no following hex digits') + if c == `x` { + if s.text[s.pos + 1] == s.quote || !(s.text[s.pos + 1].is_hex_digit() + && s.text[s.pos + 2].is_hex_digit()) { + s.error(r'`\x` used without two following hex digits') + } + h_escapes_pos << s.pos - 1 } // Escape `\u` if c == `u` { @@ -1266,6 +1271,9 @@ fn (mut s Scanner) ident_string() string { if !s.is_fmt && u_escapes_pos.len > 0 { string_so_far = decode_u_escapes(string_so_far, start, u_escapes_pos) } + if !s.is_fmt && h_escapes_pos.len > 0 { + string_so_far = decode_h_escapes(string_so_far, start, h_escapes_pos) + } if n_cr_chars > 0 { string_so_far = string_so_far.replace('\r', '') } @@ -1278,6 +1286,27 @@ fn (mut s Scanner) ident_string() string { return lit } +// only handle single-byte inline escapes like '\xc0' +fn decode_h_escapes(s string, start int, escapes_pos []int) string { + if escapes_pos.len == 0 { + return s + } + mut ss := []string{cap: escapes_pos.len * 2 + 1} + ss << s[..escapes_pos.first() - start] + for i, pos in escapes_pos { + idx := pos - start + end_idx := idx + 4 // "\xXX".len == 4 + // notice this function doesn't do any decoding... it just replaces '\xc0' with the byte 0xc0 + ss << [byte(strconv.parse_uint(s[idx + 2..end_idx], 16, 8) or { 0 })].bytestr() + if i + 1 < escapes_pos.len { + ss << s[end_idx..escapes_pos[i + 1] - start] + } else { + ss << s[end_idx..] + } + } + return ss.join('') +} + fn decode_u_escapes(s string, start int, escapes_pos []int) string { if escapes_pos.len == 0 { return s @@ -1312,10 +1341,32 @@ fn trim_slash_line_break(s string) string { return ret_str } +/// ident_char is called when a backtick "single-char" is parsed from the code +/// it is needed because some runes (chars) are written with escape sequences +/// the string it returns should be a standardized, simplified version of the character +/// as it would appear in source code +/// possibilities: +/// single chars like `a`, `b` => 'a', 'b' +/// escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n' +/// escaped hex bytes like `\x01`, `\x61` => '\x01', 'a' +/// escaped multibyte runes like `\xe29885` => (★) +/// escaped unicode literals like `\u2605` fn (mut s Scanner) ident_char() string { - start := s.pos + lspos := token.Position{ + line_nr: s.line_nr + pos: s.pos + col: s.pos - s.last_nl_pos - 1 + } + + start := s.pos // the string position of the first backtick char slash := `\\` mut len := 0 + + // set flags for advanced escapes first + escaped_hex := s.expect('\\x', start + 1) + escaped_unicode := s.expect('\\u', start + 1) + + // walk the string to get characters up to the next backtick for { s.pos++ if s.pos >= s.text.len { @@ -1334,12 +1385,68 @@ fn (mut s Scanner) ident_char() string { } } len-- - c := s.text[start + 1..s.pos] + mut c := s.text[start + 1..s.pos] if len != 1 { + // if the content expresses an escape code, it will have an even number of characters + // e.g. \x61 or \u2605 + if (c.len % 2 == 0) && (escaped_hex || escaped_unicode) { + if escaped_unicode { + c = decode_u_escapes(c, 0, [0]) + } else { + // we have to handle hex ourselves + ascii_0 := byte(0x30) + ascii_a := byte(0x61) + mut accumulated := []byte{} + val := c[2..c.len].to_lower() // 0A -> 0a + mut offset := 0 + // take two characters at a time, parse as hex and add to bytes + for { + if offset >= val.len - 1 { + break + } + mut byteval := byte(0) + big := val[offset] + little := val[offset + 1] + if !big.is_hex_digit() { + accumulated.clear() + break + } + if !little.is_hex_digit() { + accumulated.clear() + break + } + + if big.is_digit() { + byteval |= (big - ascii_0) << 4 + } else { + byteval |= (big - ascii_a + 10) << 4 + } + if little.is_digit() { + byteval |= (little - ascii_0) + } else { + byteval |= (little - ascii_a + 10) + } + + accumulated << byteval + offset += 2 + } + if accumulated.len > 0 { + c = accumulated.bytestr() + } + } + } + + // the string inside the backticks is longer than one character + // but we might only have one rune, say in the case u := c.runes() if u.len != 1 { - s.error('invalid character literal (more than one character)\n' + - 'use quotes for strings, backticks for characters') + if escaped_hex || escaped_unicode { + s.error('invalid character literal (escape sequence did not refer to a singular rune)') + } else { + s.add_error_detail_with_pos('use quotes for strings, backticks for characters', + lspos) + s.error('invalid character literal (more than one character)') + } } } // Escapes a `'` character diff --git a/vlib/v/scanner/scanner_test.v b/vlib/v/scanner/scanner_test.v index 07fdb43e30..8cdae1a43b 100644 --- a/vlib/v/scanner/scanner_test.v +++ b/vlib/v/scanner/scanner_test.v @@ -19,6 +19,19 @@ fn scan_kinds(text string) []token.Kind { return token_kinds } +fn scan_tokens(text string) []token.Token { + mut scanner := new_scanner(text, .parse_comments, &pref.Preferences{}) + mut tokens := []token.Token{} + for { + tok := scanner.scan() + if tok.kind == .eof { + break + } + tokens << tok + } + return tokens +} + fn test_scan() { token_kinds := scan_kinds('println(2 + 3)') assert token_kinds.len == 6 @@ -138,6 +151,90 @@ fn test_ref_ref_array_ref_ref_foo() { } fn test_escape_string() { + // these assertions aren't helpful... + // they test the vlib built-in to the compiler, + // but we want to test this module before compilation assert '\x61' == 'a' assert '\x62' == 'b' + // assert `\x61` == `a` // will work after pull request goes through + + // SINGLE CHAR ESCAPES + // SINGLE CHAR APOSTROPHE + mut result := scan_tokens(r"`'`") + assert result[0].kind == .chartoken + assert result[0].lit == r"\'" + + // SINGLE CHAR BACKTICK + result = scan_tokens(r'`\``') + assert result[0].kind == .chartoken + assert result[0].lit == r'\`' + + // SINGLE CHAR SLASH + result = scan_tokens(r'`\\`') + assert result[0].kind == .chartoken + assert result[0].lit == r'\\' + + // SINGLE CHAR UNICODE ESCAPE + result = scan_tokens(r'`\u2605`') + assert result[0].kind == .chartoken + assert result[0].lit == r'★' + + // SINGLE CHAR ESCAPED ASCII + result = scan_tokens(r'`\x61`') + assert result[0].kind == .chartoken + assert result[0].lit == r'a' + + // SINGLE CHAR INCORRECT ESCAPE + // result = scan_tokens(r'`\x61\x61`') // should always result in an error + + // SINGLE CHAR MULTI-BYTE UTF-8 + // Compilation blocked by vlib/v/checker/check_types.v, but works in the repl + result = scan_tokens(r'`\xe29885`') + assert result[0].lit == r'★' + + // STRING ESCAPES ================= + // STRING APOSTROPHE + result = scan_tokens(r"'\''") + assert result[0].kind == .string + assert result[0].lit == r"\'" + + // STRING BACKTICK + result = scan_tokens(r"'\`'") + assert result[0].kind == .string + assert result[0].lit == r'\`' + + // STRING SLASH + result = scan_tokens(r"'\\'") + assert result[0].kind == .string + assert result[0].lit == r'\\' + + // STRING UNICODE ESCAPE + result = scan_tokens(r"'\u2605'") + assert result[0].kind == .string + assert result[0].lit == r'★' + + // STRING ESCAPED ASCII + result = scan_tokens(r"'\x61'") + assert result[0].kind == .string + assert result[0].lit == r'a' + + // STRING ESCAPED EXTENDED ASCII + // (should not be converted to unicode) + result = scan_tokens(r"'\xe29885'") + assert result[0].kind == .string + assert result[0].lit.bytes() == [byte(0xe2), `9`, `8`, `8`, `5`] + + // SHOULD RESULT IN ERRORS + // result = scan_tokens(r'`\x61\x61`') // should always result in an error + // result = scan_tokens(r"'\x'") // should always result in an error + // result = scan_tokens(r'`hello`') // should always result in an error +} + +fn test_comment_string() { + mut result := scan_tokens('// single line comment will get an \\x01 prepended') + assert result[0].kind == .comment + assert result[0].lit[0] == byte(1) // \x01 + // result = scan_tokens('/// doc comment will keep third / at beginning') + // result = scan_tokens('/* block comment will be stripped of whitespace */') + // result = scan_tokens('a := 0 // line end comment also gets \\x01 prepended') }