From 78f8b9eb288da5731dc701410f0c9c9c1d18281b Mon Sep 17 00:00:00 2001 From: Carl Xiong Date: Thu, 22 Sep 2022 18:44:55 +0800 Subject: [PATCH] v.scanner: fix an unicode string escape handling bug (#15834) --- vlib/v/scanner/scanner.v | 69 +++++++++++++++++++++++++++-------- vlib/v/scanner/scanner_test.v | 19 ++++++++++ 2 files changed, 73 insertions(+), 15 deletions(-) diff --git a/vlib/v/scanner/scanner.v b/vlib/v/scanner/scanner.v index 736ca0c909..6ae5d8129f 100644 --- a/vlib/v/scanner/scanner.v +++ b/vlib/v/scanner/scanner.v @@ -1236,12 +1236,38 @@ fn (mut s Scanner) ident_string() string { } if start <= s.pos { mut string_so_far := s.text[start..end] - if !s.is_fmt && u_escapes_pos.len > 0 { - string_so_far = s.decode_u_escapes(string_so_far, start, u_escapes_pos) - } - if !s.is_fmt && h_escapes_pos.len > 0 { - string_so_far = decode_h_escapes(string_so_far, start, h_escapes_pos) + if !s.is_fmt { + mut segment_idx := 0 + mut str_segments := []string{} + if u_escapes_pos.len + h_escapes_pos.len > 0 { + mut all_pos := []int{} + all_pos << u_escapes_pos + all_pos << h_escapes_pos + if u_escapes_pos.len != 0 && h_escapes_pos.len != 0 { + all_pos.sort() + } + for pos in all_pos { + str_segments << string_so_far[segment_idx..(pos - start)] + segment_idx = pos - start + + if pos in u_escapes_pos { + end_idx, segment := s.decode_u_escape_single(string_so_far, segment_idx) + str_segments << segment + segment_idx = end_idx + } + if pos in h_escapes_pos { + end_idx, segment := decode_h_escape_single(string_so_far, segment_idx) + str_segments << segment + segment_idx = end_idx + } + } + } + if segment_idx < string_so_far.len { + str_segments << string_so_far[segment_idx..] + } + string_so_far = str_segments.join('') } + if n_cr_chars > 0 { string_so_far = string_so_far.replace('\r', '') } @@ -1254,6 +1280,13 @@ fn (mut s Scanner) ident_string() string { return lit } +fn decode_h_escape_single(str string, idx int) (int, string) { + end_idx := idx + 4 // "\xXX".len == 4 + + // notice this function doesn't do any decoding... it just replaces '\xc0' with the byte 0xc0 + return end_idx, [u8(strconv.parse_uint(str[idx + 2..end_idx], 16, 8) or { 0 })].bytestr() +} + // only handle single-byte inline escapes like '\xc0' fn decode_h_escapes(s string, start int, escapes_pos []int) string { if escapes_pos.len == 0 { @@ -1263,9 +1296,9 @@ fn decode_h_escapes(s string, start int, escapes_pos []int) string { ss << s[..escapes_pos.first() - start] for i, pos in escapes_pos { idx := pos - start - end_idx := idx + 4 // "\xXX".len == 4 - // notice this function doesn't do any decoding... it just replaces '\xc0' with the byte 0xc0 - ss << [u8(strconv.parse_uint(s[idx + 2..end_idx], 16, 8) or { 0 })].bytestr() + end_idx, segment := decode_h_escape_single(s, idx) + ss << segment + if i + 1 < escapes_pos.len { ss << s[end_idx..escapes_pos[i + 1] - start] } else { @@ -1296,6 +1329,17 @@ fn decode_o_escapes(s string, start int, escapes_pos []int) string { return ss.join('') } +fn (mut s Scanner) decode_u_escape_single(str string, idx int) (int, string) { + end_idx := idx + 6 // "\uXXXX".len == 6 + escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 } + // Check if Escaped Code Point is invalid or not + if rune(escaped_code_point).length_in_bytes() == -1 { + s.error('invalid unicode point `$str`') + } + + return end_idx, utf32_to_str(u32(escaped_code_point)) +} + // decode the flagged unicode escape sequences into their utf-8 bytes fn (mut s Scanner) decode_u_escapes(str string, start int, escapes_pos []int) string { if escapes_pos.len == 0 { @@ -1305,13 +1349,8 @@ fn (mut s Scanner) decode_u_escapes(str string, start int, escapes_pos []int) st ss << str[..escapes_pos.first() - start] for i, pos in escapes_pos { idx := pos - start - end_idx := idx + 6 // "\uXXXX".len == 6 - escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 } - // Check if Escaped Code Point is invalid or not - if rune(escaped_code_point).length_in_bytes() == -1 { - s.error('invalid unicode point `$str`') - } - ss << utf32_to_str(u32(escaped_code_point)) + end_idx, segment := s.decode_u_escape_single(str, idx) + ss << segment if i + 1 < escapes_pos.len { ss << str[end_idx..escapes_pos[i + 1] - start] } else { diff --git a/vlib/v/scanner/scanner_test.v b/vlib/v/scanner/scanner_test.v index 62a7f5cdb0..c3524bb57d 100644 --- a/vlib/v/scanner/scanner_test.v +++ b/vlib/v/scanner/scanner_test.v @@ -234,6 +234,9 @@ fn test_escape_string() { result = scan_tokens(r"'\u2605'") assert result[0].kind == .string assert result[0].lit == r'★' + result = scan_tokens(r"'H\u2605H'") + assert result[0].kind == .string + assert result[0].lit == r'H★H' // STRING ESCAPED ASCII result = scan_tokens(r"'\x61'") @@ -246,6 +249,22 @@ fn test_escape_string() { assert result[0].kind == .string assert result[0].lit.bytes() == [u8(0xe2), `9`, `8`, `8`, `5`] + // MIX STRING ESCAPES + result = scan_tokens(r"'\x61\u2605'") + assert result[0].kind == .string + assert result[0].lit == r'a★' + result = scan_tokens(r"'\u2605\x61'") + assert result[0].kind == .string + assert result[0].lit == r'★a' + + // MIX STRING ESCAPES with offset + result = scan_tokens(r"'x \x61\u2605\x61'") + assert result[0].kind == .string + assert result[0].lit == r'x a★a' + result = scan_tokens(r"'x \u2605\x61\u2605'") + assert result[0].kind == .string + assert result[0].lit == r'x ★a★' + // SHOULD RESULT IN ERRORS // result = scan_tokens(r'`\x61\x61`') // should always result in an error // result = scan_tokens(r"'\x'") // should always result in an error