From f727433929b1e3280507e2c1949c90180fc7f8ba Mon Sep 17 00:00:00 2001
From: Swastik Baranwal
Date: Mon, 22 Aug 2022 13:26:51 +0530
Subject: [PATCH] scanner: add check for invalid unicode (#15485)

---
 vlib/v/scanner/scanner.v                     | 21 ++++++++++++--------
 vlib/v/scanner/tests/invalid_unicode_err.out |  4 ++++
 vlib/v/scanner/tests/invalid_unicode_err.vv  |  2 ++
 3 files changed, 19 insertions(+), 8 deletions(-)
 create mode 100644 vlib/v/scanner/tests/invalid_unicode_err.out
 create mode 100644 vlib/v/scanner/tests/invalid_unicode_err.vv

diff --git a/vlib/v/scanner/scanner.v b/vlib/v/scanner/scanner.v
index 854402286d..8e89b26a38 100644
--- a/vlib/v/scanner/scanner.v
+++ b/vlib/v/scanner/scanner.v
@@ -1237,7 +1237,7 @@ fn (mut s Scanner) ident_string() string {
 	if start <= s.pos {
 		mut string_so_far := s.text[start..end]
 		if !s.is_fmt && u_escapes_pos.len > 0 {
-			string_so_far = decode_u_escapes(string_so_far, start, u_escapes_pos)
+			string_so_far = s.decode_u_escapes(string_so_far, start, u_escapes_pos)
 		}
 		if !s.is_fmt && h_escapes_pos.len > 0 {
 			string_so_far = decode_h_escapes(string_so_far, start, h_escapes_pos)
@@ -1297,20 +1297,25 @@ fn decode_o_escapes(s string, start int, escapes_pos []int) string {
 }
 
 // decode the flagged unicode escape sequences into their utf-8 bytes
-fn decode_u_escapes(s string, start int, escapes_pos []int) string {
+fn (mut s Scanner) decode_u_escapes(str string, start int, escapes_pos []int) string {
 	if escapes_pos.len == 0 {
-		return s
+		return str
 	}
 	mut ss := []string{cap: escapes_pos.len * 2 + 1}
-	ss << s[..escapes_pos.first() - start]
+	ss << str[..escapes_pos.first() - start]
 	for i, pos in escapes_pos {
 		idx := pos - start
 		end_idx := idx + 6 // "\uXXXX".len == 6
-		ss << utf32_to_str(u32(strconv.parse_uint(s[idx + 2..end_idx], 16, 32) or { 0 }))
+		escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 }
+		// Check if Escaped Code Point is invalid or not
+		if rune(escaped_code_point).length_in_bytes() == -1 {
+			s.error('invalid unicode point `$str`')
+		}
+		ss << utf32_to_str(u32(escaped_code_point))
 		if i + 1 < escapes_pos.len {
-			ss << s[end_idx..escapes_pos[i + 1] - start]
+			ss << str[end_idx..escapes_pos[i + 1] - start]
 		} else {
-			ss << s[end_idx..]
+			ss << str[end_idx..]
 		}
 	}
 	return ss.join('')
@@ -1391,7 +1396,7 @@ fn (mut s Scanner) ident_char() string {
 	if (c.len % 2 == 0) && (escaped_hex || escaped_unicode || escaped_octal) {
 		if escaped_unicode {
 			// there can only be one, so attempt to decode it now
-			c = decode_u_escapes(c, 0, [0])
+			c = s.decode_u_escapes(c, 0, [0])
 		} else {
 			// find escape sequence start positions
 			mut escapes_pos := []int{}
diff --git a/vlib/v/scanner/tests/invalid_unicode_err.out b/vlib/v/scanner/tests/invalid_unicode_err.out
new file mode 100644
index 0000000000..95ab20e1b8
--- /dev/null
+++ b/vlib/v/scanner/tests/invalid_unicode_err.out
@@ -0,0 +1,4 @@
+vlib/v/scanner/tests/invalid_unicode_err.vv:1:13: error: invalid unicode point `\uD8FF`
+    1 | a := '\uD8FF'
+      |             ^
+    2 | println(a)
diff --git a/vlib/v/scanner/tests/invalid_unicode_err.vv b/vlib/v/scanner/tests/invalid_unicode_err.vv
new file mode 100644
index 0000000000..9f54314eae
--- /dev/null
+++ b/vlib/v/scanner/tests/invalid_unicode_err.vv
@@ -0,0 +1,2 @@
+a := '\uD8FF'
+println(a)