v.scanner: fix an unicode string escape handling bug (#15834)

Authored by Carl Xiong on 2022-09-22 18:44:55 +08:00, committed by GitHub
parent 27305d1a5f
commit 78f8b9eb28
2 changed files with 73 additions and 15 deletions
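
Before this change, `\uXXXX` escapes and `\xXX` escapes were decoded in two independent whole-string passes, so when both kinds appeared in one literal the positions recorded for the second pass no longer matched the string that the first pass had already rewritten. The diff below merges all recorded escape positions, sorts them, and rebuilds the literal segment by segment in a single left-to-right walk. A standalone V sketch of that idea, for illustration only (simplified, assumes well-formed escapes; `decode_escapes` is not a name from the commit):

import strconv

// Decode `\uXXXX` and `\xXX` escapes in one pass, left to right,
// so earlier replacements cannot shift the offsets of later ones.
fn decode_escapes(s string) string {
	mut segments := []string{}
	mut i := 0
	for i < s.len {
		if s[i] == u8(`\\`) && i + 5 < s.len && s[i + 1] == u8(`u`) {
			// "\uXXXX".len == 6; convert the code point to its UTF-8 bytes
			code := strconv.parse_uint(s[i + 2..i + 6], 16, 32) or { 0 }
			segments << utf32_to_str(u32(code))
			i += 6
		} else if s[i] == u8(`\\`) && i + 3 < s.len && s[i + 1] == u8(`x`) {
			// "\xXX".len == 4; keep the raw byte, no further decoding
			segments << [u8(strconv.parse_uint(s[i + 2..i + 4], 16, 8) or { 0 })].bytestr()
			i += 4
		} else {
			segments << s[i..i + 1]
			i++
		}
	}
	return segments.join('')
}

fn main() {
	println(decode_escapes(r'x \x61\u2605\x61')) // x a★a
}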

@@ -1236,12 +1236,38 @@ fn (mut s Scanner) ident_string() string {
}
if start <= s.pos {
mut string_so_far := s.text[start..end]
if !s.is_fmt && u_escapes_pos.len > 0 {
string_so_far = s.decode_u_escapes(string_so_far, start, u_escapes_pos)
if !s.is_fmt {
mut segment_idx := 0
mut str_segments := []string{}
if u_escapes_pos.len + h_escapes_pos.len > 0 {
mut all_pos := []int{}
all_pos << u_escapes_pos
all_pos << h_escapes_pos
if u_escapes_pos.len != 0 && h_escapes_pos.len != 0 {
all_pos.sort()
}
if !s.is_fmt && h_escapes_pos.len > 0 {
string_so_far = decode_h_escapes(string_so_far, start, h_escapes_pos)
for pos in all_pos {
str_segments << string_so_far[segment_idx..(pos - start)]
segment_idx = pos - start
if pos in u_escapes_pos {
end_idx, segment := s.decode_u_escape_single(string_so_far, segment_idx)
str_segments << segment
segment_idx = end_idx
}
if pos in h_escapes_pos {
end_idx, segment := decode_h_escape_single(string_so_far, segment_idx)
str_segments << segment
segment_idx = end_idx
}
}
}
if segment_idx < string_so_far.len {
str_segments << string_so_far[segment_idx..]
}
string_so_far = str_segments.join('')
}
if n_cr_chars > 0 {
string_so_far = string_so_far.replace('\r', '')
}
@@ -1254,6 +1280,13 @@ fn (mut s Scanner) ident_string() string {
return lit
}
fn decode_h_escape_single(str string, idx int) (int, string) {
end_idx := idx + 4 // "\xXX".len == 4
// notice this function doesn't do any decoding... it just replaces '\xc0' with the byte 0xc0
return end_idx, [u8(strconv.parse_uint(str[idx + 2..end_idx], 16, 8) or { 0 })].bytestr()
}
// only handle single-byte inline escapes like '\xc0'
fn decode_h_escapes(s string, start int, escapes_pos []int) string {
if escapes_pos.len == 0 {
@@ -1263,9 +1296,9 @@ fn decode_h_escapes(s string, start int, escapes_pos []int) string {
ss << s[..escapes_pos.first() - start]
for i, pos in escapes_pos {
idx := pos - start
end_idx := idx + 4 // "\xXX".len == 4
// notice this function doesn't do any decoding... it just replaces '\xc0' with the byte 0xc0
ss << [u8(strconv.parse_uint(s[idx + 2..end_idx], 16, 8) or { 0 })].bytestr()
end_idx, segment := decode_h_escape_single(s, idx)
ss << segment
if i + 1 < escapes_pos.len {
ss << s[end_idx..escapes_pos[i + 1] - start]
} else {
@@ -1296,6 +1329,17 @@ fn decode_o_escapes(s string, start int, escapes_pos []int) string {
return ss.join('')
}
fn (mut s Scanner) decode_u_escape_single(str string, idx int) (int, string) {
end_idx := idx + 6 // "\uXXXX".len == 6
escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 }
// Check if Escaped Code Point is invalid or not
if rune(escaped_code_point).length_in_bytes() == -1 {
s.error('invalid unicode point `$str`')
}
return end_idx, utf32_to_str(u32(escaped_code_point))
}
// decode the flagged unicode escape sequences into their utf-8 bytes
fn (mut s Scanner) decode_u_escapes(str string, start int, escapes_pos []int) string {
if escapes_pos.len == 0 {
@@ -1305,13 +1349,8 @@ fn (mut s Scanner) decode_u_escapes(str string, start int, escapes_pos []int) string {
ss << str[..escapes_pos.first() - start]
for i, pos in escapes_pos {
idx := pos - start
end_idx := idx + 6 // "\uXXXX".len == 6
escaped_code_point := strconv.parse_uint(str[idx + 2..end_idx], 16, 32) or { 0 }
// Check if Escaped Code Point is invalid or not
if rune(escaped_code_point).length_in_bytes() == -1 {
s.error('invalid unicode point `$str`')
}
ss << utf32_to_str(u32(escaped_code_point))
end_idx, segment := s.decode_u_escape_single(str, idx)
ss << segment
if i + 1 < escapes_pos.len {
ss << str[end_idx..escapes_pos[i + 1] - start]
} else {
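
The per-escape helpers above are split out of the old loop bodies so that a single escape can be decoded at a known offset; decode_u_escape_single keeps the existing validation, rejecting a code point when rune(code).length_in_bytes() returns -1 (in V's builtin this covers surrogate code points and values beyond 0x10FFFF). A minimal illustration of that builtin check, not part of the diff:

fn main() {
	assert rune(0x2605).length_in_bytes() == 3 // ★ needs three UTF-8 bytes
	assert rune(0xd800).length_in_bytes() == -1 // a lone surrogate is not a valid code point
	assert rune(0x110000).length_in_bytes() == -1 // beyond the Unicode range
}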

@@ -234,6 +234,9 @@ fn test_escape_string() {
result = scan_tokens(r"'\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'★'
result = scan_tokens(r"'H\u2605H'")
assert result[0].kind == .string
assert result[0].lit == r'H★H'
// STRING ESCAPED ASCII
result = scan_tokens(r"'\x61'")
@@ -246,6 +249,22 @@ fn test_escape_string() {
assert result[0].kind == .string
assert result[0].lit.bytes() == [u8(0xe2), `9`, `8`, `8`, `5`]
// MIX STRING ESCAPES
result = scan_tokens(r"'\x61\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'a★'
result = scan_tokens(r"'\u2605\x61'")
assert result[0].kind == .string
assert result[0].lit == r'★a'
// MIX STRING ESCAPES with offset
result = scan_tokens(r"'x \x61\u2605\x61'")
assert result[0].kind == .string
assert result[0].lit == r'x a★a'
result = scan_tokens(r"'x \u2605\x61\u2605'")
assert result[0].kind == .string
assert result[0].lit == r'x ★a★'
// SHOULD RESULT IN ERRORS
// result = scan_tokens(r'`\x61\x61`') // should always result in an error
// result = scan_tokens(r"'\x'") // should always result in an error
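
As a quick end-to-end check of the new behaviour (illustrative, not part of the test file), mixed escapes inside a single literal now evaluate as the assertions above expect:

fn main() {
	assert '\x61\u2605' == 'a★'
	assert 'x \u2605\x61\u2605' == 'x ★a★'
	println('H\u2605H') // H★H
}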