mirror of
https://github.com/vlang/v.git
synced 2023-08-10 21:13:21 +03:00
scanner: multibyte rune literals now support unicode, hex, and octal escape codes (#13140)
This commit is contained in:
@@ -15,4 +15,4 @@ vlib/v/checker/tests/sum_type_ref_variant_err.vv:9:18: error: sum type cannot ho
|
||||
7 | type Alphabet1 = Abc | string | &Xyz
|
||||
8 | type Alphabet2 = Abc | &Xyz | string
|
||||
9 | type Alphabet3 = &Xyz | Abc | string
|
||||
| ~~~~
|
||||
| ~~~~
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
vlib/v/parser/tests/duplicate_type_a.vv:3:11: error: cannot register interface `Foo`, another type with this name exists
|
||||
1 | struct Foo {}
|
||||
2 |
|
||||
2 |
|
||||
3 | interface Foo {}
|
||||
| ~~~
|
||||
|
||||
@@ -1307,6 +1307,28 @@ fn decode_h_escapes(s string, start int, escapes_pos []int) string {
|
||||
return ss.join('')
|
||||
}
|
||||
|
||||
// decode_o_escapes replaces the flagged single-byte octal escapes ('\###') in `s`
// with the byte they denote, and returns the resulting string.
//
// `escapes_pos` holds the positions of the backslashes that start each escape,
// and `start` is subtracted from every position to obtain an index into `s`.
// The positions are expected to be in ascending order.
//
// No utf-8 decoding happens here: '\141' simply becomes the byte 0o141.
// A sequence that cannot be parsed as an 8-bit octal number is replaced by a zero byte.
// An escape that is too short to hold three digits (because the string ends, or
// because the next flagged escape begins first) is copied through unchanged
// instead of being sliced out of range.
fn decode_o_escapes(s string, start int, escapes_pos []int) string {
	if escapes_pos.len == 0 {
		return s
	}
	// one leading segment, then a decoded byte plus a trailing segment per escape
	mut ss := []string{cap: escapes_pos.len * 2 + 1}
	ss << s[..escapes_pos.first() - start] // everything before the first escape code position
	for i, pos in escapes_pos {
		idx := pos - start
		end_idx := idx + 4 // "\XXX".len == 4
		// the literal text after this escape runs up to the next escape, or to the end of `s`
		next_idx := if i + 1 < escapes_pos.len { escapes_pos[i + 1] - start } else { s.len }
		if end_idx > next_idx {
			// truncated escape: keep it verbatim rather than slicing past its bounds
			ss << s[idx..next_idx]
			continue
		}
		// notice this function doesn't do any decoding... it just replaces '\141' with the byte 0o141
		ss << [byte(strconv.parse_uint(s[idx + 1..end_idx], 8, 8) or { 0 })].bytestr()
		ss << s[end_idx..next_idx]
	}
	return ss.join('')
}
|
||||
|
||||
// decode the flagged unicode escape sequences into their utf-8 bytes
|
||||
fn decode_u_escapes(s string, start int, escapes_pos []int) string {
|
||||
if escapes_pos.len == 0 {
|
||||
return s
|
||||
@@ -1348,9 +1370,10 @@ fn trim_slash_line_break(s string) string {
|
||||
/// possibilities:
|
||||
/// single chars like `a`, `b` => 'a', 'b'
|
||||
/// escaped single chars like `\\`, `\``, `\n` => '\\', '`', '\n'
|
||||
/// escaped hex bytes like `\x01`, `\x61` => '\x01', 'a'
|
||||
/// escaped multibyte runes like `\xe29885` => (★)
|
||||
/// escaped single hex bytes like `\x01`, `\x61` => '\x01', 'a'
|
||||
/// escaped unicode literals like `\u2605`
|
||||
/// escaped utf8 runes in hex like `\xe2\x98\x85` => (★)
|
||||
/// escaped utf8 runes in octal like `\342\230\205` => (★)
|
||||
fn (mut s Scanner) ident_char() string {
|
||||
lspos := token.Position{
|
||||
line_nr: s.line_nr
|
||||
@@ -1365,6 +1388,7 @@ fn (mut s Scanner) ident_char() string {
|
||||
// set flags for advanced escapes first
|
||||
escaped_hex := s.expect('\\x', start + 1)
|
||||
escaped_unicode := s.expect('\\u', start + 1)
|
||||
escaped_octal := !escaped_hex && !escaped_unicode && s.expect('\\', start + 1)
|
||||
|
||||
// walk the string to get characters up to the next backtick
|
||||
for {
|
||||
@@ -1390,65 +1414,40 @@ fn (mut s Scanner) ident_char() string {
|
||||
return c
|
||||
}
|
||||
if len != 1 {
|
||||
// the string inside the backticks is longer than one character
|
||||
// but we might only have one rune... attempt to decode escapes
|
||||
// if the content expresses an escape code, it will have an even number of characters
|
||||
// e.g. \x61 or \u2605
|
||||
if (c.len % 2 == 0) && (escaped_hex || escaped_unicode) {
|
||||
// e.g. (octal) \141 (hex) \x61 or (unicode) \u2605
|
||||
// we don't handle binary escape codes in rune literals
|
||||
orig := c
|
||||
if (c.len % 2 == 0) && (escaped_hex || escaped_unicode || escaped_octal) {
|
||||
if escaped_unicode {
|
||||
// there can only be one, so attempt to decode it now
|
||||
c = decode_u_escapes(c, 0, [0])
|
||||
} else {
|
||||
// we have to handle hex ourselves
|
||||
ascii_0 := byte(0x30)
|
||||
ascii_a := byte(0x61)
|
||||
mut accumulated := []byte{}
|
||||
val := c[2..c.len].to_lower() // 0A -> 0a
|
||||
mut offset := 0
|
||||
// take two characters at a time, parse as hex and add to bytes
|
||||
for {
|
||||
if offset >= val.len - 1 {
|
||||
break
|
||||
// find escape sequence start positions
|
||||
mut escapes_pos := []int{}
|
||||
for i, v in c {
|
||||
if v == `\\` {
|
||||
escapes_pos << i
|
||||
}
|
||||
mut byteval := byte(0)
|
||||
big := val[offset]
|
||||
little := val[offset + 1]
|
||||
if !big.is_hex_digit() {
|
||||
accumulated.clear()
|
||||
break
|
||||
}
|
||||
if !little.is_hex_digit() {
|
||||
accumulated.clear()
|
||||
break
|
||||
}
|
||||
|
||||
if big.is_digit() {
|
||||
byteval |= (big - ascii_0) << 4
|
||||
} else {
|
||||
byteval |= (big - ascii_a + 10) << 4
|
||||
}
|
||||
if little.is_digit() {
|
||||
byteval |= (little - ascii_0)
|
||||
} else {
|
||||
byteval |= (little - ascii_a + 10)
|
||||
}
|
||||
|
||||
accumulated << byteval
|
||||
offset += 2
|
||||
}
|
||||
if accumulated.len > 0 {
|
||||
c = accumulated.bytestr()
|
||||
if escaped_hex {
|
||||
c = decode_h_escapes(c, 0, escapes_pos)
|
||||
} else {
|
||||
c = decode_o_escapes(c, 0, escapes_pos)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// the string inside the backticks is longer than one character
|
||||
// but we might only have one rune, say in the case
|
||||
u := c.runes()
|
||||
if u.len != 1 {
|
||||
if escaped_hex || escaped_unicode {
|
||||
s.error('invalid character literal (escape sequence did not refer to a singular rune)')
|
||||
s.error('invalid character literal `$orig` => `$c` ($u) (escape sequence did not refer to a singular rune)')
|
||||
} else {
|
||||
s.add_error_detail_with_pos('use quotes for strings, backticks for characters',
|
||||
lspos)
|
||||
s.error('invalid character literal (more than one character)')
|
||||
s.error('invalid character literal `$orig` => `$c` ($u) (more than one character)')
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -150,13 +150,19 @@ fn test_ref_ref_array_ref_ref_foo() {
|
||||
assert result[6] == .name
|
||||
}
|
||||
|
||||
fn test_escape_string() {
|
||||
// these assertions aren't helpful...
|
||||
// they test the vlib built-in to the compiler,
|
||||
// but we want to test this module before compilation
|
||||
assert '\x61' == 'a'
|
||||
assert '\x62' == 'b'
|
||||
// assert `\x61` == `a` // will work after pull request goes through
|
||||
fn test_escape_rune() {
|
||||
// these lines work if the v compiler is working
|
||||
// will not work until v compiler on github is updated
|
||||
// assert `\x61` == `a`
|
||||
// assert `\u0061` == `a`
|
||||
|
||||
// will not work until PR is accepted
|
||||
// assert `\141` == `a`
|
||||
// assert `\xe2\x98\x85` == `★`
|
||||
// assert `\342\230\205` == `★`
|
||||
|
||||
// the following lines test the scanner module
|
||||
// even before it is compiled into the v executable
|
||||
|
||||
// SINGLE CHAR ESCAPES
|
||||
// SINGLE CHAR APOSTROPHE
|
||||
@@ -187,14 +193,30 @@ fn test_escape_string() {
|
||||
// SINGLE CHAR INCORRECT ESCAPE
|
||||
// result = scan_tokens(r'`\x61\x61`') // should always result in an error
|
||||
|
||||
// SINGLE CHAR MULTI-BYTE UTF-8
|
||||
// Compilation blocked by vlib/v/checker/check_types.v, but works in the repl
|
||||
result = scan_tokens(r'`\xe29885`')
|
||||
// SINGLE CHAR MULTI-BYTE UTF-8 (hex)
|
||||
result = scan_tokens(r'`\xe2\x98\x85`')
|
||||
assert result[0].lit == r'★'
|
||||
|
||||
// SINGLE CHAR MULTI-BYTE UTF-8 (octal)
|
||||
result = scan_tokens(r'`\342\230\205`')
|
||||
assert result[0].lit == r'★'
|
||||
}
|
||||
|
||||
fn test_escape_string() {
|
||||
// these lines work if the v compiler is working
|
||||
assert '\x61' == 'a'
|
||||
assert '\x62' == 'b'
|
||||
assert '\u0061' == 'a'
|
||||
assert '\141' == 'a'
|
||||
assert '\xe2\x98\x85' == '★'
|
||||
assert '\342\230\205' == '★'
|
||||
|
||||
// the following lines test the scanner module
|
||||
// even before it is compiled into the v executable
|
||||
|
||||
// STRING ESCAPES =================
|
||||
// STRING APOSTROPHE
|
||||
result = scan_tokens(r"'\''")
|
||||
mut result := scan_tokens(r"'\''")
|
||||
assert result[0].kind == .string
|
||||
assert result[0].lit == r"\'"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user