2020-07-22 18:36:24 +03:00
|
|
|
module utf8
|
2020-05-26 13:50:37 +03:00
|
|
|
|
2020-04-08 15:22:31 +03:00
|
|
|
// Utf8State holds the progress of the incremental UTF-8 validator
// that is driven one byte at a time through `next_state`.
struct Utf8State {
mut:
	// index selects the lead-byte class currently being tried:
	// 0 = between runes, 1 = 2-byte sequences, 2..5 = 3-byte sequences,
	// 6..8 = 4-byte sequences.
	index int
	// subindex counts the bytes of the current multi-byte sequence
	// that have been accepted so far.
	subindex int
	// failed is set once an invalid byte has been encountered.
	failed bool
}
|
2020-05-26 13:50:37 +03:00
|
|
|
|
2022-04-30 12:31:23 +03:00
|
|
|
// validate_str reports whether `str` consists of valid UTF-8 runes.
// It forwards the string's byte pointer and length to `validate`.
pub fn validate_str(str string) bool {
	bytes := str.str
	count := str.len
	return validate(bytes, count)
}
|
|
|
|
|
2022-04-30 12:31:23 +03:00
|
|
|
// validate reports whether the first `len` bytes at `data` consist of
// valid UTF-8 runes.
// Scanning stops at the first NUL byte; bytes after it are not examined.
// It returns false as soon as an invalid byte is met, and also when the
// input ends in the middle of a multi-byte sequence.
pub fn validate(data &u8, len int) bool {
	mut st := Utf8State{}
	for pos in 0 .. len {
		b := unsafe { data[pos] }
		if b == 0 {
			// NUL is treated as the end of the data
			break
		}
		st.next_state(b)
		if st.failed {
			return false
		}
	}
	// a non-zero subindex means a multi-byte sequence was left unfinished
	return st.subindex <= 0 && !st.failed
}
|
|
|
|
|
2020-10-17 16:26:56 +03:00
|
|
|
// seq tries to match the current byte against the lead-byte class selected
// by `s.index`.
// `r0` tells whether the byte is an acceptable lead byte for this class,
// `r1` whether it is an acceptable second byte, and `is_tail` whether it
// is a generic continuation byte.
// It returns true when the byte was consumed by this class (either accepted,
// or rejected with `s.failed` set). It returns false when the class does not
// match; in that case the state has been advanced to the next class, so that
// the caller can try it with the same byte.
fn (mut s Utf8State) seq(r0 bool, r1 bool, is_tail bool) bool {
	at_lead := s.subindex == 0
	// only 3- and 4-byte classes (index > 1) have a checked second byte
	at_second := s.index > 1 && s.subindex == 1
	// only 4-byte classes (index >= 6) have a non-final third byte
	at_third := s.index >= 6 && s.subindex == 2
	if !(at_lead || at_second || at_third) {
		// this is the final byte of the sequence: a continuation byte
		// completes the rune and resets the state, anything else fails
		if is_tail {
			s.index = 0
			s.subindex = 0
			s.failed = false
		} else {
			s.failed = true
		}
		return true
	}
	accepted := if at_lead {
		r0
	} else if at_second {
		r1
	} else {
		is_tail
	}
	if accepted {
		s.subindex++
		return true
	}
	// no match for this class: move on to the next one
	s.index++
	s.subindex = 0
	return false
}
|
|
|
|
|
2022-04-15 18:25:45 +03:00
|
|
|
// next_state feeds the byte `c` into the validator and updates the state.
// `s.failed` is set when `c` cannot appear at the current position
// of a well-formed UTF-8 byte sequence.
fn (mut s Utf8State) next_state(c u8) {
	if s.index == 0 {
		// 1-byte sequence: 0x00..0x7F stands on its own
		if c <= 0x7F {
			return
		}
		// otherwise start trying the multi-byte classes, in order
		s.index = 1
		s.subindex = 0
	}
	is_tail := c >= 0x80 && c <= 0xBF

	// 2-byte sequences
	if s.index == 1 && s.seq(c >= 0xC2 && c <= 0xDF, false, is_tail) {
		return
	}

	// 3-byte sequences; the second byte range depends on the lead byte
	if s.index == 2 && s.seq(c == 0xE0, c >= 0xA0 && c <= 0xBF, is_tail) {
		return
	}
	if s.index == 3 && s.seq(c >= 0xE1 && c <= 0xEC, is_tail, is_tail) {
		return
	}
	if s.index == 4 && s.seq(c == 0xED, c >= 0x80 && c <= 0x9F, is_tail) {
		return
	}
	if s.index == 5 && s.seq(c >= 0xEE && c <= 0xEF, is_tail, is_tail) {
		return
	}

	// 4-byte sequences; the second byte range depends on the lead byte
	if s.index == 6 && s.seq(c == 0xF0, c >= 0x90 && c <= 0xBF, is_tail) {
		return
	}
	if s.index == 7 && s.seq(c >= 0xF1 && c <= 0xF3, is_tail, is_tail) {
		return
	}
	if s.index == 8 && s.seq(c == 0xF4, c >= 0x80 && c <= 0x8F, is_tail) {
		return
	}

	// no class accepted the byte (for example an invalid lead byte, or a
	// second/third byte outside the allowed range): the input is invalid
	s.failed = true
}
|