mirror of
https://github.com/vlang/v.git
synced 2023-08-10 21:13:21 +03:00
regex: implement negation groups, more flexibility for bsls, small fixes (#12981)
* removed memory allocations in cleaning during clear calls * first test implementation of negative groups, more flexibility for bsls * fixed bsls failed tests * fmt * added \n to regex tests
This commit is contained in:
parent
14648fa41e
commit
dadc965082
@ -17,30 +17,31 @@ module regex
|
||||
import strings
|
||||
|
||||
pub const (
|
||||
v_regex_version = '1.0 alpha' // regex module version
|
||||
v_regex_version = '1.0 alpha' // regex module version
|
||||
|
||||
max_code_len = 256 // default small base code len for the regex programs
|
||||
max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
|
||||
max_code_len = 256 // default small base code len for the regex programs
|
||||
max_quantifier = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
|
||||
// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
|
||||
spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
|
||||
spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
|
||||
// new line chars, for now only '\n' and '\r'
|
||||
new_line_list = [`\n`, `\r`]
|
||||
new_line_list = [`\n`, `\r`]
|
||||
|
||||
// Results
|
||||
no_match_found = -1
|
||||
no_match_found = -1
|
||||
|
||||
// Errors
|
||||
compile_ok = 0 // the regex string compiled, all ok
|
||||
err_char_unknown = -2 // the char used is unknown to the system
|
||||
err_undefined = -3 // the compiler symbol is undefined
|
||||
err_internal_error = -4 // Bug in the regex system!!
|
||||
err_cc_alloc_overflow = -5 // memory for char class full!!
|
||||
err_syntax_error = -6 // syntax error in regex compiling
|
||||
err_groups_overflow = -7 // max number of groups reached
|
||||
err_groups_max_nested = -8 // max number of nested group reached
|
||||
err_group_not_balanced = -9 // group not balanced
|
||||
err_group_qm_notation = -10 // group invalid notation
|
||||
err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
|
||||
compile_ok = 0 // the regex string compiled, all ok
|
||||
err_char_unknown = -2 // the char used is unknown to the system
|
||||
err_undefined = -3 // the compiler symbol is undefined
|
||||
err_internal_error = -4 // Bug in the regex system!!
|
||||
err_cc_alloc_overflow = -5 // memory for char class full!!
|
||||
err_syntax_error = -6 // syntax error in regex compiling
|
||||
err_groups_overflow = -7 // max number of groups reached
|
||||
err_groups_max_nested = -8 // max number of nested group reached
|
||||
err_group_not_balanced = -9 // group not balanced
|
||||
err_group_qm_notation = -10 // group invalid notation
|
||||
err_invalid_or_with_cc = -11 // invalid or on two consecutive char class
|
||||
err_neg_group_quantifier = -12 // negation groups can not have quantifier
|
||||
)
|
||||
|
||||
const (
|
||||
@ -198,6 +199,7 @@ pub fn (re RE) get_parse_error_string(err int) string {
|
||||
regex.err_group_not_balanced { return 'err_group_not_balanced' }
|
||||
regex.err_group_qm_notation { return 'err_group_qm_notation' }
|
||||
regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
|
||||
regex.err_neg_group_quantifier { return 'err_neg_group_quantifier' }
|
||||
else { return 'err_unknown' }
|
||||
}
|
||||
}
|
||||
@ -246,13 +248,15 @@ mut:
|
||||
// validator function pointer
|
||||
validator FnValidator
|
||||
// groups variables
|
||||
group_rep int // repetition of the group
|
||||
group_neg bool // negation flag for the group: false => no negation, true => negation
|
||||
group_rep int // repetition of the group
|
||||
group_id int = -1 // id of the group
|
||||
goto_pc int = -1 // jump to this PC if is needed
|
||||
// OR flag for the token
|
||||
next_is_or bool // true if the next token is an OR
|
||||
// dot_char token variables
|
||||
dot_check_pc int = -1 // pc of the next token to check
|
||||
dot_check_pc int = -1 // pc of the next token to check for dots
|
||||
bsls_check_pc int = -1 // pc of the next token to check for bsls
|
||||
last_dot_flag bool // if true indicate that is the last dot_char in the regex
|
||||
// debug fields
|
||||
source_index int
|
||||
@ -333,7 +337,17 @@ fn (mut re RE) reset() {
|
||||
|
||||
// init groups array
|
||||
if re.group_count > 0 {
|
||||
re.groups = []int{len: re.group_count * 2, init: -1}
|
||||
if re.groups.len == 0 {
|
||||
// first run alloc memory
|
||||
re.groups = []int{len: re.group_count * 2, init: -1}
|
||||
} else {
|
||||
// subsequent executions, only clean up the memory
|
||||
i = 0
|
||||
for i < re.groups.len {
|
||||
re.groups[i] = -1
|
||||
i++
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// reset group_csave
|
||||
@ -811,8 +825,8 @@ enum Group_parse_state {
|
||||
finish
|
||||
}
|
||||
|
||||
// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index)
|
||||
fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
|
||||
// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, negate_flag, name_of_the_group, next_index)
|
||||
fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) {
|
||||
mut status := Group_parse_state.start
|
||||
mut i := in_i
|
||||
mut name := ''
|
||||
@ -836,10 +850,16 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
|
||||
continue
|
||||
}
|
||||
|
||||
// negate group
|
||||
if status == .q_mark1 && ch == `!` {
|
||||
i += char_len
|
||||
return 0, false, true, name, i
|
||||
}
|
||||
|
||||
// non capturing group
|
||||
if status == .q_mark1 && ch == `:` {
|
||||
i += char_len
|
||||
return 0, false, name, i
|
||||
return 0, false, false, name, i
|
||||
}
|
||||
|
||||
// enter in P section
|
||||
@ -852,7 +872,7 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
|
||||
// not a valid q mark found
|
||||
if status == .q_mark1 {
|
||||
// println("NO VALID Q MARK")
|
||||
return -2, true, name, i
|
||||
return -2, true, false, name, i
|
||||
}
|
||||
|
||||
if status == .p_status && ch == `<` {
|
||||
@ -878,20 +898,20 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
|
||||
// end name
|
||||
if status == .p_in_name && ch == `>` {
|
||||
i += char_len
|
||||
return 0, true, name, i
|
||||
return 0, true, false, name, i
|
||||
}
|
||||
|
||||
// error on name group
|
||||
if status == .p_in_name {
|
||||
return -2, true, name, i
|
||||
return -2, true, false, name, i
|
||||
}
|
||||
|
||||
// normal group, nothing to do, exit
|
||||
return 0, true, name, i
|
||||
return 0, true, false, name, i
|
||||
}
|
||||
// UNREACHABLE
|
||||
// println("ERROR!! NOT MEANT TO BE HERE!!1")
|
||||
return -2, true, name, i
|
||||
return -2, true, false, name, i
|
||||
}
|
||||
|
||||
const (
|
||||
@ -949,7 +969,8 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
||||
return regex.err_groups_max_nested, i + 1
|
||||
}
|
||||
|
||||
tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt, i)
|
||||
tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt,
|
||||
i)
|
||||
|
||||
// manage question mark format error
|
||||
if tmp_res < -1 {
|
||||
@ -984,6 +1005,12 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
||||
re.prog[pc].rep_min = 1
|
||||
re.prog[pc].rep_max = 1
|
||||
|
||||
// manage negation groups
|
||||
if negate_flag == true {
|
||||
re.prog[pc].group_neg = true
|
||||
re.prog[pc].rep_min = 0 // the group may not be caught, but it is ok
|
||||
}
|
||||
|
||||
// set the group id
|
||||
if cgroup_flag == false {
|
||||
// println("NO CAPTURE GROUP")
|
||||
@ -1015,6 +1042,11 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
||||
re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
|
||||
// re.prog[goto_pc].group_id = group_count // id of this group, used for storing data
|
||||
|
||||
if re.prog[goto_pc].group_neg == true {
|
||||
re.prog[pc].group_neg = re.prog[goto_pc].group_neg
|
||||
re.prog[pc].rep_min = re.prog[goto_pc].rep_min
|
||||
}
|
||||
|
||||
pc = pc + 1
|
||||
i = i + char_len
|
||||
continue
|
||||
@ -1050,6 +1082,12 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
||||
char_next, char_next_len = re.get_char(in_txt, i + char_len)
|
||||
}
|
||||
mut quant_flag := true
|
||||
|
||||
// negation groups can not have quantifiers
|
||||
if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] {
|
||||
return regex.err_neg_group_quantifier, i
|
||||
}
|
||||
|
||||
match byte(char_tmp) {
|
||||
`?` {
|
||||
// println("q: ${char_tmp:c}")
|
||||
@ -1215,6 +1253,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
||||
dot_char_count++
|
||||
mut pc2 := pc1 + 1
|
||||
for pc2 < pc {
|
||||
// consecutive dot chars is an error
|
||||
if re.prog[pc2].ist == regex.ist_dot_char {
|
||||
return regex.err_syntax_error, 0
|
||||
}
|
||||
@ -1246,6 +1285,49 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// manage bsls_char
|
||||
//
|
||||
|
||||
// find the checks for bsls, if any...
|
||||
pc1 = 0
|
||||
mut bsls_char_count := 0
|
||||
mut last_bsls_char_pc := -1
|
||||
for pc1 < pc {
|
||||
if re.prog[pc1].ist == regex.ist_bsls_char {
|
||||
// println("bsls_char pc: $pc1")
|
||||
last_bsls_char_pc = pc1
|
||||
bsls_char_count++
|
||||
mut pc2 := pc1 + 1
|
||||
for pc2 < pc {
|
||||
if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
|
||||
regex.ist_group_start] {
|
||||
// println("Next bsls check is PC: ${pc2}")
|
||||
re.prog[pc1].bsls_check_pc = pc2
|
||||
break
|
||||
}
|
||||
pc2++
|
||||
}
|
||||
}
|
||||
pc1++
|
||||
}
|
||||
|
||||
// println("last_bsls_char_pc: $last_bsls_char_pc")
|
||||
if last_bsls_char_pc >= 0 {
|
||||
pc1 = last_bsls_char_pc + 1
|
||||
mut is_last_bsls := true
|
||||
for pc1 < pc {
|
||||
if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
|
||||
is_last_bsls = false
|
||||
break
|
||||
}
|
||||
pc1++
|
||||
}
|
||||
if is_last_bsls {
|
||||
re.prog[last_bsls_char_pc].last_dot_flag = true
|
||||
}
|
||||
}
|
||||
|
||||
//******************************************
|
||||
|
||||
// OR branch
|
||||
@ -1405,14 +1487,15 @@ pub fn (re RE) get_query() string {
|
||||
|
||||
// GROUP start
|
||||
if ch == regex.ist_group_start {
|
||||
if re.debug == 0 {
|
||||
res.write_string('(')
|
||||
} else {
|
||||
if tk.group_id == -1 {
|
||||
res.write_string('(?:') // non capturing group
|
||||
} else {
|
||||
res.write_string('#${tk.group_id}(')
|
||||
}
|
||||
if re.debug > 0 {
|
||||
res.write_string('#$tk.group_id')
|
||||
}
|
||||
res.write_string('(')
|
||||
|
||||
if tk.group_neg == true {
|
||||
res.write_string('?!') // negation group
|
||||
} else if tk.group_id == -1 {
|
||||
res.write_string('?:') // non capturing group
|
||||
}
|
||||
|
||||
for x in re.group_map.keys() {
|
||||
@ -1470,7 +1553,7 @@ pub fn (re RE) get_query() string {
|
||||
}
|
||||
|
||||
// quantifier
|
||||
if !(tk.rep_min == 1 && tk.rep_max == 1) {
|
||||
if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false {
|
||||
if tk.rep_min == 0 && tk.rep_max == 1 {
|
||||
res.write_string('?')
|
||||
} else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier {
|
||||
@ -2081,6 +2164,7 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
|
||||
continue
|
||||
}
|
||||
// check bsls
|
||||
/*
|
||||
else if ist == regex.ist_bsls_char {
|
||||
state.match_flag = false
|
||||
tmp_res := re.prog[state.pc].validator(byte(ch))
|
||||
@ -2103,6 +2187,101 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
|
||||
m_state = .ist_quant_n
|
||||
continue
|
||||
}
|
||||
*/
|
||||
else if ist == regex.ist_bsls_char {
|
||||
// println("ist_bsls_char rep: ${re.prog[state.pc].rep}")
|
||||
|
||||
// check next token to be false
|
||||
mut next_check_flag := false
|
||||
|
||||
// if we are done with max go on dot char are dedicated case!!
|
||||
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
|
||||
re.state_list.pop()
|
||||
m_state = .ist_next
|
||||
continue
|
||||
}
|
||||
|
||||
if re.prog[state.pc].bsls_check_pc >= 0
|
||||
&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
|
||||
// load the char
|
||||
// ch_t, _ := re.get_charb(in_txt, state.i+char_len)
|
||||
ch_t := ch
|
||||
chk_pc := re.prog[state.pc].bsls_check_pc
|
||||
|
||||
// simple char
|
||||
if re.prog[chk_pc].ist == regex.ist_simple_char {
|
||||
if re.prog[chk_pc].ch == ch_t {
|
||||
next_check_flag = true
|
||||
}
|
||||
// println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
|
||||
}
|
||||
// char char_class
|
||||
else if re.prog[chk_pc].ist == regex.ist_char_class_pos
|
||||
|| re.prog[chk_pc].ist == regex.ist_char_class_neg {
|
||||
mut cc_neg := false
|
||||
if re.prog[chk_pc].ist == regex.ist_char_class_neg {
|
||||
cc_neg = true
|
||||
}
|
||||
mut cc_res := re.check_char_class(chk_pc, ch_t)
|
||||
|
||||
if cc_neg {
|
||||
cc_res = !cc_res
|
||||
}
|
||||
next_check_flag = cc_res
|
||||
// println("Check [ist_char_class] => $next_check_flag")
|
||||
}
|
||||
// check bsls
|
||||
else if re.prog[chk_pc].ist == regex.ist_bsls_char {
|
||||
next_check_flag = re.prog[chk_pc].validator(byte(ch_t))
|
||||
// println("Check [ist_bsls_char] => $next_check_flag")
|
||||
}
|
||||
}
|
||||
|
||||
// check if we must continue or pass to the next IST
|
||||
if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
|
||||
// println("save the state!!")
|
||||
mut dot_state := StateObj{
|
||||
group_index: state.group_index
|
||||
match_flag: state.match_flag
|
||||
match_index: state.match_index
|
||||
first_match: state.first_match
|
||||
pc: state.pc
|
||||
i: state.i + char_len
|
||||
char_len: char_len
|
||||
last_dot_pc: state.pc
|
||||
}
|
||||
// if we are managing a .* stay on the same char on return
|
||||
if re.prog[state.pc].rep_min == 0 {
|
||||
dot_state.i -= char_len
|
||||
}
|
||||
|
||||
re.state_list << dot_state
|
||||
|
||||
m_state = .ist_quant_n
|
||||
// println("dot_char stack len: ${re.state_list.len}")
|
||||
continue
|
||||
}
|
||||
|
||||
tmp_res := re.prog[state.pc].validator(byte(ch))
|
||||
if tmp_res == false {
|
||||
m_state = .ist_quant_n
|
||||
continue
|
||||
}
|
||||
// println("${ch} => ${tmp_res}")
|
||||
|
||||
state.match_flag = true
|
||||
l_ist = u32(regex.ist_dot_char)
|
||||
|
||||
if state.first_match < 0 {
|
||||
state.first_match = state.i
|
||||
}
|
||||
state.match_index = state.i
|
||||
re.prog[state.pc].rep++ // increase repetitions
|
||||
|
||||
state.i += char_len
|
||||
m_state = .ist_quant_p
|
||||
continue
|
||||
}
|
||||
// simple char IST
|
||||
else if ist == regex.ist_simple_char {
|
||||
// println("ist_simple_char")
|
||||
@ -2213,6 +2392,13 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) {
|
||||
tmp_pc = re.group_data[state.group_index]
|
||||
}
|
||||
|
||||
if re.prog[tmp_pc].group_neg == true {
|
||||
// println("***** Negation of the group")
|
||||
result = regex.no_match_found
|
||||
m_state = .stop
|
||||
continue
|
||||
}
|
||||
|
||||
rep := re.prog[tmp_pc].group_rep
|
||||
|
||||
if rep < re.prog[tmp_pc].rep_min {
|
||||
|
@ -760,3 +760,27 @@ fn test_long_query() {
|
||||
//println("$start, $end")
|
||||
assert start >= 0 && end == base_string.len
|
||||
}
|
||||
|
||||
|
||||
// Test_negation_group is one table entry for test_negation_groups:
// an input string plus whether the regex is expected to match it.
struct Test_negation_group {
|
||||
src string // text passed to re.match_string
|
||||
res bool // expected value of (start >= 0) after matching `src`
|
||||
}
|
||||
// negation_groups is the case table iterated by test_negation_groups;
// each entry pairs an input string with the expected match outcome.
const(
|
||||
negation_groups = [
|
||||
Test_negation_group{'automobile',false},
|
||||
Test_negation_group{'botomobile',true},
|
||||
Test_negation_group{'auto_caravan',false},
|
||||
Test_negation_group{'moto_mobile',true},
|
||||
Test_negation_group{'pippole',true},
|
||||
Test_negation_group{'boring test',false},
|
||||
]
|
||||
)
|
||||
// test_negation_groups compiles a query that starts with a negation group
// `(?!auto)` and checks every entry of `negation_groups` against it.
fn test_negation_groups() {
|
||||
mut query := r"(?!auto)\w+le"
|
||||
// panic if the query fails to compile
mut re := regex.regex_opt(query) or { panic(err) }
|
||||
for test in negation_groups {
|
||||
// only `start` is checked below; `end` is not used by this test
start, end := re.match_string(test.src)
|
||||
// a match was found (start >= 0) exactly when the case expects one
assert (start >= 0) == test.res
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user