1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00

regex: fix a bug for CC token not correctly parsed in groups (#16272)

This commit is contained in:
penguindark 2022-11-01 07:58:29 +01:00 committed by GitHub
parent 47a10f3181
commit 362adfae3a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 83 additions and 14 deletions

View File

@ -259,6 +259,7 @@ mut:
// dot_char token variables // dot_char token variables
dot_check_pc int = -1 // pc of the next token to check for dots dot_check_pc int = -1 // pc of the next token to check for dots
bsls_check_pc int = -1 // pc of the next token to check for bsls bsls_check_pc int = -1 // pc of the next token to check for bsls
cc_check_pc int = -1 // pc of the next token to check for CC
last_dot_flag bool // if true indicate that is the last dot_char in the regex last_dot_flag bool // if true indicate that is the last dot_char in the regex
// debug fields // debug fields
source_index int source_index int
@ -1270,7 +1271,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
pc1++ pc1++
} }
// println("last_dot_char_pc: $last_dot_char_pc") // println("last_dot_char_pc: ${last_dot_char_pc}")
if last_dot_char_pc >= 0 { if last_dot_char_pc >= 0 {
pc1 = last_dot_char_pc + 1 pc1 = last_dot_char_pc + 1
mut is_last_dot := true mut is_last_dot := true
@ -1313,7 +1314,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
pc1++ pc1++
} }
// println('last_bsls_char_pc: $last_bsls_char_pc') // println('last_bsls_char_pc: ${last_bsls_char_pc}')
if last_bsls_char_pc >= 0 { if last_bsls_char_pc >= 0 {
pc1 = last_bsls_char_pc + 1 pc1 = last_bsls_char_pc + 1
mut is_last_bsls := true mut is_last_bsls := true
@ -1329,6 +1330,46 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
} }
} }
//
// manage CC
//
pc1 = 0
mut cc_char_count := 0
mut last_cc_char_pc := -1
for pc1 < pc {
if re.prog[pc1].ist in [rune(regex.ist_char_class_pos), regex.ist_char_class_neg] {
last_cc_char_pc = pc1
cc_char_count++
mut pc2 := pc1 + 1
for pc2 < pc {
if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
regex.ist_group_start] {
// println("Next CC check is PC: ${pc2}")
re.prog[pc1].cc_check_pc = pc2
break
}
pc2++
}
}
pc1++
}
// println('last_cc_char_pc: ${last_cc_char_pc}')
if last_cc_char_pc >= 0 {
pc1 = last_cc_char_pc + 1
mut is_last_cc := true
for pc1 < pc {
if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
is_last_cc = false
break
}
pc1++
}
if is_last_cc {
re.prog[last_cc_char_pc].last_dot_flag = true
}
}
//****************************************** //******************************************
// OR branch // OR branch
@ -1417,6 +1458,9 @@ pub fn (re RE) get_code() string {
ist := tk.ist ist := tk.ist
if ist == regex.ist_bsls_char { if ist == regex.ist_bsls_char {
res.write_string('[\\${tk.ch:1c}] BSLS') res.write_string('[\\${tk.ch:1c}] BSLS')
if tk.last_dot_flag == true {
res.write_string(' last!')
}
} else if ist == regex.ist_prog_end { } else if ist == regex.ist_prog_end {
res.write_string('PROG_END') res.write_string('PROG_END')
stop_flag = true stop_flag = true
@ -1424,8 +1468,14 @@ pub fn (re RE) get_code() string {
res.write_string('OR ') res.write_string('OR ')
} else if ist == regex.ist_char_class_pos { } else if ist == regex.ist_char_class_pos {
res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS') res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS')
if tk.last_dot_flag == true {
res.write_string(' last!')
}
} else if ist == regex.ist_char_class_neg { } else if ist == regex.ist_char_class_neg {
res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG') res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG')
if tk.last_dot_flag == true {
res.write_string(' last!')
}
} else if ist == regex.ist_dot_char { } else if ist == regex.ist_dot_char {
res.write_string('. DOT_CHAR nx chk: $tk.dot_check_pc') res.write_string('. DOT_CHAR nx chk: $tk.dot_check_pc')
if tk.last_dot_flag == true { if tk.last_dot_flag == true {
@ -1788,9 +1838,15 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
// println("Finished text!!") // println("Finished text!!")
src_end = true src_end = true
// we have finished the text, we must manage out-of-bound indexes
if state.i >= in_txt_len {
state.i = in_txt_len - 1
}
// manage groups // manage groups
if state.group_index >= 0 && state.match_index >= 0 { if state.group_index >= 0 && state.match_index >= 0 {
// println("End text with open groups!") // println("End text with open groups!")
// println("state.group_index: ${state.group_index}")
// close the groups // close the groups
for state.group_index >= 0 { for state.group_index >= 0 {
tmp_pc := re.group_data[state.group_index] tmp_pc := re.group_data[state.group_index]
@ -1804,15 +1860,13 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
// save group results // save group results
g_index := re.prog[tmp_pc].group_id * 2 g_index := re.prog[tmp_pc].group_id * 2
// println("group_id: ${re.prog[tmp_pc].group_id} g_index: ${g_index}")
if start_i >= 0 { if start_i >= 0 {
re.groups[g_index] = start_i re.groups[g_index] = start_i
} else { } else {
re.groups[g_index] = 0 re.groups[g_index] = 0
} }
// we have finished the text, we must manage out-of-bound indexes
if state.i >= in_txt_len {
state.i = in_txt_len - 1
}
re.groups[g_index + 1] = state.i re.groups[g_index + 1] = state.i
if re.groups[g_index + 1] >= in_txt_len { if re.groups[g_index + 1] >= in_txt_len {
@ -1827,6 +1881,8 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
} }
} }
// println("re.groups: ${re.groups}")
// the text is finished and the groups closed and we are the last group, ok exit // the text is finished and the groups closed and we are the last group, ok exit
if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end { if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
// println("Last group end") // println("Last group end")
@ -1847,12 +1903,17 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
return state.first_match, state.i return state.first_match, state.i
} }
// we are in a last dot_ char case if l_ist in [
if l_ist == regex.ist_dot_char { rune(regex.ist_char_class_neg),
// println("***** We have a last dot_char") regex.ist_char_class_pos,
regex.ist_bsls_char,
regex.ist_dot_char,
] {
// println("***** We have a last special token")
// println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}") // println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}")
// println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}") // println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}")
// println("first match: ${state.first_match}") // println("first match: ${state.first_match}")
if re.prog[state.pc].last_dot_flag == true if re.prog[state.pc].last_dot_flag == true
&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min && re.prog[state.pc].rep >= re.prog[state.pc].rep_min
&& re.prog[state.pc].rep <= re.prog[state.pc].rep_max { && re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
@ -1860,10 +1921,6 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
} }
// println("Not fitted!!") // println("Not fitted!!")
} }
// m_state = .end
// break
// no groups open, check the last token quantifier // no groups open, check the last token quantifier
if ist != regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end { if ist != regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
if re.prog[state.pc].rep >= re.prog[state.pc].rep_min if re.prog[state.pc].rep >= re.prog[state.pc].rep_min
@ -1873,7 +1930,7 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
} }
} }
// print("No good exit!!") // println("No good exit!!")
return regex.no_match_found, state.i return regex.no_match_found, state.i
} }

View File

@ -364,6 +364,18 @@ find_all_test_suite = [
r"\+{3}.*\+{3}", r"\+{3}.*\+{3}",
[0, 11, 18, 32, 33, 44], [0, 11, 18, 32, 33, 44],
['+++pippo+++', '+++ pippo2 +++', '+++ oggi+++'] ['+++pippo+++', '+++ pippo2 +++', '+++ oggi+++']
},
Test_find_all{
"ab",
r"[^\n]*",
[0, 2],
['ab']
},
Test_find_all{
"ab",
r"([^\n]*)",
[0, 2],
['ab']
} }
] ]