mirror of
https://github.com/vlang/v.git
synced 2023-08-10 21:13:21 +03:00
regex: fix a bug for CC token not correctly parsed in groups (#16272)
This commit is contained in:
parent
47a10f3181
commit
362adfae3a
@ -259,6 +259,7 @@ mut:
|
|||||||
// dot_char token variables
|
// dot_char token variables
|
||||||
dot_check_pc int = -1 // pc of the next token to check for dots
|
dot_check_pc int = -1 // pc of the next token to check for dots
|
||||||
bsls_check_pc int = -1 // pc of the next token to check for bsls
|
bsls_check_pc int = -1 // pc of the next token to check for bsls
|
||||||
|
cc_check_pc int = -1 // pc of the next token to check for CC
|
||||||
last_dot_flag bool // if true, indicates that this is the last dot_char in the regex
|
last_dot_flag bool // if true, indicates that this is the last dot_char in the regex
|
||||||
// debug fields
|
// debug fields
|
||||||
source_index int
|
source_index int
|
||||||
@ -1270,7 +1271,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
|||||||
pc1++
|
pc1++
|
||||||
}
|
}
|
||||||
|
|
||||||
// println("last_dot_char_pc: $last_dot_char_pc")
|
// println("last_dot_char_pc: ${last_dot_char_pc}")
|
||||||
if last_dot_char_pc >= 0 {
|
if last_dot_char_pc >= 0 {
|
||||||
pc1 = last_dot_char_pc + 1
|
pc1 = last_dot_char_pc + 1
|
||||||
mut is_last_dot := true
|
mut is_last_dot := true
|
||||||
@ -1313,7 +1314,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
|||||||
pc1++
|
pc1++
|
||||||
}
|
}
|
||||||
|
|
||||||
// println('last_bsls_char_pc: $last_bsls_char_pc')
|
// println('last_bsls_char_pc: ${last_bsls_char_pc}')
|
||||||
if last_bsls_char_pc >= 0 {
|
if last_bsls_char_pc >= 0 {
|
||||||
pc1 = last_bsls_char_pc + 1
|
pc1 = last_bsls_char_pc + 1
|
||||||
mut is_last_bsls := true
|
mut is_last_bsls := true
|
||||||
@ -1329,6 +1330,46 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// manage CC
|
||||||
|
//
|
||||||
|
pc1 = 0
|
||||||
|
mut cc_char_count := 0
|
||||||
|
mut last_cc_char_pc := -1
|
||||||
|
for pc1 < pc {
|
||||||
|
if re.prog[pc1].ist in [rune(regex.ist_char_class_pos), regex.ist_char_class_neg] {
|
||||||
|
last_cc_char_pc = pc1
|
||||||
|
cc_char_count++
|
||||||
|
mut pc2 := pc1 + 1
|
||||||
|
for pc2 < pc {
|
||||||
|
if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
|
||||||
|
regex.ist_group_start] {
|
||||||
|
// println("Next CC check is PC: ${pc2}")
|
||||||
|
re.prog[pc1].cc_check_pc = pc2
|
||||||
|
break
|
||||||
|
}
|
||||||
|
pc2++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pc1++
|
||||||
|
}
|
||||||
|
|
||||||
|
// println('last_cc_char_pc: ${last_cc_char_pc}')
|
||||||
|
if last_cc_char_pc >= 0 {
|
||||||
|
pc1 = last_cc_char_pc + 1
|
||||||
|
mut is_last_cc := true
|
||||||
|
for pc1 < pc {
|
||||||
|
if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
|
||||||
|
is_last_cc = false
|
||||||
|
break
|
||||||
|
}
|
||||||
|
pc1++
|
||||||
|
}
|
||||||
|
if is_last_cc {
|
||||||
|
re.prog[last_cc_char_pc].last_dot_flag = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
//******************************************
|
//******************************************
|
||||||
|
|
||||||
// OR branch
|
// OR branch
|
||||||
@ -1417,6 +1458,9 @@ pub fn (re RE) get_code() string {
|
|||||||
ist := tk.ist
|
ist := tk.ist
|
||||||
if ist == regex.ist_bsls_char {
|
if ist == regex.ist_bsls_char {
|
||||||
res.write_string('[\\${tk.ch:1c}] BSLS')
|
res.write_string('[\\${tk.ch:1c}] BSLS')
|
||||||
|
if tk.last_dot_flag == true {
|
||||||
|
res.write_string(' last!')
|
||||||
|
}
|
||||||
} else if ist == regex.ist_prog_end {
|
} else if ist == regex.ist_prog_end {
|
||||||
res.write_string('PROG_END')
|
res.write_string('PROG_END')
|
||||||
stop_flag = true
|
stop_flag = true
|
||||||
@ -1424,8 +1468,14 @@ pub fn (re RE) get_code() string {
|
|||||||
res.write_string('OR ')
|
res.write_string('OR ')
|
||||||
} else if ist == regex.ist_char_class_pos {
|
} else if ist == regex.ist_char_class_pos {
|
||||||
res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS')
|
res.write_string('[${re.get_char_class(pc1)}] CHAR_CLASS_POS')
|
||||||
|
if tk.last_dot_flag == true {
|
||||||
|
res.write_string(' last!')
|
||||||
|
}
|
||||||
} else if ist == regex.ist_char_class_neg {
|
} else if ist == regex.ist_char_class_neg {
|
||||||
res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG')
|
res.write_string('[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG')
|
||||||
|
if tk.last_dot_flag == true {
|
||||||
|
res.write_string(' last!')
|
||||||
|
}
|
||||||
} else if ist == regex.ist_dot_char {
|
} else if ist == regex.ist_dot_char {
|
||||||
res.write_string('. DOT_CHAR nx chk: $tk.dot_check_pc')
|
res.write_string('. DOT_CHAR nx chk: $tk.dot_check_pc')
|
||||||
if tk.last_dot_flag == true {
|
if tk.last_dot_flag == true {
|
||||||
@ -1788,9 +1838,15 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
|
|||||||
// println("Finished text!!")
|
// println("Finished text!!")
|
||||||
src_end = true
|
src_end = true
|
||||||
|
|
||||||
|
// we have finished the text, we must manage out of bound indexes
|
||||||
|
if state.i >= in_txt_len {
|
||||||
|
state.i = in_txt_len - 1
|
||||||
|
}
|
||||||
|
|
||||||
// manage groups
|
// manage groups
|
||||||
if state.group_index >= 0 && state.match_index >= 0 {
|
if state.group_index >= 0 && state.match_index >= 0 {
|
||||||
// println("End text with open groups!")
|
// println("End text with open groups!")
|
||||||
|
// println("state.group_index: ${state.group_index}")
|
||||||
// close the groups
|
// close the groups
|
||||||
for state.group_index >= 0 {
|
for state.group_index >= 0 {
|
||||||
tmp_pc := re.group_data[state.group_index]
|
tmp_pc := re.group_data[state.group_index]
|
||||||
@ -1804,15 +1860,13 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
|
|||||||
|
|
||||||
// save group results
|
// save group results
|
||||||
g_index := re.prog[tmp_pc].group_id * 2
|
g_index := re.prog[tmp_pc].group_id * 2
|
||||||
|
// println("group_id: ${re.prog[tmp_pc].group_id} g_index: ${g_index}")
|
||||||
if start_i >= 0 {
|
if start_i >= 0 {
|
||||||
re.groups[g_index] = start_i
|
re.groups[g_index] = start_i
|
||||||
} else {
|
} else {
|
||||||
re.groups[g_index] = 0
|
re.groups[g_index] = 0
|
||||||
}
|
}
|
||||||
// we have finished the text, we must manage out of bound indexes
|
|
||||||
if state.i >= in_txt_len {
|
|
||||||
state.i = in_txt_len - 1
|
|
||||||
}
|
|
||||||
re.groups[g_index + 1] = state.i
|
re.groups[g_index + 1] = state.i
|
||||||
|
|
||||||
if re.groups[g_index + 1] >= in_txt_len {
|
if re.groups[g_index + 1] >= in_txt_len {
|
||||||
@ -1827,6 +1881,8 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// println("re.groups: ${re.groups}")
|
||||||
|
|
||||||
// the text is finished and the groups closed and we are the last group, ok exit
|
// the text is finished and the groups closed and we are the last group, ok exit
|
||||||
if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
|
if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
|
||||||
// println("Last group end")
|
// println("Last group end")
|
||||||
@ -1847,12 +1903,17 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
|
|||||||
return state.first_match, state.i
|
return state.first_match, state.i
|
||||||
}
|
}
|
||||||
|
|
||||||
// we are in a last dot_char case
|
if l_ist in [
|
||||||
if l_ist == regex.ist_dot_char {
|
rune(regex.ist_char_class_neg),
|
||||||
// println("***** We have a last dot_char")
|
regex.ist_char_class_pos,
|
||||||
|
regex.ist_bsls_char,
|
||||||
|
regex.ist_dot_char,
|
||||||
|
] {
|
||||||
|
// println("***** We have a last special token")
|
||||||
// println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}")
|
// println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}")
|
||||||
// println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}")
|
// println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}")
|
||||||
// println("first match: ${state.first_match}")
|
// println("first match: ${state.first_match}")
|
||||||
|
|
||||||
if re.prog[state.pc].last_dot_flag == true
|
if re.prog[state.pc].last_dot_flag == true
|
||||||
&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min
|
&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min
|
||||||
&& re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
|
&& re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
|
||||||
@ -1860,10 +1921,6 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
|
|||||||
}
|
}
|
||||||
// println("Not fitted!!")
|
// println("Not fitted!!")
|
||||||
}
|
}
|
||||||
|
|
||||||
// m_state = .end
|
|
||||||
// break
|
|
||||||
|
|
||||||
// no groups open, check the last token quantifier
|
// no groups open, check the last token quantifier
|
||||||
if ist != regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
|
if ist != regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
|
||||||
if re.prog[state.pc].rep >= re.prog[state.pc].rep_min
|
if re.prog[state.pc].rep >= re.prog[state.pc].rep_min
|
||||||
@ -1873,7 +1930,7 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// print("No good exit!!")
|
// println("No good exit!!")
|
||||||
return regex.no_match_found, state.i
|
return regex.no_match_found, state.i
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -364,6 +364,18 @@ find_all_test_suite = [
|
|||||||
r"\+{3}.*\+{3}",
|
r"\+{3}.*\+{3}",
|
||||||
[0, 11, 18, 32, 33, 44],
|
[0, 11, 18, 32, 33, 44],
|
||||||
['+++pippo+++', '+++ pippo2 +++', '+++ oggi+++']
|
['+++pippo+++', '+++ pippo2 +++', '+++ oggi+++']
|
||||||
|
},
|
||||||
|
Test_find_all{
|
||||||
|
"ab",
|
||||||
|
r"[^\n]*",
|
||||||
|
[0, 2],
|
||||||
|
['ab']
|
||||||
|
},
|
||||||
|
Test_find_all{
|
||||||
|
"ab",
|
||||||
|
r"([^\n]*)",
|
||||||
|
[0, 2],
|
||||||
|
['ab']
|
||||||
}
|
}
|
||||||
|
|
||||||
]
|
]
|
||||||
|
Loading…
Reference in New Issue
Block a user