regex: fix a bug for CC token not correctly parsed in groups (#16272)

2023-08-10 21:13:21 +03:00 · 2022-11-01 07:58:29 +01:00 · 2022-11-01 07:58:29 +01:00 · 362adfae3a
commit 362adfae3a
parent 47a10f3181
2 changed files with 83 additions and 14 deletions
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -259,6 +259,7 @@ mut:
 	// dot_char token variables
 	dot_check_pc  int = -1 // pc of the next token to check for dots
 	bsls_check_pc int = -1 // pc of the next token to check for bsls
+	cc_check_pc   int = -1 // pc of the next token to check for CC
 	last_dot_flag bool // if true indicate that is the last dot_char in the regex
 	// debug fields
 	source_index int
@ -1270,7 +1271,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 		pc1++
 	}

-	// println("last_dot_char_pc: $last_dot_char_pc")
+	// println("last_dot_char_pc: ${last_dot_char_pc}")
 	if last_dot_char_pc >= 0 {
 		pc1 = last_dot_char_pc + 1
 		mut is_last_dot := true
@ -1313,7 +1314,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 		pc1++
 	}

-	// println('last_bsls_char_pc: $last_bsls_char_pc')
+	// println('last_bsls_char_pc: ${last_bsls_char_pc}')
 	if last_bsls_char_pc >= 0 {
 		pc1 = last_bsls_char_pc + 1
 		mut is_last_bsls := true
@ -1329,6 +1330,46 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 		}
 	}

+	//
+	// manage CC
+	//
+	pc1 = 0
+	mut cc_char_count := 0
+	mut last_cc_char_pc := -1
+	for pc1 < pc {
+		if re.prog[pc1].ist in [rune(regex.ist_char_class_pos), regex.ist_char_class_neg] {
+			last_cc_char_pc = pc1
+			cc_char_count++
+			mut pc2 := pc1 + 1
+			for pc2 < pc {
+				if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
+					regex.ist_group_start] {
+					// println("Next CC check is PC: ${pc2}")
+					re.prog[pc1].cc_check_pc = pc2
+					break
+				}
+				pc2++
+			}
+		}
+		pc1++
+	}
+
+	// println('last_cc_char_pc: ${last_cc_char_pc}')
+	if last_cc_char_pc >= 0 {
+		pc1 = last_cc_char_pc + 1
+		mut is_last_cc := true
+		for pc1 < pc {
+			if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
+				is_last_cc = false
+				break
+			}
+			pc1++
+		}
+		if is_last_cc {
+			re.prog[last_cc_char_pc].last_dot_flag = true
+		}
+	}
+
 	//******************************************

 	// OR branch
@ -1417,6 +1458,9 @@ pub fn (re RE) get_code() string {
 		ist := tk.ist
 		if ist == regex.ist_bsls_char {
 			res.write_string('[\\${tk.ch:1c}]     BSLS')
+			if tk.last_dot_flag == true {
+				res.write_string(' last!')
+			}
 		} else if ist == regex.ist_prog_end {
 			res.write_string('PROG_END')
 			stop_flag = true
@ -1424,8 +1468,14 @@ pub fn (re RE) get_code() string {
 			res.write_string('OR      ')
 		} else if ist == regex.ist_char_class_pos {
 			res.write_string('[${re.get_char_class(pc1)}]     CHAR_CLASS_POS')
+			if tk.last_dot_flag == true {
+				res.write_string(' last!')
+			}
 		} else if ist == regex.ist_char_class_neg {
 			res.write_string('[^${re.get_char_class(pc1)}]    CHAR_CLASS_NEG')
+			if tk.last_dot_flag == true {
+				res.write_string(' last!')
+			}
 		} else if ist == regex.ist_dot_char {
 			res.write_string('.        DOT_CHAR nx chk: $tk.dot_check_pc')
 			if tk.last_dot_flag == true {
@ -1788,9 +1838,15 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
 			// println("Finished text!!")
 			src_end = true

+			// we have fished the text, we must manage out pf bound indexes
+			if state.i >= in_txt_len {
+				state.i = in_txt_len - 1
+			}
+
 			// manage groups
 			if state.group_index >= 0 && state.match_index >= 0 {
 				// println("End text with open groups!")
+				// println("state.group_index: ${state.group_index}")
 				// close the groups
 				for state.group_index >= 0 {
 					tmp_pc := re.group_data[state.group_index]
@ -1804,15 +1860,13 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {

 						// save group results
 						g_index := re.prog[tmp_pc].group_id * 2
+						// println("group_id: ${re.prog[tmp_pc].group_id} g_index: ${g_index}")
 						if start_i >= 0 {
 							re.groups[g_index] = start_i
 						} else {
 							re.groups[g_index] = 0
 						}
-						// we have fished the text, we must manage out pf bound indexes
-						if state.i >= in_txt_len {
-							state.i = in_txt_len - 1
-						}
+
 						re.groups[g_index + 1] = state.i

 						if re.groups[g_index + 1] >= in_txt_len {
@ -1827,6 +1881,8 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
 				}
 			}

+			// println("re.groups: ${re.groups}")
+
 			// the text is finished and the groups closed and we are the last group, ok exit
 			if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
 				// println("Last group end")
@ -1847,12 +1903,17 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
 				return state.first_match, state.i
 			}

-			// we are in a last dot_ char case
-			if l_ist == regex.ist_dot_char {
-				// println("***** We have a last dot_char")
+			if l_ist in [
+				rune(regex.ist_char_class_neg),
+				regex.ist_char_class_pos,
+				regex.ist_bsls_char,
+				regex.ist_dot_char,
+			] {
+				// println("***** We have a last special token")
 				// println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}")
 				// println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}")
 				// println("first match: ${state.first_match}")
+
 				if re.prog[state.pc].last_dot_flag == true
 					&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min
 					&& re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
@ -1860,10 +1921,6 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
 				}
 				// println("Not fitted!!")
 			}
-
-			// m_state = .end
-			// break
-
 			// no groups open, check the last token quantifier
 			if ist != regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
 				if re.prog[state.pc].rep >= re.prog[state.pc].rep_min
@ -1873,7 +1930,7 @@ pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
 				}
 			}

-			// print("No good exit!!")
+			// println("No good exit!!")
 			return regex.no_match_found, state.i
 		}

--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@ -364,6 +364,18 @@ find_all_test_suite = [
 		r"\+{3}.*\+{3}",
 		[0, 11, 18, 32, 33, 44],
 		['+++pippo+++', '+++ pippo2 +++', '+++ oggi+++']
+	},
+	Test_find_all{
+		"ab",
+		r"[^\n]*",
+		[0, 2],
+		['ab']
+	},
+	Test_find_all{
+		"ab",
+		r"([^\n]*)",
+		[0, 2],
+		['ab']
 	}

 ]