regex fixes

2023-08-10 21:13:21 +03:00 · 2020-01-25 19:12:23 +01:00
parent 222fc4b04f
commit 15a63b5bcb
2 changed files with 159 additions and 52 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@@ -159,6 +159,91 @@ for gi < re.groups.len {

 **note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`*

+### Groups Continuous saving
+
+In particular situations it is useful to have a continuous save of the groups. This is possible by initializing the saving array field of the `RE` struct: `group_csave`.
+
+This feature allows data to be collected in a continuous way.
+
+In the example we pass a text followed by a list of integers that we want to collect.
+To achieve this we can use the continuous saving of the groups, which saves each captured group in an array that we set with: `re.group_csave = [-1].repeat(3*20+1)`.
+
+The array will be filled with the following logic:
+
+`re.group_csave[0]` number of total saved records
+
+`re.group_csave[1+n*3]` id of the saved group
+`re.group_csave[1+n*3+1]` start index in the source string of the saved group
+`re.group_csave[1+n*3+2]` end index in the source string of the saved group
+
+The regex saves records until the match finishes or the array has no space left. If the space runs out no error is raised; further records are simply not saved.
+
+```v
+fn example2() {
+	test_regex()
+
+	text := "tst: 01,23,45 ,56, 78"
+	query:= r".*:(\s*\d+[\s,]*)+"
+
+	mut re := regex.new_regex()
+	//re.debug = 2
+	re.group_csave = [-1].repeat(3*20+1)  // we expect max 20 records
+
+	re_err, err_pos := re.compile(query)
+	if re_err == regex.COMPILE_OK {
+		q_str := re.get_query()
+		println("Query: $q_str")
+	
+		start, end := re.match_string(text)
+		if start < 0 {
+			println("ERROR : ${re.get_parse_error_string(start)}, $start")
+		} else {
+			println("found in [$start, $end] => [${text[start..end]}]")
+		}
+
+		// groups capture
+		mut gi := 0
+		for gi < re.groups.len {
+			if re.groups[gi] >= 0 {
+				println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
+			}
+			gi += 2
+		}
+
+		// continuous saving
+		gi = 0
+		println("num: ${re.group_csave[0]}")
+		for gi < re.group_csave[0] {
+			id := re.group_csave[1+gi*3]
+			st := re.group_csave[1+gi*3+1]
+			en := re.group_csave[1+gi*3+2]
+			println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
+			gi++
+		}
+	} else {
+		println("query: $query")
+		lc := "-".repeat(err_pos)
+		println("err  : $lc^")
+		err_str := re.get_parse_error_string(re_err)
+		println("ERROR: $err_str")	
+	}
+}
+```
+
+The output will be:
+
+```
+Query: .*:(\s*\d+[\s,]*)+
+found in [0, 21] => [tst: 01,23,45 ,56, 78]
+0 19,21 :[78]
+num: 5
+cg id: 0 [4, 8] => [ 01,]
+cg id: 0 [8, 11] => [23,]
+cg id: 0 [11, 15] => [45 ,]
+cg id: 0 [15, 19] => [56, ]
+cg id: 0 [19, 21] => [78] 
+```
+
 ## Flags

 It is possible to set some flags in the regex parser that change the behavior of the parser itself.
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@@ -266,15 +266,12 @@ fn (tok mut Token) reset() {
 *
 ******************************************************************************/
 pub const (
-	//F_FND = 0x00000001  // check until the end of the input string, it act like a "find first match", not efficient!!
-	//F_PM  = 0x00000004  // partial match: if the source text finish and the match is positive until then return true
+	F_NL  = 0x00000001  // end the match when find a new line symbol
+	F_MS  = 0x00000002  // match true only if the match is at the start of the string
+	F_ME  = 0x00000004  // match true only if the match is at the end of the string 

-	F_NL  = 0x00000002  // end the match when find a new line symbol
-	F_MS  = 0x00000008  // match true only if the match is at the start of the string
-	F_ME  = 0x00000010  // match true only if the match is at the end of the string 
-
-	F_EFM = 0x01000000  // exit on first token matched, used by search
-	F_BIN = 0x02000000  // work only on bytes, ignore utf-8
+	F_EFM = 0x00000100  // exit on first token matched, used by search
+	F_BIN = 0x00000200  // work only on bytes, ignore utf-8
 )

 struct StateDotObj{
@@ -282,7 +279,7 @@ mut:
 	i  int                = -1  // char index in the input buffer
 	pc int                = -1  // program counter saved
 	mi int                = -1  // match_index saved
-	group_stack_index int = -1  // group index stack pointer saved
+	group_stack_index int = -1  // group index stack pointer saved
 }

 pub
@@ -305,6 +302,9 @@ pub mut:
 	group_max_nested int = 3   // max nested group
 	group_max int        = 8   // max allowed number of different groups

+	group_csave []int    = []int  // groups continuous save array
+	group_csave_index int= -1     // groups continuous save index
+
 	// flags
 	flag int             = 0   // flag for optional parameters

@@ -328,6 +328,12 @@ fn (re mut RE) reset(){
 	re.groups = [-1].repeat(re.group_count*2)

 	re.state_stack_index = -1
+
+	// reset group_csave
+	if re.group_csave.len > 0 {
+		re.group_csave_index = 1
+		re.group_csave[0] = 0     // reset the capture count
+	}
 }

 /******************************************************************************
@@ -734,8 +740,6 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 			}
 		}

-
-
 		// not  a {} quantifier, exit
 		return ERR_SYNTAX_ERROR, i, 0, false
 	}
@@ -997,7 +1001,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
 	// Post processing
 	//******************************************

-
 	// count IST_DOT_CHAR to set the size of the state stack
 	mut pc1 := 0
 	mut tmp_count := 0
@@ -1054,7 +1057,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
 		pc1++
 	}

-	
 	//******************************************
 	// DEBUG PRINT REGEX GENERATED CODE
 	//******************************************
@@ -1075,14 +1077,15 @@ pub fn (re RE) get_code() string {
 		mut stop_flag := false

 		for pc1 <= re.prog.len {
+			tk := re.prog[pc1]
 			res.write("PC:${pc1:3d}")
 			
 		    res.write(" ist: ")
-		    res.write("${re.prog[pc1].ist:8x}".replace(" ","0") )
+		    res.write("${tk.ist:8x}".replace(" ","0") )
 		    res.write(" ")
-			ist :=re.prog[pc1].ist
+			ist :=tk.ist
 			if ist == IST_BSLS_CHAR {
-				res.write("[\\${re.prog[pc1].ch:1c}]     BSLS")
+				res.write("[\\${tk.ch:1c}]     BSLS")
 			} else if ist == IST_PROG_END {
 				res.write("PROG_END")
 				stop_flag = true
@@ -1095,22 +1098,22 @@ pub fn (re RE) get_code() string {
 			} else if ist == IST_DOT_CHAR {
 				res.write(".        DOT_CHAR")
 			} else if ist == IST_GROUP_START {
-				res.write("(        GROUP_START #:${re.prog[pc1].group_id}")
+				res.write("(        GROUP_START #:${tk.group_id}")
 			} else if ist == IST_GROUP_END {
-				res.write(")        GROUP_END   #:${re.prog[pc1].group_id}")
+				res.write(")        GROUP_END   #:${tk.group_id}")
 			} else if ist == IST_SIMPLE_CHAR {
-				res.write("[${re.prog[pc1].ch:1c}]      query_ch")
+				res.write("[${tk.ch:1c}]      query_ch")
 			}

-			if re.prog[pc1].rep_max == MAX_QUANTIFIER {
-				res.write(" {${re.prog[pc1].rep_min:3d},MAX}")
+			if tk.rep_max == MAX_QUANTIFIER {
+				res.write(" {${tk.rep_min:3d},MAX}")
 			}else{
 				if ist == IST_OR_BRANCH {
-					res.write(" if false go: ${re.prog[pc1].rep_min:3d} if true go: ${re.prog[pc1].rep_max:3d}")
+					res.write(" if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}")
 				} else {
-					res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}")
+					res.write(" {${tk.rep_min:3d},${tk.rep_max:3d}}")
 				}
-				if re.prog[pc1].greedy == true {
+				if tk.greedy == true {
 					res.write("?")
 				}
 			}
@@ -1123,11 +1126,9 @@ pub fn (re RE) get_code() string {

 		res.write("========================================\n")
 		return res.str()
-
 }

 // get_query return a string with a reconstruction of the query starting from the regex program code
-
 pub fn (re RE) get_query() string {
 	mut res := strings.new_builder(re.query.len*2)

@@ -1137,14 +1138,15 @@ pub fn (re RE) get_query() string {

 	mut i := 0
 	for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{
-		ch := re.prog[i].ist
+		tk := &re.prog[i]
+		ch := tk.ist
 		
 		// GROUP start
 		if ch == IST_GROUP_START {
 			if re.debug == 0 {
 				res.write("(")
 			} else {
-				res.write("#${re.prog[i].group_id}(")
+				res.write("#${tk.group_id}(")
 			}
 			i++
 			continue
@@ -1159,7 +1161,7 @@ pub fn (re RE) get_query() string {
 		if ch == IST_OR_BRANCH {
 			res.write("|")
 			if re.debug > 0 {
-				res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
+				res.write("{${tk.rep_min},${tk.rep_max}}")
 			}
 			i++
 			continue
@@ -1177,7 +1179,7 @@ pub fn (re RE) get_query() string {

 		// bsls char
 		if ch == IST_BSLS_CHAR {
-			res.write("\\${re.prog[i].ch:1c}")
+			res.write("\\${tk.ch:1c}")
 		}

 		// IST_DOT_CHAR
@@ -1190,29 +1192,28 @@ pub fn (re RE) get_query() string {
 			if byte(ch) in BSLS_ESCAPE_LIST {
 				res.write("\\")
 			}
-			res.write("${re.prog[i].ch:c}")
+			res.write("${tk.ch:c}")
 		}

 		// quantifier
-		if !(re.prog[i].rep_min == 1 && re.prog[i].rep_max == 1) {
-			if re.prog[i].rep_min == 0 && re.prog[i].rep_max == 1 {
+		if !(tk.rep_min == 1 && tk.rep_max == 1) {
+			if tk.rep_min == 0 && tk.rep_max == 1 {
 				res.write("?")
-			} else if re.prog[i].rep_min == 1 && re.prog[i].rep_max == MAX_QUANTIFIER {
+			} else if tk.rep_min == 1 && tk.rep_max == MAX_QUANTIFIER {
 				res.write("+")
-			} else if re.prog[i].rep_min == 0 && re.prog[i].rep_max == MAX_QUANTIFIER {
+			} else if tk.rep_min == 0 && tk.rep_max == MAX_QUANTIFIER {
 				res.write("*")
 			} else {
-				if re.prog[i].rep_max == MAX_QUANTIFIER {
-					res.write("{${re.prog[i].rep_min},MAX}")
+				if tk.rep_max == MAX_QUANTIFIER {
+					res.write("{${tk.rep_min},MAX}")
 				} else {
-					res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
+					res.write("{${tk.rep_min},${tk.rep_max}}")
 				}
-				if re.prog[i].greedy == true {
+				if tk.greedy == true {
 					res.write("?")
 				}
 			}
 		}
-
 		i++
 	}
 	if (re.flag & F_ME) != 0 {
@@ -1411,6 +1412,20 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 							re.groups[g_index] = 0
 						}
 						re.groups[g_index+1] = i
+
+						// continuous save, save until we have space
+						if re.group_csave_index > 0 {
+							// check if we have space to save the record
+							if (re.group_csave_index + 3) < re.group_csave.len {
+								// incrment counter
+								re.group_csave[0]++
+								// save the record  
+								re.group_csave[re.group_csave_index++] = g_index               // group id
+								re.group_csave[re.group_csave_index++] = re.groups[g_index]    // start
+								re.group_csave[re.group_csave_index++] = re.groups[g_index+1]  // end
+							}
+						}
+
 					}

 					group_index--
@@ -1543,6 +1558,19 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 						}
 						re.groups[g_index+1] = i
 						//C.printf("GROUP %d END [%d, %d]\n", re.prog[pc].group_id, re.groups[g_index], re.groups[g_index+1])
+
+						// continuous save, save until we have space
+						if re.group_csave_index > 0 {
+							// check if we have space to save the record
+							if (re.group_csave_index + 3) < re.group_csave.len {
+								// incrment counter
+								re.group_csave[0]++
+								// save the record  
+								re.group_csave[re.group_csave_index++] = g_index               // group id
+								re.group_csave[re.group_csave_index++] = re.groups[g_index]    // start
+								re.group_csave[re.group_csave_index++] = re.groups[g_index+1]  // end
+							}
+						}
 					}
 					
 					re.prog[pc].group_rep++ // increase repetitions
@@ -1796,8 +1824,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 			if rep < re.prog[tmp_pc].rep_min {
 				//C.printf("ist_quant_pg UNDER RANGE\n")
 				pc = re.prog[tmp_pc].goto_pc 
-				//group_index--
-				
 				m_state = .ist_next
 				continue
 			}
@@ -1841,12 +1867,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				m_state = .ist_next // go to next ist
 				continue
 			}
-
-			// match failed
-			else if rep == 0 && re.prog[pc].rep_min > 0 {
-				//C.printf("ist_quant_n NO MATCH\n")
-				// dummy
-			}
 			// match + or *
 			else if rep >= re.prog[pc].rep_min {
 				//C.printf("ist_quant_n MATCH RANGE\n")
@@ -1902,7 +1922,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					m_state = .ist_next
 					continue
 				}
-				
 				m_state = .ist_load
 				continue
 			}
@@ -1981,6 +2000,9 @@ pub fn (re mut RE) match_string(in_txt string) (int,int) {
 			return NO_MATCH_FOUND, 0
 		}
 		if (re.flag & F_ME) != 0 && end < in_txt.len {
+			if in_txt[end] in NEW_LINE_LIST {
+				return start, end
+			}
 			return NO_MATCH_FOUND, 0
 		}
 		return start, end
@@ -2002,7 +2024,7 @@ pub fn (re mut RE) find(in_txt string) (int,int) {
 	for i < in_txt.len {
 		
 		// test only the first part of the query string
-		re.flag &= F_EFM // set to exit on the first token match
+		re.flag |= F_EFM // set to exit on the first token match
 		mut tmp_end := i+re.query.len
 		if tmp_end > in_txt.len { tmp_end = in_txt.len }
 		tmp_txt := string{ str: in_txt.str+i, len: tmp_end-i }