regex: add a find_all_str function (#7517)

2023-08-10 21:13:21 +03:00 · 2020-12-24 06:27:46 +01:00
parent 36dcace0a7
commit 2824e07baa
5 changed files with 248 additions and 198 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@@ -473,7 +473,7 @@ pub fn new() RE

 ```
 #### **Custom initialization**
-For some particular need it is possible initialize a fully customized regex:
+For some particular needs it is possible to initialize a fully manually customized regex:
 ```v ignore
 pattern = r"ab(.*)(ac)"
 // init custom regex
@@ -484,6 +484,8 @@ re.cc   = []CharClass{len: pattern.len}     // can not be more char class the th
 re.group_csave_flag = false          // true enable continuos group saving if needed
 re.group_max_nested = 128            // set max 128 group nested possible
 re.group_max        = pattern.len>>1 // we can't have more groups than the half of the pattern legth
+re.group_stack = []int{len: re.group_max, init: -1}
+re.group_data  = []int{len: re.group_max, init: -1}
 ```
 ### Compiling

@@ -494,22 +496,14 @@ After an initializer is used, the regex expression must be compiled with:
 pub fn (re mut RE) compile_opt(in_txt string) ?
 ```

-### Operative Functions
+### Matching Functions

-These are the operative functions
+These are the matching functions

 ```v ignore
 // match_string try to match the input string, return start and end index if found else start is -1
 pub fn (re mut RE) match_string(in_txt string) (int,int)

-// find try to find the first match in the input string, return start and end index if found else start is -1
-pub fn (re mut RE) find(in_txt string) (int,int)
-
-// find_all find all the "non overlapping" occurrences of the matching pattern, return a list of start end indexes
-pub fn (re mut RE) find_all(in_txt string) []int
-
-// replace return a string where the matches are replaced with the replace string, only non overlapped matches are used
-pub fn (re mut RE) replace(in_txt string, repl string) string
 ```

 ## Find and Replace
@@ -519,13 +513,19 @@ There are the following find  and replace functions:
 #### Find functions

 ```v ignore
-// find try to find the first match in the input string, return start and end index if found else start is -1
+// find try to find the first match in the input string
+// return start and end index if found else start is -1
 pub fn (re mut RE) find(in_txt string) (int,int)

 // find_all find all the "non overlapping" occurrences of the matching pattern
 // return a list of start end indexes like: [3,4,6,8] 
 // the matches are [3,4] and [6,8]
 pub fn (re mut RE) find_all(in_txt string) []int
+
+// find_all_str find all the "non overlapping" occurrences of the matching pattern
+// return a list of strings
+// the result is like ["first match","second match"]
+pub fn (mut re RE) find_all_str(in_txt string) []string
 ```

 #### Replace functions
@@ -543,10 +543,12 @@ The`replace_by_fn` use a custom replace function making possible customizations.
 The custom function must be of the type:

 ```v ignore
-// re RE struct
-// in_txt all the text passed to the regex expression
-// the match is: in_txt[start..end]
-fn (re RE, in_txt string, start int, end int) string
+// type of function used for custom replace
+// in_txt  source text
+// start   index of the start of the match in in_txt
+// end     index of the end   of the match in in_txt
+// --- the match is in in_txt[start..end] ---
+fn (re RE, in_txt string, start int, end int) string 
 ```

 The following example will clarify the use:
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@@ -1554,14 +1554,14 @@ fn state_str(s Match_state) string {

 struct StateObj {
 pub mut:
-	group_index int = -1  // group id used to know how many groups are open
-	match_flag  bool
-	match_index int = -1
-	first_match int = -1  //index of the first match
-	pc int = -1           // program counter
-	i  int = -1           // source string index
-	char_len int
-	last_dot_pc int = -1      // last dot chat pc
+	group_index int  = -1  // group id used to know how many groups are open
+	match_flag  bool       // indicates if we are in a match condition
+	match_index int  = -1  // index of the last match
+	first_match int  = -1  // index of the first match
+	pc          int  = -1  // program counter
+	i           int  = -1  // source string index
+	char_len    int        // last char length
+	last_dot_pc int  = -1  // last dot char pc
 }

 [direct_array_access]
@@ -1579,13 +1579,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 	mut ist   := rune(0)              // actual instruction
 	mut l_ist := rune(0)              // last matched instruction

-	//mut state_list := []StateObj{}
-
-	//mut group_stack := []int{len: re.group_max, init: -1}
-	//mut group_data  := []int{len: re.group_max, init: -1}
-
-	//mut group_index := -1           // group id used to know how many groups are open
-
 	mut step_count  := 0              // stats for debug
 	mut dbg_line    := 0              // count debug line printed

@@ -1900,7 +1893,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					//println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
 					if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
 	 					start_i   := re.group_stack[state.group_index]
-	 					//re.group_stack[state.group_index]=-1

 	 					// save group results
 						g_index := re.prog[state.pc].group_id*2
@@ -1960,8 +1952,6 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				// check next token to be false
 				mut next_check_flag := false
 				
-				//if re.prog[state.pc].rep >= re.prog[state.pc].rep_min && 
-				
 				// if we are done with max go on dot char are dedicated case!!
 				if	re.prog[state.pc].rep >= re.prog[state.pc].rep_max 
 				{
@@ -2415,113 +2405,3 @@ pub fn (mut re RE) match_string(in_txt string) (int,int) {
 	}
 	return start, end
 }
-
-//
-// Finders
-//
-
-// find try to find the first match in the input string
-[direct_array_access]
-pub fn (mut re RE) find(in_txt string) (int,int) {
-	old_flag := re.flag
-	
-	re.flag |= f_src  // enable search mode
-	start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
-	//print("Find [$start,$end] '${in_txt[start..end]}'")
-	if end > in_txt.len {
-		end = in_txt.len
-	}
-	re.flag = old_flag
-
-	if start >= 0 && end > start {
-		return start, end
-	}
-	return no_match_found, 0
-}
-
-// find all the non overlapping occurrences of the match pattern
-[direct_array_access]
-pub fn (mut re RE) find_all(in_txt string) []int {
-	mut i := 0
-	mut res := []int{}
-	mut ls := -1
-	for i < in_txt.len {
-		s,e := re.find(in_txt[i..])
-		if s >= 0 && e > s && i+s > ls {
-			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
-			res << i+s
-			res << i+e
-			ls = i+s
-			i = i+e
-			continue
-		} else {
-			i++
-		}
-
-	}
-	return res
-}
-
-// replace return a string where the matches are replaced with the replace string
-pub fn (mut re RE) replace(in_txt string, repl string) string {
-	pos := re.find_all(in_txt)
-	if pos.len > 0 {
-		mut res := ""
-		mut i := 0
-
-		mut s1 := 0
-		mut e1 := in_txt.len
-
-		for i < pos.len {
-			e1 = pos[i]
-			res += in_txt[s1..e1] + repl
-			s1 = pos[i+1]
-			i += 2
-		}
-
-		res += in_txt[s1..]
-		return res
-	}
-	return in_txt
-}
-
-pub type FnReplace = fn (re RE, in_txt string, start int, end int) string 
-
-// replace_by_fn return a string where the matches are replaced with the string from the repl_fn callback function
-pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
-	mut i := 0
-	mut res := ""
-	mut ls := -1
-
-	mut s1 := 0
-	//mut e1 := in_txt.len
-	
-	for i < in_txt.len {
-		s,e := re.find(in_txt[i..])
-		if s >= 0 && e > s && i+s > ls {
-			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
-			start := i + s
-			end   := i + e
-			// update grups index diplacement
-			mut gi := 0
-			for gi < re.groups.len {
-				re.groups[gi] += i
-				gi++
-			}
-			repl  := repl_fn(re, in_txt, start, end)
-
-			res += in_txt[s1..start] + repl
-			s1 = end 
-
-			ls = i + s
-			i  = i + e
-			continue
-		} else {
-			i++
-		}
-
-	}
-	res += in_txt[s1..]
-	return res
-}
-
--- a/vlib/regex/regex_opt.v
+++ b/vlib/regex/regex_opt.v
@@ -17,9 +17,19 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
 }

 // new_regex create a RE of small size, usually sufficient for ordinary use
-[deprecated]
 pub fn new() RE {
-	return impl_new_regex_by_size(1)
+	// init regex
+    mut re := regex.RE{}
+    re.prog = []Token    {len: max_code_len + 1} // max program length, cannot be longer than the pattern
+    re.cc   = []CharClass{len: max_code_len}     // there cannot be more char classes than the length of the pattern
+    re.group_csave_flag = false                 // true enables continuous group saving
+    re.group_max_nested = 128                   // set max 128 nested groups
+    re.group_max        = max_code_len >> 1      // we can't have more groups than half of the pattern length
+
+    re.group_stack = []int{len: re.group_max, init: -1}
+	re.group_data  = []int{len: re.group_max, init: -1}
+
+	return re
 }

 // new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@@ -144,29 +144,6 @@ match_test_suite = [
 ]
 )

-struct TestItemFa {
-	src string
-	q string
-	r []int
-}
-
-const (
-match_test_suite_fa = [
-	// find_all tests
-	TestItemFa{
-		"oggi pippo è andato a casa di pluto ed ha trovato pippo",
-		r"p[iplut]+o",
-		[5, 10, 31, 36, 51, 56]
-	},
-	TestItemFa{
-		"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
-		r"(pi?(ba)+o)",
-		[5, 10, 31, 39, 54, 65]
-	},
-
-]
-)
-
 struct TestItemRe {
 	src string
 	q string
@@ -174,7 +151,7 @@ struct TestItemRe {
 	r string
 }
 const (
-match_test_suite_re = [
+match_test_suite_replace = [
 	// replace tests
 	TestItemRe{
 		"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
@@ -241,12 +218,53 @@ cgroups_test_suite = [
 ]
 )

+
+struct Test_find_all {
+	src string // source text passed to find_all and find_all_str
+	q string // regex query compiled with regex_opt
+	res []int // expected find_all result, flat start,end indexes: [0,4,5,6...]
+	res_str []string // expected find_all_str result: ['find0','find1'...]
+}
+const (
+find_all_test_suite = [
+	Test_find_all{
+		"abcd 1234 efgh 1234 ghkl1234 ab34546df",
+		r"\d+",
+		[5, 9, 15, 19, 24, 28, 31, 36],
+		['1234', '1234', '1234', '34546']
+	},
+	Test_find_all{
+		"abcd 1234 efgh 1234 ghkl1234 ab34546df",
+		r"\a+",
+		[0, 4, 10, 14, 20, 24, 29, 31, 36, 38],
+		['abcd', 'efgh', 'ghkl', 'ab', 'df']
+	},
+	Test_find_all{
+		"oggi pippo è andato a casa di pluto ed ha trovato pippo",
+		r"p[iplut]+o",
+		[5, 10, 31, 36, 51, 56],
+		['pippo', 'pluto', 'pippo']
+	},
+	Test_find_all{
+		"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
+		r"(pi?(ba)+o)",
+		[5, 10, 31, 39, 54, 65],
+		['pibao', 'pbababao', 'pibabababao']
+	},
+	Test_find_all{
+		"Today is a good day and tomorrow will be for sure.",
+		r"[Tt]o\w+",
+		[0, 5, 24, 32],
+		['Today', 'tomorrow']
+	}
+]
+)
+
 const (
 	debug = false // true for debug println 
 )

 fn test_regex(){
-
 	// check capturing groups
 	for c,to in cgroups_test_suite {
 		// debug print
@@ -275,8 +293,8 @@ fn test_regex(){

 		if start != to.s || end != to.e {
 			//println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
-			println("ERROR!")
-			C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
+			eprintln("ERROR!")
+			//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
 			assert false
 			continue
 		}	
@@ -284,7 +302,7 @@ fn test_regex(){
 		// check cgroups
 		if to.cgn.len > 0 {
 			if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
-				println("Capturing group len error! found: ${re.group_csave[0]} true ground: ${to.cg[0]}")
+				eprintln("Capturing group len error! found: ${re.group_csave[0]} true ground: ${to.cg[0]}")
 				assert false
 				continue
 			}
@@ -293,7 +311,7 @@ fn test_regex(){
 			mut ln := re.group_csave[0]*3
 			for ln > 0 {
 				if re.group_csave[ln] != to.cg[ln] {
-					println("Capturing group failed on $ln item!")
+					eprintln("Capturing group failed on $ln item!")
 					assert false
 				}
 				ln--
@@ -302,7 +320,7 @@ fn test_regex(){
 			// check named captured groups
 			for k in to.cgn.keys() {
 				if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
-					println("Named capturing group error! [$k]")
+					eprintln("Named capturing group error! [$k]")
 					assert false
 					continue
 				}
@@ -314,9 +332,9 @@ fn test_regex(){
 			}
 			for ln:=0; ln < re.groups.len; ln++ {
 				if re.groups[ln] != to.cg[ln] {
-					println("Capture group doesn't match:")
-					println("true ground: [${to.cg}]")
-					println("elaborated : [${re.groups}]")
+					eprintln("Capture group doesn't match:")
+					eprintln("true ground: [${to.cg}]")
+					eprintln("elaborated : [${re.groups}]")
 					assert false
 				}
 			} 
@@ -324,9 +342,9 @@ fn test_regex(){
 	}

 	// check find_all
-	for c,to in match_test_suite_fa{
+	for c,to in find_all_test_suite {
 		// debug print
-		if debug { println("#$c [$to.src] q[$to.q] $to.r") }
+		if debug { println("#$c [$to.src] q[$to.q] ($to.res, $to.res_str)") }

 		mut re := regex.regex_opt(to.q) or {
 			eprintln('err: $err')
@@ -334,25 +352,24 @@ fn test_regex(){
 			continue
 		}

+		re.reset()
 		res := re.find_all(to.src)
-		if res.len != to.r.len {
-			println("ERROR: find_all, array of different size.")
+		if res != to.res {
+			eprintln('err: find_all !!')
+			if debug { println("#$c exp: $to.res calculated: $res") }
 			assert false
-			continue
 		}

-		for c1,i in res {
-			if i != to.r[c1] {
-				println("ERROR: find_all, different indexes.")
-				assert false
-				continue
-			}
+		res_str := re.find_all_str(to.src)
+		if res_str != to.res_str {
+			eprintln('err: find_all_str !!')
+			if debug { println("#$c exp: $to.res_str calculated: $res_str") }
+			assert false
 		}
-
 	}

 	// check replace
-	for c,to in match_test_suite_re{
+	for c,to in match_test_suite_replace{
 		// debug print
 		if debug { println("#$c [$to.src] q[$to.q] $to.r") }

@@ -364,7 +381,7 @@ fn test_regex(){

 		res := re.replace(to.src,to.rep)
 		if res != to.r {
-			println("ERROR: replace.")
+			eprintln("ERROR: replace.")
 			assert false
 			continue
 		}
@@ -383,12 +400,12 @@ fn test_regex(){
 				continue
 			}
 			// q_str := re.get_query()
-			// println("Query: $q_str")
+			// eprintln("Query: $q_str")
 			start,end := re.find(to.src)

 			if start != to.s || end != to.e {
 				err_str := re.get_parse_error_string(start)
-				println("ERROR : $err_str start: ${start} end: ${end}")
+				eprintln("ERROR : $err_str start: ${start} end: ${end}")
 				assert false
 			} else {
 				//tmp_str := text[start..end]
@@ -416,8 +433,8 @@ fn test_regex(){
 		}

 		if start != to.s || end != to.e {
-			println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
-			println("ERROR!")
+			eprintln("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
+			eprintln("ERROR!")
 			//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
 			assert false
 			continue
@@ -427,7 +444,7 @@ fn test_regex(){
 		tmp_str1 := to.src.clone()
 		start1, end1 := re.match_string(tmp_str1)
 		if start1 != start || end1 != end {
-			println("two run ERROR!!")
+			eprintln("two run ERROR!!")
 			assert false
 			continue
 		}
--- a/vlib/regex/regex_util.v
+++ b/vlib/regex/regex_util.v
@@ -117,6 +117,7 @@ pub fn (re RE) get_group_list() []Re_group {
 	mut res := []Re_group{len: re.groups.len >> 1}
 	mut gi := 0
 	//println("len: ${re.groups.len} groups: ${re.groups}")
+	
 	for gi < re.groups.len {
 		if re.groups[gi] >= 0 {
 			txt_st := re.groups[gi]
@@ -136,3 +137,143 @@ pub fn (re RE) get_group_list() []Re_group {
 	return res
 }

+/******************************************************************************
+*
+* Finders
+*
+******************************************************************************/
+// find searches in_txt for the first match; returns (start, end) of the match, or (no_match_found, 0) when there is no match or the match is empty
+[direct_array_access]
+pub fn (mut re RE) find(in_txt string) (int,int) {
+	old_flag := re.flag  // saved so the flags can be restored after the search
+	re.flag |= f_src  // enable search mode
+
+	start, mut end := re.match_base(in_txt.str, in_txt.len + 1)  // note: the length passed is in_txt.len + 1
+	//print("Find [$start,$end] '${in_txt[start..end]}'")
+	if end > in_txt.len {  // clamp end to the real string length, since match_base was given one extra position
+		end = in_txt.len
+	}
+	re.flag = old_flag
+
+	if start >= 0 && end > start {  // only a non-empty match is reported as found
+		return start, end
+	}
+	return no_match_found, 0
+}
+
+// find_all finds all the non-overlapping occurrences of the pattern in in_txt; returns a flat list of start,end indexes, e.g. [3,4,6,8] for the matches [3,4] and [6,8]
+[direct_array_access]
+pub fn (mut re RE) find_all(in_txt string) []int {
+	mut i := 0  // offset in in_txt where the next search starts
+	mut res := []int{}  // flat list: start0, end0, start1, end1, ...
+	mut ls := -1  // start index (in in_txt) of the last accepted match
+
+	for i < in_txt.len {
+		s,e := re.find(in_txt[i..])  // s and e are relative to the slice that starts at i
+		if s >= 0 && e > s && i+s > ls {
+			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
+			res << i+s  // add i to turn slice-relative indexes into in_txt indexes
+			res << i+e
+			ls = i+s
+			i = i+e  // resume the search right after this match
+			continue
+		} else {
+			i++  // nothing accepted from this offset: advance one position and retry
+		}
+
+	}
+	return res
+}
+
+// find_all_str finds all the non-overlapping occurrences of the pattern in in_txt; returns the matched substrings as a list of strings, in order of appearance
+[direct_array_access]
+pub fn (mut re RE) find_all_str(in_txt string) []string {
+	mut i := 0  // offset in in_txt where the next search starts
+	mut res := []string{}  // matched substrings collected so far
+	mut ls := -1  // start index (in in_txt) of the last accepted match
+
+	for i < in_txt.len {
+		s,e := re.find(in_txt[i..])  // s and e are relative to the slice that starts at i
+		if s >= 0 && e > s && i+s > ls {
+			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
+			res << in_txt[i+s..i+e]  // add i to turn slice-relative indexes into in_txt indexes
+			ls = i+s
+			i = i+e  // resume the search right after this match
+			continue
+		} else {
+			i++  // nothing accepted from this offset: advance one position and retry
+		}
+
+	}
+	return res
+}
+/******************************************************************************
+*
+* Replacers
+*
+******************************************************************************/
+// replace returns a string where every match reported by find_all is replaced with repl; in_txt itself is returned when there are no matches
+pub fn (mut re RE) replace(in_txt string, repl string) string {
+	pos := re.find_all(in_txt)  // flat list of start,end indexes of the matches
+
+	if pos.len > 0 {
+		mut res := ""
+		mut i := 0  // index into pos, advanced two at a time (start,end)
+
+		mut s1 := 0  // start of the unmatched text still to be copied
+		mut e1 := in_txt.len  // end of that unmatched text, set from pos inside the loop
+
+		for i < pos.len {
+			e1 = pos[i]  // start of the current match
+			res += in_txt[s1..e1] + repl  // text before the match, then the replacement
+			s1 = pos[i+1]  // continue copying after the end of the match
+			i += 2
+		}
+
+		res += in_txt[s1..]  // tail of the text after the last match
+		return res
+	}
+	return in_txt
+}
+
+// FnReplace is the type of the callback function used by replace_by_fn for custom replace
+// in_txt  source text
+// start   index of the start of the match in in_txt
+// end     index of the end   of the match in in_txt
+// the match is in in_txt[start..end]; the returned string is inserted in place of the match
+pub type FnReplace = fn (re RE, in_txt string, start int, end int) string 
+
+// replace_by_fn returns a string where each non-overlapping match is replaced with the string returned by the repl_fn callback for that match
+pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
+	mut i   := 0  // offset in in_txt where the next search starts
+	mut res := ""
+	mut ls  := -1  // start index (in in_txt) of the last accepted match
+	mut s1  := 0  // start of the unmatched text still to be copied to res
+
+	for i < in_txt.len {
+		s,e := re.find(in_txt[i..])  // s and e are relative to the slice that starts at i
+		if s >= 0 && e > s && i+s > ls {
+			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
+			start := i + s
+			end   := i + e
+			// update groups index displacement: find ran on in_txt[i..], so shift re.groups by i before calling repl_fn with the full in_txt
+			mut gi := 0
+			for gi < re.groups.len {
+				re.groups[gi] += i  // NOTE(review): negative entries are shifted too, while get_group_list only uses entries >= 0 - confirm this is intended
+				gi++
+			}
+			repl  := repl_fn(re, in_txt, start, end)
+
+			res += in_txt[s1..start] + repl  // text before the match, then the callback result
+			s1 = end 
+
+			ls = i + s
+			i  = i + e  // resume the search right after this match
+			continue
+		} else {
+			i++  // nothing accepted from this offset: advance one position and retry
+		}
+	}
+	res += in_txt[s1..]  // tail of the text after the last match
+	return res
+}