regex: added groups in replace strings (#9576)

2023-08-10 21:13:21 +03:00 · 2021-04-03 22:16:56 +02:00
parent 0eb59cf2bd
commit 1a324679b9
3 changed files with 154 additions and 6 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@@ -544,10 +544,36 @@ pub fn (mut re RE) find_all_str(in_txt string) []string
 #### Replace functions

 ```v ignore
-// replace return a string where the matches are replaced with the replace string, only non overlapped matches are used
+// replace returns a string where the matches are replaced with the repl_str string;
+// this function supports group references in the replace string
 pub fn (re mut RE) replace(in_txt string, repl string) string
 ```

+The replace string can include group references:
+
+```v ignore
+txt   := "Today it is a good day."
+query := r'(a\w)[ ,.]'
+mut re := regex.regex_opt(query)?
+res := re.replace(txt, r"__[\0]__")
+```
+
+In this example we used the group `0` in the replace string (`\0`); the result will be:
+
+```
+Today it is a good day. => Tod__[ay]__it is a good d__[ay]__
+```
+
+**Note:** only groups from `0` to `9` can be used in the replace string.
+
+If the use of `groups` in the replace process is not needed, it is possible
+to use a quicker function:
+
+```v ignore
+// replace_simple returns a string where the matches are replaced with the replace string
+pub fn (mut re RE) replace_simple(in_txt string, repl string) string
+```
+
 #### Custom replace function

 For complex find and replace operations it is available the function `replace_by_fn` .
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@@ -176,7 +176,41 @@ match_test_suite_replace = [
 		r"[Tt]o\w+",
 		"CIAO",
 		"CIAO is a good day and CIAO will be for sure."
-	}
+	},
+	TestItemRe{
+		"Today is a good day and tomorrow will be for sure.",
+		r"(a\w) ",
+		r"[\0] ",
+		"Tod[ay] is a good d[ay] and tomorrow will be for sure."
+	},
+	TestItemRe{
+		"Today is a good day and tomorrow will be for sure.",
+		r"(a\w) ",
+		r"[\0_\0] ",
+		"Tod[ay_ay] is a good d[ay_ay] and tomorrow will be for sure."
+	},
+	TestItemRe{
+		"Today is a good day and tomorrow will be for sure.",
+		r"(a\w) ",
+		r"[\0\1] ",
+		"Tod[ay] is a good d[ay] and tomorrow will be for sure."
+	},
+]
+
+match_test_suite_replace_simple = [
+	// replace tests
+	TestItemRe{
+		"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
+		r"(pi?(ba)+o)",
+		"CIAO",
+		"oggi CIAO è andato a casa di CIAO ed ha trovato CIAO"
+	},
+	TestItemRe{
+		"Today is a good day and tomorrow will be for sure.",
+		r"[Tt]o\w+",
+		"CIAO",
+		"CIAO is a good day and CIAO will be for sure."
+	},
 ]
 )

@@ -425,6 +459,25 @@ fn test_regex(){
 		}
 	}

+	// check replace simple
+	for c,to in match_test_suite_replace_simple{
+		// debug print
+		if debug { println("#$c [$to.src] q[$to.q] $to.r") }
+
+		mut re := regex.regex_opt(to.q) or {
+			eprintln('err: $err')
+			assert false
+			continue
+		}
+
+		res := re.replace_simple(to.src,to.rep)
+		if res != to.r {
+			eprintln("ERROR: replace.")
+			assert false
+			continue
+		}
+	}
+
 	// check match and find
 	for c,to in match_test_suite {
 		// debug print
--- a/vlib/regex/regex_util.v
+++ b/vlib/regex/regex_util.v
@@ -54,7 +54,9 @@ pub fn (re RE) get_group_by_name(in_txt string, group_name string) string {
 		tmp_index := re.group_map[group_name]-1
 		start     := re.groups[tmp_index * 2]
 		end       := re.groups[tmp_index * 2 + 1]
-		return in_txt[start..end]
+		if start >= 0 && end > start {
+			return in_txt[start..end]
+		}
 	}
 	return ""
 }
@@ -65,7 +67,9 @@ pub fn (re RE) get_group_by_id(in_txt string, group_id int) string {
 		index := group_id << 1
 		start := re.groups[index]
 		end   := re.groups[index + 1]
-		return in_txt[start..end]
+		if start >= 0 && end > start {
+			return in_txt[start..end]
+		}
 	}
 	return ""
 }
@@ -307,8 +311,8 @@ pub fn (mut re RE) find_all_str(in_txt string) []string {
 * Replacers
 *
 ******************************************************************************/
-// replace return a string where the matches are replaced with the replace string
-pub fn (mut re RE) replace(in_txt string, repl string) string {
+// replace_simple returns a string where the matches are replaced with the replace string
+pub fn (mut re RE) replace_simple(in_txt string, repl string) string {
 	pos := re.find_all(in_txt)

 	if pos.len > 0 {
@@ -331,6 +335,7 @@ pub fn (mut re RE) replace(in_txt string, repl string) string {
 	return in_txt
 }

+
 // type of function used for custom replace
 // in_txt  source text
 // start   index of the start of the match in in_txt
@@ -378,3 +383,67 @@ pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
 	}
 	return res.str()
 }
+
+// parsed_replace_string returns repl with each \0..\9 replaced by the text of that group, read from in_txt via re.get_group_by_id
+fn (re RE) parsed_replace_string(in_txt string, repl string) string {
+	str_lst := repl.split("\\")
+	mut res := str_lst[0]
+	mut i := 1
+	for i < str_lst.len {
+		tmp := str_lst[i]
+		// tmp followed a backslash in repl: a leading digit 0-9 is a group id, anything else is kept literally
+		if tmp.len > 0 && tmp[0] >= `0` && tmp[0] <= `9` {
+			group_id := int(tmp[0] - `0`)
+			group := re.get_group_by_id(in_txt, group_id)
+			// append the group text, then whatever followed the digit in this piece
+			res += "${group}${tmp[1..]}"
+		} else {
+			res += '\\'+tmp
+		}
+		i++
+	}
+	return res
+}
+
+// replace returns in_txt with every non-empty match replaced by repl_str; text outside the matches is copied unchanged.
+// Group references \0..\9 in repl_str are expanded for each match by parsed_replace_string.
+pub fn (mut re RE) replace(in_txt string, repl_str string) string {
+	mut i   := 0
+	mut res := strings.new_builder(in_txt.len)
+	mut last_end    := 0
+
+	for i < in_txt.len {
+		// find_from is given i, which is advanced past each match below
+		s, e := re.find_from(in_txt,i)
+		// s and e are used below as start/end indexes into in_txt
+		if s >= 0 && e > s  {
+			// copy the unmatched text between the previous match and this one
+			
+			if last_end < s {
+				res.write_string(in_txt[last_end..s])
+			}
+
+			for g_i in 0..re.group_count {
+				re.groups[g_i << 1      ] += i
+				re.groups[(g_i << 1) + 1] += i
+			}
+			
+			// re.groups was shifted by i above so it can slice in_txt (presumably find_from stores bounds relative to i - confirm)
+			repl := re.parsed_replace_string(in_txt, repl_str)
+			// repl is the replacement text with its group references expanded
+			res.write_string(repl)
+			// resume the search right after this match
+			
+			last_end = e
+			i = e
+		} else {
+			break
+			// stop: no match was found, or the match was empty (e <= s)
+		}
+		// i == e here: the next search starts just past the replaced match
+	}
+	if last_end >= 0 && last_end < in_txt.len {
+		res.write_string(in_txt[last_end..])
+	}
+	return res.str()
+}