regex: lots of fixes (#7380)

2023-08-10 21:13:21 +03:00 · 2020-12-18 05:57:31 +01:00
parent 05e15bdd59
commit a6baffcb8c
5 changed files with 635 additions and 485 deletions
--- a/examples/regex_example.v
+++ b/examples/regex_example.v
@@ -54,13 +54,13 @@ fn convert_html_rgb_n(in_col string) u32 {
 	println("start: $start, end: $end")
 	mut res := u32(0)
 	if start >= 0 {
-		red_s, red_e := re.get_group("red")
+		red_s, red_e := re.get_group_bounds_by_name("red")
 		r := ("0x" + in_col[red_s..red_e]).int() << col_mul
 		
-		green_s, green_e := re.get_group("green")
+		green_s, green_e := re.get_group_bounds_by_name("green")
 		g := ("0x" + in_col[green_s..green_e]).int() << col_mul
 		
-		blue_s, blue_e := re.get_group("blue")
+		blue_s, blue_e := re.get_group_bounds_by_name("blue")
 		b := ("0x" + in_col[blue_s..blue_e]).int() << col_mul
 		
 		println("r: $r g: $g b: $b")
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@@ -1,4 +1,4 @@
-# V RegEx (Regular expression) 0.9h
+# V RegEx (Regular expression) 1.0 alpha

 [TOC]

@@ -226,7 +226,18 @@ fn convert_html_rgb(in_col string) u32 {
 }
 ```

+Other utility functions are `get_group_by_id` and `get_group_bounds_by_id`,
+which directly return the string and the bounds of a group using its `id`:

+```v ignore
+txt := "my used string...."
+for g_index := 0; g_index < re.group_count ; g_index++ {
+	println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
+    	bounds: ${re.get_group_bounds_by_id(g_index)}") 
+}
+```
+
+More helper functions are listed in the **Groups query functions** section.

 ### Groups Continuous saving

@@ -251,59 +262,54 @@ The regex save until finish or found that the array have no space.
 If the space ends no error is raised, further records will not be saved.

 ```v ignore
-fn example2() {
-	test_regex()
-	text := 'tst: 01,23,45 ,56, 78'
-	query := r'.*:(\s*\d+[\s,]*)+'
-	mut re := new() or { panic(err) }
-	// re.debug = 2
-	re.group_csave_flag = true  // enable continuous capture
-	re.compile_opt(query) or {
-		println(err)
-		return
-	}
-	q_str := re.get_query()
-	println('Query: $q_str')
-	start, end := re.match_string(text)
-	if start < 0 {
-		println('ERROR : ${re.get_parse_error_string(start)}, $start')
-	} else {
-		println('found in [$start, $end] => [${text[start..end]}]')
-	}
-	// groups capture
-	mut gi := 0
-	for gi < re.groups.len {
-		if re.groups[gi] >= 0 {
-			println('${gi / 2} ${re.groups[gi]},${re.groups[gi + 1]} :[${text[re.groups[gi]..re.groups[gi +
-				1]]}]')
-		}
-		gi += 2
-	}
-	// continuous saving
-	gi = 0
-	println('num: ${re.group_csave[0]}')
-	for gi < re.group_csave[0] {
-		id := re.group_csave[1 + gi * 3]
-		st := re.group_csave[1 + gi * 3 + 1]
-		en := re.group_csave[1 + gi * 3 + 2]
-		println('cg id: $id [$st, $en] => [${text[st..en]}]')
-		gi++
-	}
+import regex
+fn main(){
+    txt   := "http://www.ciao.mondo/hello/pippo12_/pera.html"
+    query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
+
+    mut re := regex.regex_opt(query) or { panic(err) }
+    //println(re.get_code())   // uncomment to see the print of the regex execution code
+    re.debug=2  // enable maximum log
+    println("String: ${txt}")
+    println("Query : ${re.get_query()}")
+    re.debug=0  // disable log
+    re.group_csave_flag = true
+    start, end := re.match_string(txt)
+    if start >= 0 {
+        println("Match ($start, $end) => [${txt[start..end]}]")
+    } else {
+        println("No Match")
+    }
+
+    if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{
+        println("cg: $re.group_csave")
+        mut cs_i := 1
+        for cs_i < re.group_csave[0]*3 {
+            g_id := re.group_csave[cs_i]
+            st   := re.group_csave[cs_i+1]
+            en   := re.group_csave[cs_i+2]
+            println("cg[$g_id] $st $en:[${txt[st..en]}]")
+            cs_i += 3
+        }
+    }
 }
 ```

 The output will be:

 ```
-Query: .*:(\s*\d+[\s,]*)+
-found in [0, 21] => [tst: 01,23,45 ,56, 78]
-0 19,21 :[78]
-num: 5
-cg id: 0 [4, 8] => [ 01,]
-cg id: 0 [8, 11] => [23,]
-cg id: 0 [11, 15] => [45 ,]
-cg id: 0 [15, 19] => [56, ]
-cg id: 0 [19, 21] => [78]
+String: http://www.ciao.mondo/hello/pippo12_/pera.html
+Query : #0(?P<format>https?)|{8,14}#0(?P<format>ftps?)://#1(?P<token>[\w_]+.)+
+Match (0, 46) => [http://www.ciao.mondo/hello/pippo12_/pera.html]
+cg: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
+cg[0] 0 4:[http]
+cg[1] 7 11:[www.]
+cg[1] 11 16:[ciao.]
+cg[1] 16 22:[mondo/]
+cg[1] 22 28:[hello/]
+cg[1] 28 37:[pippo12_/]
+cg[1] 37 42:[pera.]
+cg[1] 42 46:[html]
 ```

 ### Named capturing groups
@@ -323,89 +329,42 @@ example:

 ```v ignore
 import regex
+fn main(){
+    txt   := "http://www.ciao.mondo/hello/pippo12_/pera.html"
+    query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"

-fn main() {
-	test_regex()
-	text := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
-	query := r'(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+'
-	mut re := new()
-	re.debug = 2
-	// must provide an array of the right size if want the continuous saving of the groups
-	re.group_csave = [-1].repeat(3 * 20 + 1)
-	re.compile_opt(query) or {
-		println(err)
-		return
-	}
-	q_str := re.get_query()
-	println('O.Query: $query')
-	println('Query  : $q_str')
-	re.debug = 0
-	start, end := re.match_string(text)
-	if start < 0 {
-		err_str := re.get_parse_error_string(start)
-		println('ERROR : $err_str, $start')
-	} else {
-		text1 := text[start..end]
-		println('found in [$start, $end] => [$text1]')
-	}
-	// groups
-	mut gi := 0
-	for gi < re.groups.len {
-		if re.groups[gi] >= 0 {
-			println('${gi / 2} ${re.groups[gi]},${re.groups[gi + 1]} :[${text[re.groups[gi]..re.groups[gi +
-				1]]}]')
-		}
-		gi += 2
-	}
-	// continuous saving
-	gi = 0
-	println('num of group item saved: ${re.group_csave[0]}')
-	for gi < re.group_csave[0] {
-		id := re.group_csave[1 + gi * 3]
-		st := re.group_csave[1 + gi * 3 + 1]
-		en := re.group_csave[1 + gi * 3 + 2]
-		println('cg id: $id [$st, $en] => [${text[st..en]}]')
-		gi++
-	}
-	println('raw array: ${re.group_csave[0..gi * 3 + 2 - 1]}')
-	// named capturing groups
-	println('named capturing groups:')
-	for g_name in re.group_map.keys() {
-		s, e := re.get_group(g_name)
-		if s >= 0 && e > s {
-			println("'$g_name':[$s, $e] => '${text[s..e]}'")
-		} else {
-			println("Group [$g_name] doesn't exist.")
-		}
-	}
+    mut re := regex.regex_opt(query) or { panic(err) }
+    //println(re.get_code())   // uncomment to see the print of the regex execution code
+    re.debug=2  // enable maximum log
+    println("String: ${txt}")
+    println("Query : ${re.get_query()}")
+    re.debug=0  // disable log
+    start, end := re.match_string(txt)
+    if start >= 0 {
+        println("Match ($start, $end) => [${txt[start..end]}]")
+    } else {
+        println("No Match")
+    }
+
+    for name in re.group_map.keys() {
+        println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
+        bounds: ${re.get_group_bounds_by_name(name)}")
+    }
 }
 ```

 Output:

 ```
-O.Query: (?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+
-Query  : #0(?P<format>https?)|{8,14}(?:ftps?)://#1(?P<token>[\w_]+.)+
-found in [0, 46] => [http://www.ciao.mondo/hello/pippo12_/pera.html]
-0 0,4 :[http]
-1 42,46 :[html]
-num of group item saved: 8
-cg id: 0 [0, 4] => [http]
-cg id: 1 [7, 11] => [www.]
-cg id: 1 [11, 16] => [ciao.]
-cg id: 1 [16, 22] => [mondo/]
-cg id: 1 [22, 28] => [hello/]
-cg id: 1 [28, 37] => [pippo12_/]
-cg id: 1 [37, 42] => [pera.]
-cg id: 1 [42, 46] => [html]
-raw array: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
-named capturing groups:
-'format':[0, 4] => 'http'
-'token':[42, 46] => 'html'
+String: http://www.ciao.mondo/hello/pippo12_/pera.html
+Query : #0(?P<format>https?)|{8,14}#0(?P<format>ftps?)://#1(?P<token>[\w_]+.)+
+Match (0, 46) => [http://www.ciao.mondo/hello/pippo12_/pera.html]
+group:'format' 	=> [http] bounds: (0, 4)
+group:'token' 	=> [html] bounds: (42, 46)
 ```

 In order to simplify the use of the named groups it possible to use names map in the `re`
-struct using the function `re.get_group`.
+struct using the functions `re.get_group_by_name` and `re.get_group_bounds_by_name`.

 Here a more complex example of use:

@@ -420,11 +379,11 @@ fn convert_html_rgb_n(in_col string) u32 {
 	println('start: $start, end: $end')
 	mut res := u32(0)
 	if start >= 0 {
-		red_s, red_e := re.get_group('red')
+		red_s, red_e := re.get_group_bounds_by_name('red')
 		r := ('0x' + in_col[red_s..red_e]).int() << col_mul
-		green_s, green_e := re.get_group('green')
+		green_s, green_e := re.get_group_bounds_by_name('green')
 		g := ('0x' + in_col[green_s..green_e]).int() << col_mul
-		blue_s, blue_e := re.get_group('blue')
+		blue_s, blue_e := re.get_group_bounds_by_name('blue')
 		b := ('0x' + in_col[blue_s..blue_e]).int() << col_mul
 		println('r: $r g: $g b: $b')
 		res = u32(r) << 16 | u32(g) << 8 | u32(b)
@@ -433,7 +392,45 @@ fn convert_html_rgb_n(in_col string) u32 {
 }
 ```

+Other utility functions are `get_group_by_name` and `get_group_bounds_by_name`,
+which directly return the string and the bounds of a group using its `name`:

+```v ignore
+txt := "my used string...."
+for name in re.group_map.keys() {
+	println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
+    bounds: ${re.get_group_bounds_by_name(name)}")
+}
+```
+
+
+
+### Groups query functions
+
+These functions are helpers to query the captured groups:
+
+```v ignore
+// get_group_bounds_by_name get a group boundaries by its name
+pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) 
+
+// get_group_by_name get a group string by its name
+pub fn (re RE) get_group_by_name(in_txt string, group_name string) string
+
+// get_group_bounds_by_id get a group boundaries by its id
+pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int)
+
+// get_group_by_id get a group string by its id
+pub fn (re RE) get_group_by_id(in_txt string, group_id int) string
+
+struct Re_group {
+pub:
+	start int = -1
+	end   int = -1
+}
+
+// get_group_list return a list of Re_group for the found groups
+pub fn (re RE) get_group_list() []Re_group
+```

 ## Flags

@@ -501,6 +498,48 @@ pub fn (re mut RE) find_all(in_txt string) []int
 pub fn (re mut RE) replace(in_txt string, repl string) string
 ```

+## Find and Replace
+
+For complex find and replace operations, the function `replace_by_fn` is available.
+`replace_by_fn` uses a custom replace function, making customized replacements possible.
+**The custom function is called for every non-overlapping match.**
+The custom function must be of the type:
+```v ignore
+fn (re RE, in_txt string, start int, end int) string
+```
+
+The following example will clarify the use:
+
+```v ignore
+import regex
+// customized replace function
+// it is called on each non-overlapping match
+fn my_repl(re regex.RE, in_txt string, start int, end int) string {
+    g0 := re.get_group_by_id(in_txt, 0)
+    g1 := re.get_group_by_id(in_txt, 1)
+    g2 := re.get_group_by_id(in_txt, 2)
+    return "*$g0*$g1*$g2*"    
+}
+
+fn main(){
+    txt   := "today [John] is gone to his house with (Jack) and [Marie]."
+    query := r"(.)(\A\w+)(.)"
+
+    mut re := regex.regex_opt(query) or { panic(err) }
+   
+    result := re.replace_by_fn(txt, my_repl)
+    println(result)
+}
+```
+
+Output:
+
+```
+today *[*John*]* is gone to his house with *(*Jack*)* and *[*Marie*]*.
+```
+
+
+
 ## Debugging

 This module has few small utilities to help the writing of regex expressions.
@@ -527,11 +566,20 @@ The result will be something like this:

 ```
 ========================================
-v RegEx compiler v 0.9c output:
-PC:  0 ist: 7fffffff [a]      query_ch {  1,  1}
-PC:  1 ist: 7fffffff [b]      query_ch {  1,MAX}
-PC:  2 ist: 88000000 PROG_END {  0,  0}
+v RegEx compiler v 1.0 alpha output:
+PC:  0 ist: 92000000 (        GROUP_START #:0 {  1,  1}
+PC:  1 ist: 98000000 .        DOT_CHAR nx chk: 4 {  1,  1}
+PC:  2 ist: 94000000 )        GROUP_END   #:0 {  1,  1}
+PC:  3 ist: 92000000 (        GROUP_START #:1 {  1,  1}
+PC:  4 ist: 90000000 [\A]     BSLS {  1,  1}
+PC:  5 ist: 90000000 [\w]     BSLS {  1,MAX}
+PC:  6 ist: 94000000 )        GROUP_END   #:1 {  1,  1}
+PC:  7 ist: 92000000 (        GROUP_START #:2 {  1,  1}
+PC:  8 ist: 98000000 .        DOT_CHAR nx chk: -1 last! {  1,  1}
+PC:  9 ist: 94000000 )        GROUP_END   #:2 {  1,  1}
+PC: 10 ist: 88000000 PROG_END {  0,  0}
 ========================================
+
 ```

 `PC`:`int` is the program counter or step of execution, each single step is a token.
@@ -625,54 +673,29 @@ re.log_func = custom_print

 Here there is a simple code to perform some basically match of strings

-```v oksyntax
-struct TestObj {
-	source string // source string to parse
-	query  string // regex query string
-	s      int // expected match start index
-	e      int // expected match end index
-}
+```v ignore
+import regex

-const (
-	tests = [
-		TestObj{'this is a good.', r'this (\w+) a', 0, 9},
-		TestObj{'this,these,those. over', r'(th[eio]se?[,. ])+', 0, 17},
-		TestObj{'test1@post.pip.com, pera', r'[\w]+@([\w]+\.)+\w+', 0, 18},
-		TestObj{'cpapaz ole. pippo,', r'.*c.+ole.*pi', 0, 14},
-		TestObj{'adce aabe', r'(a(ab)+)|(a(dc)+)e', 0, 4},
-	]
-)
+fn main(){
+    txt   := "http://www.ciao.mondo/hello/pippo12_/pera.html"
+    query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"

-fn example() {
-	for c, tst in tests {
-		mut re := regex.new()
-		re.compile_opt(tst.query) or {
-			println(err)
-			continue
-		}
-		// print the query parsed with the groups ids
-		re.debug = 1 // set debug on at minimum level
-		println('#${c:2d} query parsed: $re.get_query()')
-		re.debug = 0
-		// do the match
-		start, end := re.match_string(tst.source)
-		if start >= 0 && end > start {
-			println('#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]')
-		}
-		// print the groups
-		mut gi := 0
-		for gi < re.groups.len {
-			if re.groups[gi] >= 0 {
-				println('group ${gi / 2:2d} :[${tst.source[re.groups[gi]..re.groups[gi + 1]]}]')
-			}
-			gi += 2
-		}
-		println('')
-	}
-}
-
-fn main() {
-	example()
+    mut re := regex.regex_opt(query) or { panic(err) }
+   
+    start, end := re.match_string(txt)
+    if start >= 0 {
+        println("Match ($start, $end) => [${txt[start..end]}]")
+        for g_index := 0; g_index < re.group_count ; g_index++ {
+            println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
+            bounds: ${re.get_group_bounds_by_id(g_index)}")  
+        }
+        for name in re.group_map.keys() {
+            println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
+            bounds: ${re.get_group_bounds_by_name(name)}")
+        }
+    } else {
+        println("No Match")
+    }
 }
 ```

--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
--- a/vlib/regex/regex_opt.v
+++ b/vlib/regex/regex_opt.v
@@ -7,7 +7,7 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
 	
 	if re_err != compile_ok {
 		mut err_msg := strings.new_builder(300)
-		err_msg.write("query: $pattern\n")
+		err_msg.write("\nquery: $pattern\n")
 		line := "-".repeat(err_pos)
 		err_msg.write("err  : ${line}^\n")
 		err_str := re.get_parse_error_string(re_err)
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@@ -21,6 +21,10 @@ match_test_suite = [
 	TestItem{"b",r"b|a",0,1},
 	TestItem{"c",r"b|a",-1,0},

+	// test base
+	TestItem{"[ciao]",r"(.)ciao(.)",0,6},
+	TestItem{"[ciao] da me",r"(.)ciao(.)",0,6},
+
 	// positive
 	TestItem{"this is a good.",r"this",0,4},
 	TestItem{"this is a good.",r"good",10,14},
@@ -193,7 +197,8 @@ cgroups_test_suite = [
 	TestItemCGroup{
 		"http://www.ciao.mondo/hello/pippo12_/pera.html",
 		r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
-		[8, 0, 0, 4, 1, 7, 12, 1, 11, 17, 1, 16, 23, 1, 22, 29, 1, 28, 38, 1, 37, 43, 1, 42, 46],
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
+		//[8, 0, 0, 4, 1, 7, 10, 1, 11, 15, 1, 16, 21, 1, 22, 27, 1, 28, 36, 1, 37, 41, 1, 42, 46],		
 		{'format':int(0),'token':1}
 	},
 	TestItemCGroup{