regex: new options returning interface to the regex module (#6062)

2023-08-10 21:13:21 +03:00 · 2020-08-08 08:04:12 +02:00 · 2020-08-08 08:04:12 +02:00 · 2a4ef2acbd
commit 2a4ef2acbd
parent 664c26ab4b
4 changed files with 268 additions and 274 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@ -1,4 +1,4 @@
-# V RegEx (Regular expression) 0.9d
+# V RegEx (Regular expression) 0.9g

 [TOC]

@ -137,7 +137,7 @@ The "capture groups" are store as couple of index in the field `groups` that is
 ```v
 text := "cpaz cpapaz cpapapaz"
 query:= r"(c(pa)+z ?)+"
-re, _, _ := regex.regex(query) 
+mut re := regex.regex_opt(query) or { panic(err) }

 println(re.get_query())
 // #0(c#1(pa)+z ?)+  // #0 and #1 are the ids of the groups, are shown if re.debug is 1 or 2
@ -155,8 +155,6 @@ for gi < re.groups.len {
 // groups captured
 // 0 :[cpapapaz]
 // 1 :[pa]
-
-
 ```

 **note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`*
@ -187,48 +185,41 @@ fn example2() {
 	text := "tst: 01,23,45 ,56, 78"
 	query:= r".*:(\s*\d+[\s,]*)+"

-	mut re := regex.new_regex()
+	mut re := regex.new()
 	//re.debug = 2
 	re.group_csave = [-1].repeat(3*20+1)  // we expect max 20 records

-	re_err, err_pos := re.compile(query)
-	if re_err == regex.COMPILE_OK {
-		q_str := re.get_query()
-		println("Query: $q_str")
-	
-		start, end := re.match_string(text)
-		if start < 0 {
-			println("ERROR : ${re.get_parse_error_string(start)}, $start")
-		} else {
-			println("found in [$start, $end] => [${text[start..end]}]")
-		}
+	re.compile_opt(query) or { println(err) return }

-		// groups capture
-		mut gi := 0
-		for gi < re.groups.len {
-			if re.groups[gi] >= 0 {
-				println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
-			}
-			gi += 2
-		}
+    q_str := re.get_query()
+    println("Query: $q_str")

-		// continuous saving
-		gi = 0
-		println("num: ${re.group_csave[0]}")
-		for gi < re.group_csave[0] {
-			id := re.group_csave[1+gi*3]
-			st := re.group_csave[1+gi*3+1]
-			en := re.group_csave[1+gi*3+2]
-			println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
-			gi++
-		}
-	} else {
-		println("query: $query")
-		lc := "-".repeat(err_pos)
-		println("err  : $lc^")
-		err_str := re.get_parse_error_string(re_err)
-		println("ERROR: $err_str")	
-	}
+    start, end := re.match_string(text)
+    if start < 0 {
+        println("ERROR : ${re.get_parse_error_string(start)}, $start")
+    } else {
+        println("found in [$start, $end] => [${text[start..end]}]")
+    }
+
+    // groups capture
+    mut gi := 0
+    for gi < re.groups.len {
+        if re.groups[gi] >= 0 {
+            println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
+        }
+        gi += 2
+    }
+
+    // continuous saving
+    gi = 0
+    println("num: ${re.group_csave[0]}")
+    for gi < re.group_csave[0] {
+        id := re.group_csave[1+gi*3]
+        st := re.group_csave[1+gi*3+1]
+        en := re.group_csave[1+gi*3+2]
+        println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
+        gi++
+    }
 }
 ```

@ -261,73 +252,65 @@ Have a look at the example for the use of them.
 example:

 ```v
+import regex
 fn main() {
 	test_regex()

 	text := "http://www.ciao.mondo/hello/pippo12_/pera.html"
 	query:= r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+"

-	mut re := new_regex()
+	mut re := new()
 	re.debug = 2

 	// must provide an array of the right size if want the continuos saving of the groups
 	re.group_csave = [-1].repeat(3*20+1)

-	re_err, err_pos := re.compile(query)
-	if re_err == COMPILE_OK {
-		q_str := re.get_query()
-		println("O.Query: $query")
-		println("Query  : $q_str")
-		
-		re.debug = 0	
-		start, end := re.match_string(text)
-		if start < 0 {
-			err_str := re.get_parse_error_string(start)
-			println("ERROR : $err_str, $start")
-		} else {
-			text1 := text[start..end]
-			println("found in [$start, $end] => [$text1]")
-		}
+	re.compile_opt(query) or { println(err) return }

-		// groups
-		mut gi := 0
-		for gi < re.groups.len {
-			if re.groups[gi] >= 0 {
-				println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
-			}
-			gi += 2
-		}
-		// continuous saving
-		gi = 0
-		println("num of group item saved: ${re.group_csave[0]}")
-		for gi < re.group_csave[0] {
-			id := re.group_csave[1+gi*3]
-			st := re.group_csave[1+gi*3+1]
-			en := re.group_csave[1+gi*3+2]
-			println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
-			gi++
-		}
-		println("raw array: ${re.group_csave[0..gi*3+2-1]}")
+    q_str := re.get_query()
+    println("O.Query: $query")
+    println("Query  : $q_str")
+    
+    re.debug = 0	
+    start, end := re.match_string(text)
+    if start < 0 {
+        err_str := re.get_parse_error_string(start)
+        println("ERROR : $err_str, $start")
+    } else {
+        text1 := text[start..end]
+        println("found in [$start, $end] => [$text1]")
+    }

-		// named capturing groups
-		println("named capturing groups:")
-		for g_name in re.group_map.keys() {
-			s,e := re.get_group(g_name)
-			if s >= 0 && e > s {
-				println("'${g_name}':[$s, $e] => '${text[s..e]}'")
-			} else {
-				println("Group [${g_name}] doesn't exist.")
-			}
-		}
-		
-	} else {
-		println("query: $query")
-		lc := "-".repeat(err_pos)
-		println("err  : $lc^")
-		err_str := re.get_parse_error_string(re_err)
-		println("ERROR: $err_str")	
-	}
+    // groups
+    mut gi := 0
+    for gi < re.groups.len {
+        if re.groups[gi] >= 0 {
+            println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
+        }
+        gi += 2
+    }
+    // continuous saving
+    gi = 0
+    println("num of group item saved: ${re.group_csave[0]}")
+    for gi < re.group_csave[0] {
+        id := re.group_csave[1+gi*3]
+        st := re.group_csave[1+gi*3+1]
+        en := re.group_csave[1+gi*3+2]
+        println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
+        gi++
+    }
+    println("raw array: ${re.group_csave[0..gi*3+2-1]}")

+    // named capturing groups
+    println("named capturing groups:")
+    for g_name in re.group_map.keys() {
+        s,e := re.get_group(g_name)
+        if s >= 0 && e > s {
+            println("'${g_name}':[$s, $e] => '${text[s..e]}'")
+        } else {
+            println("Group [${g_name}] doesn't exist.")
+        }
+    }
 }
 ```

@ -360,7 +343,7 @@ It is possible to set some flags in the regex parser that change the behavior of

 ```v
 // example of flag settings
-mut re := regex.new_regex()
+mut re := regex.new()
 re.flag = regex.F_BIN 

 ```
@ -382,22 +365,22 @@ These functions are helper that create the `RE` struct, a `RE` struct can be cre

 ```v
 // regex create a regex object from the query string and compile it
-pub fn regex(in_query string) (RE,int,int)
+pub fn regex_opt(in_query string) ?RE
 ```

 #### **Base initializer**

 ```v
 // new_regex create a REgex of small size, usually sufficient for ordinary use
-pub fn new_regex() RE
+pub fn new() RE

 // new_regex_by_size create a REgex of large size, mult specify the scale factor of the memory that will be allocated
-pub fn new_regex_by_size(mult int) RE
+pub fn new_by_size(mult int) RE
 ```
 After a base initializer is used, the regex expression must be compiled with:
 ```v
-// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
-pub fn (re mut RE) compile(in_txt string) (int,int)
+// compile_opt compiles the REgex, returning an error if the compilation fails
+pub fn (re mut RE) compile_opt(in_txt string) ?
 ```

 ### Operative Functions
@ -428,20 +411,9 @@ the following example code show how to visualize the syntax errors in the compil

 ```v
 query:= r"ciao da ab[ab-]"  // there is an error, a range not closed!!
-mut re := new_regex()
+mut re := new()

-// re_err ==> is the return value, if < 0 it is an error
-// re_pos ==> if re_err < 0, re_pos is the error index in the query string 
-re_err, err_pos := re.compile(query)
-
-// print the error if one happen
-if re_err != COMPILE_OK {
-	println("query: $query")
-    lc := "-".repeat(err_pos)
-    println("err  : $lc^")
-    err_str := re.get_parse_error_string(re_err)  // get the error string
-    println("ERROR: $err_str")
-}
+re.compile_opt(query) or { println(err) }

 // output!!

@ -543,7 +515,7 @@ fn custom_print(txt string) {
 	println("my log: $txt")
 }

-mut re := new_regex()
+mut re := new()
 re.log_func = custom_print  // every debug output from now will call this function

 ```
@ -571,38 +543,29 @@ tests = [

 fn example() {
 	for c,tst in tests {
-		mut re := regex.new_regex()
-		re_err, err_pos := re.compile(tst.query)
-		if re_err == regex.COMPILE_OK {
+		mut re := regex.new()
+		re.compile_opt(tst.query) or { println(err) continue }
 			
-			// print the query parsed with the groups ids
-			re.debug = 1 // set debug on at minimum level
-			println("#${c:2d} query parsed: ${re.get_query()}")
-			re.debug = 0
-			
-			// do the match
-			start, end := re.match_string(tst.source)
-			if start >= 0 && end > start {
-				println("#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]")
-			}	
-			
-			// print the groups
-			mut gi := 0
-			for gi < re.groups.len {
-				if re.groups[gi] >= 0 {
-					println("group ${gi/2:2d} :[${tst.source[re.groups[gi]..re.groups[gi+1]]}]")
-				}
-				gi += 2
-			}		
-			println("")
-		} else {
-			// print the compile error
-			println("query: $tst.query")
-			lc := "-".repeat(err_pos-1)
-			println("err  : $lc^")
-			err_str := re.get_parse_error_string(re_err)
-			println("ERROR: $err_str")
-		}
+        // print the query parsed with the groups ids
+        re.debug = 1 // set debug on at minimum level
+        println("#${c:2d} query parsed: ${re.get_query()}")
+        re.debug = 0
+        
+        // do the match
+        start, end := re.match_string(tst.source)
+        if start >= 0 && end > start {
+            println("#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]")
+        }	
+        
+        // print the groups
+        mut gi := 0
+        for gi < re.groups.len {
+            if re.groups[gi] >= 0 {
+                println("group ${gi/2:2d} :[${tst.source[re.groups[gi]..re.groups[gi+1]]}]")
+            }
+            gi += 2
+        }		
+        println("")
 	}
 }

--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -1,6 +1,6 @@
 /*

-regex 0.9e
+regex 0.9g

 Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
 Use of this source code is governed by an MIT license
@ -19,7 +19,7 @@ module regex
 import strings

 pub const(
-	v_regex_version = "0.9e"      // regex module version
+	v_regex_version = "0.9g"      // regex module version

 	max_code_len     = 256        // default small base code len for the regex programs
 	max_quantifier   = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
@ -912,7 +912,12 @@ fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
 // main compiler
 //
 // compile return (return code, index) where index is the index of the error in the query string if return code is an error code
+[deprecated]
 pub fn (mut re RE) compile(in_txt string) (int,int) {
+	return re.impl_compile(in_txt)
+}    
+
+fn (mut re RE) impl_compile(in_txt string) (int,int) {
 	mut i        := 0      // input string index
 	mut pc       := 0      // program counter
 	mut tmp_code := u32(0)
@ -2187,6 +2192,7 @@ Public functions
 //

 // regex create a regex object from the query string
+[deprecated]
 pub fn regex(in_query string) (RE,int,int){
 	mut re := RE{}
 	re.prog = [Token{}].repeat(in_query.len+1)
@ -2198,12 +2204,17 @@ pub fn regex(in_query string) (RE,int,int){
 }

 // new_regex create a RE of small size, usually sufficient for ordinary use
+[deprecated]
 pub fn new_regex() RE {
-	return new_regex_by_size(1)
+	return impl_new_regex_by_size(1)
 }

 // new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated
+[deprecated]
 pub fn new_regex_by_size(mult int) RE {
+	return impl_new_regex_by_size(mult)
+}    
+fn impl_new_regex_by_size(mult int) RE {
 	mut re := RE{}
 	re.prog = [Token{}].repeat(max_code_len*mult)       // max program length, default 256 istructions
 	re.cc = [CharClass{}].repeat(max_code_len*mult)     // char class list
--- a/vlib/regex/regex_opt.v
+++ b/vlib/regex/regex_opt.v
@ -0,0 +1,34 @@
+module regex
+import strings
+
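+// compile_opt compiles the RE pattern string; on failure it returns an error whose message shows the query and error position and whose code is the compile error code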
+pub fn (mut re RE) compile_opt(pattern string) ? {
+	re_err,err_pos := re.impl_compile(pattern)
+	
+	if re_err != compile_ok {
+		mut err_msg := strings.new_builder(300)
+		err_msg.write("query: $pattern\n")
+		line := "-".repeat(err_pos)
+		err_msg.write("err  : ${line}^\n")
+		err_str := re.get_parse_error_string(re_err)
+		err_msg.write("ERROR: $err_str\n")
+		return error_with_code(err_msg.str(), re_err)
+	}
+}
+
+// new creates a RE of small size, usually sufficient for ordinary use
+pub fn new() RE {
+	return impl_new_regex_by_size(1)
+}
+
+// new_by_size creates a RE of large size; mult specifies the scale factor of the memory that will be allocated
+pub fn new_by_size(mult int) RE {
+	return impl_new_regex_by_size(mult)
+}
+
+// regex_opt creates a new RE object from the RE pattern string, propagating the error if the pattern fails to compile
+pub fn regex_opt(pattern string) ?RE {
+	mut re := new()
+	re.compile_opt(pattern)?
+	return re
+}
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@ -175,48 +175,52 @@ fn test_regex(){
 		// debug print
 		//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)")

-		mut re, re_err, _ := regex.regex(to.q)
+		mut re := regex.regex_opt(to.q) or {
+			eprintln('err: $err')
+			assert false
+			continue
+		}
+
 		re.group_csave = [-1].repeat(3*20+1)

-		if re_err == regex.compile_ok {
-			start, end := re.match_string(to.src)
+		start, end := re.match_string(to.src)

-			mut tmp_str := ""
-			if start >= 0 && end  > start{
-				tmp_str = to.src[start..end]
-			}
+		mut tmp_str := ""
+		if start >= 0 && end  > start{
+			tmp_str = to.src[start..end]
+		}

-			if start != to.s || end != to.e {
-				println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
-				println("ERROR!")
-				//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
-				assert false
-				break
-			}
+		if start != to.s || end != to.e {
+			println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
+			println("ERROR!")
+			//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
+			assert false
+			continue
+		}

-			// check cgroups
-			if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
-				println("Capturing group len error!")
+		// check cgroups
+		if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
+			println("Capturing group len error!")
+			assert false
+			continue
+		}
+
+		// check captured groups
+		mut ln := re.group_csave[0]*3
+		for ln > 0 {
+			if re.group_csave[ln] != to.cg[ln] {
 				assert false
 			}
+			ln--
+		}

-			// check captured groups
-			mut ln := re.group_csave[0]*3
-			for ln > 0 {
-				if re.group_csave[ln] != to.cg[ln] {
-					assert false
-				}
-				ln--
+		// check named captured groups
+		for k in to.cgn.keys() {
+			if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
+				println("Named capturing group error! [$k]")
+				assert false
+				continue
 			}
-
-			// check named captured groups
-			for k in to.cgn.keys() {
-				if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
-					println("Named capturing group error! [$k]")
-					assert false
-				}
-			}
-
 		}
 	}

@ -225,29 +229,27 @@ fn test_regex(){
 		// debug print
 		//println("#$c [$to.src] q[$to.q] $to.r")

-		mut re, re_err, err_pos := regex.regex(to.q)
-		if re_err == regex.compile_ok {
-			res := re.find_all(to.src)
-			if res.len != to.r.len {
-				println("ERROR: find_all, array of different size.")
-				assert false
-			}
-
-			for c1,i in res {
-				if i != to.r[c1] {
-					println("ERROR: find_all, different indexes.")
-					assert false
-				}
-			}
-
-		} else {
-			println("query: $to.q")
-			lc := "-".repeat(err_pos-1)
-			println("err  : $lc^")
-			err_str := re.get_parse_error_string(re_err)
-			println("ERROR: $err_str")
+		mut re := regex.regex_opt(to.q) or {
+			eprintln('err: $err')
 			assert false
+			continue
 		}
+
+		res := re.find_all(to.src)
+		if res.len != to.r.len {
+			println("ERROR: find_all, array of different size.")
+			assert false
+			continue            
+		}
+
+		for c1,i in res {
+			if i != to.r[c1] {
+				println("ERROR: find_all, different indexes.")
+				assert false
+				continue
+			}
+		}
+
 	}

 	// check replace
@ -255,97 +257,81 @@ fn test_regex(){
 		// debug print
 		//println("#$c [$to.src] q[$to.q] $to.r")

-		mut re, re_err, err_pos := regex.regex(to.q)
-		if re_err == regex.compile_ok {
-			res := re.replace(to.src,to.rep)
-			if res != to.r {
-				println("ERROR: replace.")
-				assert false
-			}
-
-		} else {
-			println("query: $to.q")
-			lc := "-".repeat(err_pos-1)
-			println("err  : $lc^")
-			err_str := re.get_parse_error_string(re_err)
-			println("ERROR: $err_str")
+		mut re := regex.regex_opt(to.q) or {
+			eprintln('err: $err')
 			assert false
+			continue
 		}
+
+		res := re.replace(to.src,to.rep)
+		if res != to.r {
+			println("ERROR: replace.")
+			assert false
+			continue
+		}		
 	}

 	// check match and find
 	for c,to in match_test_suite {
 		// debug print
-		//println("#$c [$to.src] q[$to.q] $to.s")
+		println("#$c [$to.src] q[$to.q] $to.s $to.e")

 		// test the find
 		if to.s > 0 {
-			mut re, re_err, err_pos := regex.regex(to.q)
-			if re_err == regex.compile_ok {
-				//q_str := re.get_query()
-				//println("Query: $q_str")
-				start,end := re.find(to.src)
-
-				if start != to.s || end != to.e {
-					err_str := re.get_parse_error_string(start)
-					println("ERROR : $err_str")
-					assert false
-				} else {
-					//tmp_str := text[start..end]
-					//println("found in [$start, $end] => [$tmp_str]")
-					assert true
-				}
-
-			} else {
-				println("query: $to.q")
-				lc := "-".repeat(err_pos-1)
-				println("err  : $lc^")
-				err_str := re.get_parse_error_string(re_err)
-				println("ERROR: $err_str")
+			mut re := regex.regex_opt(to.q) or {
+				eprintln('err: $err')
 				assert false
+				continue
+			}                
+			// q_str := re.get_query()
+			// println("Query: $q_str")
+			start,end := re.find(to.src)
+
+			if start != to.s || end != to.e {
+				err_str := re.get_parse_error_string(start)
+				println("ERROR : $err_str")
+				assert false
+			} else {
+				//tmp_str := text[start..end]
+				//println("found in [$start, $end] => [$tmp_str]")
+				assert true
 			}
 			continue
 		}

 		// test the match
-		mut re := regex.new_regex()
+		mut re := regex.new()
 		//re.debug = true

-		re_err,err_pos := re.compile(to.q)
-		if re_err == regex.compile_ok {
-			//println("#$c [$to.src] q[$to.q]")
-			start, end := re.match_string(to.src)
-
-			mut tmp_str := ""
-			if start >= 0 && end  > start{
-				tmp_str = to.src[start..end]
-			}
-
-			if start != to.s || end != to.e {
-				println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
-				println("ERROR!")
-				//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
-				assert false
-				break
-			}
-
-			// rerun to test consistency
-			tmp_str1 := to.src.clone()
-			start1, end1 := re.match_string(tmp_str1)
-			if start1 != start || end1 != end {
-				println("two run ERROR!!")
-				assert false
-				break
-			}
-
-		} else {
-			println("query: $to.q")
-			lc := "-".repeat(err_pos-1)
-			println("err  : $lc^")
-			err_str := re.get_parse_error_string(re_err)
-			println("ERROR: $err_str")
+		re.compile_opt(to.q) or {
+			eprintln('err: $err')
 			assert false
-			break
+			continue
 		}
+		//println("#$c [$to.src] q[$to.q]")
+		start, end := re.match_string(to.src)
+
+		mut tmp_str := ""
+		if start >= 0 && end  > start{
+			tmp_str = to.src[start..end]
+		}
+
+		if start != to.s || end != to.e {
+			println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
+			println("ERROR!")
+			//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
+			assert false
+			continue
+		}
+
+		// rerun to test consistency
+		tmp_str1 := to.src.clone()
+		start1, end1 := re.match_string(tmp_str1)
+		if start1 != start || end1 != end {
+			println("two run ERROR!!")
+			assert false
+			continue
+		}
+
 	}
 }