regex: refactoring, documentation, examples (#7418)

2023-08-10 21:13:21 +03:00 · 2020-12-20 04:52:02 +01:00 · 2020-12-20 04:52:02 +01:00 · b29bcb3fbe
commit b29bcb3fbe
parent 8278af4ee8
4 changed files with 279 additions and 158 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@ -471,10 +471,23 @@ pub fn regex_opt(in_query string) ?RE
 // new_regex create a REgex of small size, usually sufficient for ordinary use
 pub fn new() RE

-// new_regex_by_size create a REgex of large size, mult specify the scale factor of the memory that will be allocated
-pub fn new_by_size(mult int) RE
 ```
-After a base initializer is used, the regex expression must be compiled with:
+#### **Custom initialization**
+For particular needs it is possible to initialize a fully customized regex:
+```v ignore
+// init custom regex
+mut re := regex.RE{}
+re.prog = []Token    {len: pattern.len + 1} // max program length, can not be longer than the pattern
+re.cc   = []CharClass{len: pattern.len}     // there can not be more char classes than the length of the pattern
+
+re.group_csave_flag = false          // set to true to enable continuous group saving if needed
+re.group_max_nested = 128            // allow at most 128 nested groups
+re.group_max        = pattern.len>>1 // we can't have more groups than half of the pattern length
+```
+### Compiling
+
+After an initializer is used, the regex expression must be compiled with:
+
 ```v ignore
 // compile compiles the REgex returning an error if the compilation fails
 pub fn (re mut RE) compile_opt(in_txt string) ?
@ -500,11 +513,38 @@ pub fn (re mut RE) replace(in_txt string, repl string) string

 ## Find and Replace

+There are the following find and replace functions:
+
+#### Find functions
+
+```v ignore
+// find tries to find the first match in the input string; it returns the start and end index if found, else start is -1
+pub fn (re mut RE) find(in_txt string) (int,int)
+
+// find_all finds all the "non overlapping" occurrences of the matching pattern
+// and returns a list of start/end indexes like: [3,4,6,8]
+// where the matches are [3,4] and [6,8]
+pub fn (re mut RE) find_all(in_txt string) []int
+```
+
+#### Replace functions
+
+```v ignore
+// replace returns a string where the matches are replaced with the replace string; only non-overlapping matches are used
+pub fn (re mut RE) replace(in_txt string, repl string) string
+```
+
+#### Custom replace function
+
 For complex find and replace operations the function `replace_by_fn` is available.
 `replace_by_fn` uses a custom replace function, making customizations possible.
 **The custom function is called for every non-overlapping match.**
 The custom function must be of the type:
+
 ```v ignore
+// re RE struct
+// in_txt all the text passed to the regex expression
+// the match is: in_txt[start..end]
 fn (re RE, in_txt string, start int, end int) string
 ```

@ -671,7 +711,7 @@ re.log_func = custom_print

 ## Example code

-Here there is a simple code to perform some basically match of strings
+Here is an example that performs some basic string matching:

 ```v ignore
 import regex
@ -698,5 +738,63 @@ fn main(){
    }
 }
 ```
+Here is an example of full customization of the regex environment creation:
+```v ignore
+import regex
+
+fn main(){
+    txt   := "today John is gone to his house with Jack and Marie."
+    query := r"(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+"
+
+    // init regex
+    mut re := regex.RE{}
+    re.prog = []regex.Token    {len: query.len + 1} // max program length, can not be longer than the query
+    re.cc   = []regex.CharClass{len: query.len}     // there can not be more char classes than the length of the query
+    re.prog = []regex.Token    {len: query.len+1}
+    re.group_csave_flag = true         // enable continuous group saving
+    re.group_max_nested = 128          // allow at most 128 nested groups
+    re.group_max        = query.len>>1 // we can't have more groups than half of the query length
+    
+    // compile the query
+    re.compile_opt(query) or { panic(err) }
+
+    start, end := re.match_string(txt)
+    if start >= 0 {
+        println("Match ($start, $end) => [${txt[start..end]}]")
+    } else {
+        println("No Match")
+    }
+
+    // show results for continuous group saving
+    if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{
+        println("cg: $re.group_csave")
+        mut cs_i := 1
+        for cs_i < re.group_csave[0]*3 {
+            g_id := re.group_csave[cs_i]
+            st   := re.group_csave[cs_i+1]
+            en   := re.group_csave[cs_i+2]
+            println("cg[$g_id] $st $en:[${txt[st..en]}]")
+            cs_i += 3
+        }
+    }
+
+    // show results for captured groups
+    if start >= 0 {
+        println("Match ($start, $end) => [${txt[start..end]}]")
+        for g_index := 0; g_index < re.group_count ; g_index++ {
+            println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
+            bounds: ${re.get_group_bounds_by_id(g_index)}")  
+        }
+        for name in re.group_map.keys() {
+            println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
+            bounds: ${re.get_group_bounds_by_name(name)}")
+        }
+    } else {
+        println("No Match")
+    }
+}
+```
+
+

 More example code is available in the test code for the `regex` module: `vlib/regex/regex_test.v`.
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -266,11 +266,11 @@ fn (mut tok Token) reset() {
 	tok.rep = 0
 }

-/*
-
-Regex struct
-
-*/
+/******************************************************************************
+*
+* Regex struct
+*
+******************************************************************************/
 pub const (
 	f_nl  = 0x00000001  // end the match when find a new line symbol
 	f_ms  = 0x00000002  // match true only if the match is at the start of the string
@ -354,11 +354,11 @@ fn (mut re RE) reset_src(){
 	}
 }

-/*
-
-Backslashes chars
-
-*/
+/******************************************************************************
+*
+* Backslashes chars
+*
+******************************************************************************/
 struct BslsStruct {
 	ch rune                   // meta char
 	validator FnValidator    // validator function pointer
@ -430,11 +430,11 @@ fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
 	return err_syntax_error, i
 }

-/*
-
-Char class
-
-*/
+/******************************************************************************
+*
+* Char class
+*
+******************************************************************************/
 const(
 	cc_null = 0    // empty cc token
 	cc_char = 1    // simple char: a
@ -653,11 +653,11 @@ fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
 	return err_syntax_error,0,u32(0)
 }

-/*
-
-Re Compiler
-
-*/
+/******************************************************************************
+*
+* Re Compiler
+*
+******************************************************************************/
 //
 // Quantifier
 //
@ -1462,11 +1462,11 @@ pub fn (re RE) get_query() string {
 	return res.str()
 }

-/*
-
-Groups saving utilities
-
-*/
+/******************************************************************************
+*
+* Groups saving utilities
+*
+******************************************************************************/
 [inline]
 fn (mut re RE) group_continuous_save(g_index int) {
 	if re.group_csave_flag == true {
@ -1501,11 +1501,11 @@ fn (mut re RE) group_continuous_save(g_index int) {
 	}
 }

-/*
-
-Matching
-
-*/
+/******************************************************************************
+*
+* Matching
+*
+******************************************************************************/					
 enum Match_state{
 	start = 0
 	stop
@ -2001,6 +2001,7 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 						last_dot_pc: state.pc
 					}
 					m_state = .ist_quant_n
+					//println("dot_char stack len: $state_list.len")
 					continue
 				}

@ -2363,47 +2364,11 @@ pub fn (mut re RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 	return no_match_found, 0
 }

-/*
-
-Public functions
-
-*/
-
-//
-// Inits
-//
-
-// regex create a regex object from the query string
-[deprecated]
-pub fn regex(in_query string) (RE,int,int){
-	mut re := RE{}
-	re.prog = []Token    {len: in_query.len+1}
-	re.cc   = []CharClass{len: in_query.len+1}
-	re.group_max_nested = 8
-
-	re_err,err_pos := re.compile(in_query)
-	return re, re_err, err_pos
-}
-
-// new_regex create a RE of small size, usually sufficient for ordinary use
-[deprecated]
-pub fn new_regex() RE {
-	return impl_new_regex_by_size(1)
-}
-
-// new_regex_by_size create a RE of large size, mult specify the scale factor of the memory that will be allocated
-[deprecated]
-pub fn new_regex_by_size(mult int) RE {
-	return impl_new_regex_by_size(mult)
-}
-fn impl_new_regex_by_size(mult int) RE {
-	mut re := RE{}
-	re.prog = []Token    {len: max_code_len*mult}       // max program length, default 256 istructions
-	re.cc   = []CharClass{len: max_code_len*mult}       // char class list
-	re.group_max_nested = 3*mult                        // max nested group
-
-	return re
-}
+/******************************************************************************
+*
+* Public functions
+*
+******************************************************************************/	

 //
 // Matchers
@ -2538,82 +2503,3 @@ pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
 	return res
 }

-/*
-
-Utilities
-
-*/
-
-// get_group_bounds_by_name get a group boundaries by its name
-pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) {
-	if group_name in re.group_map {
-		tmp_index := re.group_map[group_name]-1
-		start     := re.groups[tmp_index * 2]
-		end       := re.groups[tmp_index * 2 + 1]
-		return start,end
-	}
-	return -1, -1
-}
-
-// get_group_by_name get a group boundaries by its name
-pub fn (re RE) get_group_by_name(in_txt string, group_name string) string {
-	if group_name in re.group_map {
-		tmp_index := re.group_map[group_name]-1
-		start     := re.groups[tmp_index * 2]
-		end       := re.groups[tmp_index * 2 + 1]
-		return in_txt[start..end]
-	}
-	return ""
-}
-
-// get_group_by_id get a group string by its id
-pub fn (re RE) get_group_by_id(in_txt string, group_id int) string {
-	if group_id < (re.groups.len >> 1) {
-		index := group_id << 1
-		start := re.groups[index]
-		end   := re.groups[index + 1]
-		return in_txt[start..end]
-	}
-	return ""
-}
-
-// get_group_by_id get a group boundaries by its id
-pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int) {
-	if group_id < (re.groups.len >> 1) {
-		index := group_id << 1
-		return re.groups[index], re.groups[index]
-	}
-	return -1, -1
-}
-
-pub
-struct Re_group {
-pub:
-	start int = -1
-	end   int = -1
-}
-
-// get_group_list return a list of Re_group for the found groups
-pub fn (re RE) get_group_list() []Re_group {
-	mut res := []Re_group{len: re.groups.len >> 1}
-	mut gi := 0
-	//println("len: ${re.groups.len} groups: ${re.groups}")
-	for gi < re.groups.len {
-		if re.groups[gi] >= 0 {
-			txt_st := re.groups[gi]
-            txt_en := re.groups[gi+1]
-
-            //println("#${gi/2} start: ${re.groups[gi]} end: ${re.groups[gi + 1]} ")
-            if txt_st >= 0 && txt_en > txt_st {
-				tmp := Re_group{ start: re.groups[gi], end: re.groups[gi + 1]}
-				//println(tmp)
-				res[gi >> 1] = tmp
-			} else {
-				res[gi >> 1] = Re_group{}
-			}
-		}
-		gi += 2
-	}
-	return res
-}
-
--- a/vlib/regex/regex_opt.v
+++ b/vlib/regex/regex_opt.v
@ -17,18 +17,29 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
 }

 // new creates a RE of small size (impl_new_regex_by_size with a scale factor of 1), usually sufficient for ordinary use
+[deprecated]
 pub fn new() RE {
 	return impl_new_regex_by_size(1)
 }

 // new_by_size creates a RE of large size; mult specifies the scale factor of the memory that will be allocated
+[deprecated]
 pub fn new_by_size(mult int) RE {
 	return impl_new_regex_by_size(mult)
 }

 // regex_opt creates a new RE sized from the pattern string and compiles it; a compile_opt error is propagated to the caller
 pub fn regex_opt(pattern string) ?RE {
-	mut re := new()
-	re.compile_opt(pattern)?
-	return re
+	// init regex
+    mut re := regex.RE{}
+    re.prog = []Token    {len: pattern.len + 1} // max program length, can not be longer than the pattern
+    re.cc   = []CharClass{len: pattern.len}     // there can not be more char classes than the length of the pattern
+    re.group_csave_flag = false                 // continuous group saving is disabled (set to true to enable it)
+    re.group_max_nested = 128                   // allow at most 128 nested groups
+    re.group_max        = pattern.len >> 1      // we can't have more groups than half of the pattern length
+
+    // compile the pattern
+    re.compile_opt(pattern)?
+
+    return re
 }
--- a/vlib/regex/regex_util.v
+++ b/vlib/regex/regex_util.v
@ -0,0 +1,126 @@
+/*
+
+regex 1.0 alpha
+
+Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
+Use of this source code is governed by an MIT license
+that can be found in the LICENSE file.
+
+*/
+module regex
+
+/******************************************************************************
+*
+* Inits
+*
+******************************************************************************/
+// regex creates a regex object from the query string and compiles it.
+// It returns the RE together with the two values returned by compile
+// (named re_err and err_pos here).
+[deprecated]
+pub fn regex(in_query string) (RE,int,int){
+	mut re := RE{}
+	// program and char class lists are sized from the query length
+	re.prog = []Token    {len: in_query.len+1}
+	re.cc   = []CharClass{len: in_query.len+1}
+	re.group_max_nested = 8
+
+	re_err,err_pos := re.compile(in_query)
+	return re, re_err, err_pos
+}
+
+// new_regex creates a RE of small size (impl_new_regex_by_size with a scale factor of 1), usually sufficient for ordinary use
+[deprecated]
+pub fn new_regex() RE {
+	return impl_new_regex_by_size(1)
+}
+
+// new_regex_by_size creates a RE of large size; mult specifies the scale factor of the memory that will be allocated
+[deprecated]
+pub fn new_regex_by_size(mult int) RE {
+	return impl_new_regex_by_size(mult)
+}
+// impl_new_regex_by_size builds an uncompiled RE whose program and char class
+// lists hold max_code_len*mult entries and whose nesting limit is 3*mult
+fn impl_new_regex_by_size(mult int) RE {
+	mut re := RE{}
+	re.prog = []Token    {len: max_code_len*mult}       // max program length, default 256 instructions
+	re.cc   = []CharClass{len: max_code_len*mult}       // char class list
+	re.group_max_nested = 3*mult                        // max nested group
+
+	return re
+}
+
+/******************************************************************************
+*
+* Utilities
+*
+******************************************************************************/
+// get_group_bounds_by_name returns the (start, end) bounds of the group named group_name,
+// or (-1, -1) when no group with that name is present in the group map
+pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) {
+	if !(group_name in re.group_map) {
+		return -1, -1
+	}
+	// the map value minus one is the group index; each group owns two consecutive slots
+	base := (re.group_map[group_name] - 1) * 2
+	return re.groups[base], re.groups[base + 1]
+}
+
+// get_group_by_name returns the text captured by the group named group_name,
+// or an empty string when the name is unknown or the group holds no valid bounds
+pub fn (re RE) get_group_by_name(in_txt string, group_name string) string {
+	if group_name in re.group_map {
+		tmp_index := re.group_map[group_name]-1
+		start     := re.groups[tmp_index * 2]
+		end       := re.groups[tmp_index * 2 + 1]
+		// stored bounds may be negative for a group that did not capture
+		// (get_group_list guards the same way); slicing with them would panic
+		if start >= 0 && end > start {
+			return in_txt[start..end]
+		}
+	}
+	return ""
+}
+
+// get_group_by_id returns the text captured by the group with index group_id,
+// or an empty string when the id is out of range or the group holds no valid bounds
+pub fn (re RE) get_group_by_id(in_txt string, group_id int) string {
+	// a negative id would pass the upper-bound check and index before the array start
+	if group_id >= 0 && group_id < (re.groups.len >> 1) {
+		index := group_id << 1
+		start := re.groups[index]
+		end   := re.groups[index + 1]
+		// stored bounds may be negative for a group that did not capture
+		// (get_group_list guards the same way); slicing with them would panic
+		if start >= 0 && end > start {
+			return in_txt[start..end]
+		}
+	}
+	return ""
+}
+
+// get_group_bounds_by_id returns the (start, end) bounds of the group with index group_id,
+// or (-1, -1) when the id is out of range
+pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int) {
+	// a negative id would pass the upper-bound check and index before the array start
+	if group_id >= 0 && group_id < (re.groups.len >> 1) {
+		index := group_id << 1
+		return re.groups[index], re.groups[index + 1]
+	}
+	return -1, -1
+}
+
+// Re_group holds the start and end index of one group as returned by get_group_list;
+// both fields default to -1
+pub
+struct Re_group {
+pub:
+	start int = -1
+	end   int = -1
+}
+
+// get_group_list returns one Re_group per start/end pair stored in re.groups.
+// Pairs with a negative start, or with an end that is not after the start,
+// are reported as the default Re_group (start and end equal to -1).
+pub fn (re RE) get_group_list() []Re_group {
+	mut res := []Re_group{len: re.groups.len >> 1}
+	for gi := 0; gi < re.groups.len; gi += 2 {
+		txt_st := re.groups[gi]
+		if txt_st < 0 {
+			// slot keeps its default value
+			continue
+		}
+		txt_en := re.groups[gi + 1]
+		if txt_en > txt_st {
+			res[gi >> 1] = Re_group{ start: txt_st, end: txt_en }
+		} else {
+			res[gi >> 1] = Re_group{}
+		}
+	}
+	return res
+}
+