regex: named capturing groups, small fixes

2023-08-10 21:13:21 +03:00 · 2020-01-28 20:34:11 +01:00
parent 9ac0c54eb0
commit 5a2534122e
3 changed files with 387 additions and 31 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@ -55,7 +55,7 @@ A meta-char is specified by a backslash before a char like `\w` in this case the

 A meta-char can match different type of chars.

-* `\w` match an alphanumeric char `[a-zA-Z0-9]`
+* `\w` match an alphanumeric char `[a-zA-Z0-9_]`
 * `\W` match a non alphanumeric char
 * `\d` match a digit `[0-9]`
 * `\D` match a non digit
@ -244,6 +244,114 @@ cg id: 0 [15, 19] => [56, ]
 cg id: 0 [19, 21] => [78] 
 ```

+### Named capturing groups
+
+This regex module support partially the question mark `?` PCRE syntax for groups.
+
+`(?:abcd)` **non capturing group**:  the content of the group will not be saved
+
+`(?P<mygroup>abcdef)` **named group:** the group content is saved and labeled as `mygroup`
+
+The label of the groups is saved in the `group_map` of the `RE` struct, this is a map from `string` to `int` where the value is the index in `group_csave` list of index.
+
+Have a look at the example for the use of them.
+
+example:
+
+```v
+fn main() {
+	test_regex()
+
+	text := "http://www.ciao.mondo/hello/pippo12_/pera.html"
+	query:= r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+"
+
+	mut re := new_regex()
+	re.debug = 2
+
+	// must provide an array of the right size if want the continuos saving of the groups
+	re.group_csave = [-1].repeat(3*20+1)
+
+	re_err, err_pos := re.compile(query)
+	if re_err == COMPILE_OK {
+		q_str := re.get_query()
+		println("O.Query: $query")
+		println("Query  : $q_str")
+		
+		re.debug = 0	
+		start, end := re.match_string(text)
+		if start < 0 {
+			err_str := re.get_parse_error_string(start)
+			println("ERROR : $err_str, $start")
+		} else {
+			text1 := text[start..end]
+			println("found in [$start, $end] => [$text1]")
+		}
+
+		// groups
+		mut gi := 0
+		for gi < re.groups.len {
+			if re.groups[gi] >= 0 {
+				println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]")
+			}
+			gi += 2
+		}
+		// continuous saving
+		gi = 0
+		println("num of group item saved: ${re.group_csave[0]}")
+		for gi < re.group_csave[0] {
+			id := re.group_csave[1+gi*3]
+			st := re.group_csave[1+gi*3+1]
+			en := re.group_csave[1+gi*3+2]
+			println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]")
+			gi++
+		}
+		println("raw array: ${re.group_csave[0..gi*3+2-1]}")
+
+		// named capturing groups
+		println("named capturing groups:")
+		for g_name in re.group_map.keys() {
+			s,e := re.get_group(g_name)
+			if s >= 0 && e > s {
+				println("'${g_name}':[$s, $e] => '${text[s..e]}'")
+			} else {
+				println("Group [${g_name}] doesn't exist.")
+			}
+		}
+		
+	} else {
+		println("query: $query")
+		lc := "-".repeat(err_pos)
+		println("err  : $lc^")
+		err_str := re.get_parse_error_string(re_err)
+		println("ERROR: $err_str")	
+	}
+
+}
+```
+
+Output:
+
+```
+O.Query: (?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+
+Query  : #0(?P<format>https?)|{8,14}(?:ftps?)://#1(?P<token>[\w_]+.)+
+found in [0, 46] => [http://www.ciao.mondo/hello/pippo12_/pera.html]
+0 0,4 :[http]
+1 42,46 :[html]
+num of group item saved: 8
+cg id: 0 [0, 4] => [http]
+cg id: 1 [7, 11] => [www.]
+cg id: 1 [11, 16] => [ciao.]
+cg id: 1 [16, 22] => [mondo/]
+cg id: 1 [22, 28] => [hello/]
+cg id: 1 [28, 37] => [pippo12_/]
+cg id: 1 [37, 42] => [pera.]
+cg id: 1 [42, 46] => [html]
+raw array: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46] 
+named capturing groups:
+'format':[0, 4] => 'http'
+'token':[42, 46] => 'html'
+```
+
 ## Flags

 It is possible to set some flags in the regex parser that change the behavior of the parser itself.
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@ -1,6 +1,6 @@
 /**********************************************************************
 *
-* regex 0.9c
+* regex 0.9d
 *
 * Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
 * Use of this source code is governed by an MIT license
@ -9,8 +9,9 @@
 * This file contains regex module
 *
 * Know limitation:
-* - max 8 stacked groups
 * - find is implemented in a trivial way
+* - not full compliant PCRE
+* - not compliant POSIX ERE
 *
 *
 **********************************************************************/
@ -18,7 +19,7 @@ module regex
 import strings

 pub const(
-	V_REGEX_VERSION = "0.9c"      // regex module version
+	V_REGEX_VERSION = "0.9d"      // regex module version

 	MAX_CODE_LEN     = 256        // default small base code len for the regex programs
 	MAX_QUANTIFIER   = 1073741824 // default max repetitions allowed for the quantifiers = 2^30
@ -41,6 +42,7 @@ pub const(
 	ERR_GROUPS_OVERFLOW     = -7   // max number of groups reached
 	ERR_GROUPS_MAX_NESTED   = -8   // max number of nested group reached
 	ERR_GROUP_NOT_BALANCED  = -9   // group not balanced
+	ERR_GROUP_QM_NOTATION   = -10  // group invalid notation
 )

 const(
@ -133,6 +135,7 @@ fn is_alnum(in_char byte) bool {
 	if tmp >= 0x00 && tmp <= 25 { return true }
 	tmp = in_char - `0`
 	if tmp >= 0x00 && tmp <= 9  { return true }
+	if tmp == `_` { return true }
 	return false
 }

@ -193,9 +196,10 @@ pub fn (re RE) get_parse_error_string(err int) string {
 		ERR_INTERNAL_ERROR     { return "ERR_INTERNAL_ERROR" }
 		ERR_CC_ALLOC_OVERFLOW  { return "ERR_CC_ALLOC_OVERFLOW" }
 		ERR_SYNTAX_ERROR       { return "ERR_SYNTAX_ERROR" }
-		ERR_GROUPS_OVERFLOW    { return "ERR_GROUPS_OVERFLOW"}
-		ERR_GROUPS_MAX_NESTED  { return "ERR_GROUPS_MAX_NESTED"}
-		ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED"}
+		ERR_GROUPS_OVERFLOW    { return "ERR_GROUPS_OVERFLOW" }
+		ERR_GROUPS_MAX_NESTED  { return "ERR_GROUPS_MAX_NESTED" }
+		ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED" }
+		ERR_GROUP_QM_NOTATION  { return "ERR_GROUP_QM_NOTATION" }
 		else { return "ERR_UNKNOWN" }
 	}
 }
@ -272,6 +276,9 @@ pub const (

 	F_EFM = 0x00000100  // exit on first token matched, used by search
 	F_BIN = 0x00000200  // work only on bytes, ignore utf-8
+
+	// behaviour modifier flags
+	//F_OR  = 0x00010000  // the OR work with concatenation like PCRE
 )

 struct StateDotObj{
@ -305,6 +312,8 @@ pub mut:
 	group_csave []int    = []int  // groups continuous save array
 	group_csave_index int= -1     // groups continuous save index

+	group_map map[string]int      // groups names map
+
 	// flags
 	flag int             = 0   // flag for optional parameters

@ -336,6 +345,16 @@ fn (re mut RE) reset(){
 	}
 }

+pub fn (re RE) get_group(group_name string) (int, int) {
+	if group_name in re.group_map {
+		tmp_index := re.group_map[group_name]-1
+		start := re.groups[tmp_index*2]
+		end := re.groups[tmp_index*2+1]
+		return start,end
+	}
+	return -1, -1
+}
+
 /******************************************************************************
 *
 * Backslashes chars
@ -631,7 +650,7 @@ enum Quant_parse_state {
 	finish
 }

-// parse_quantifier return (min, max, str_len) of a {min,max}? quantifier starting after the { char
+// parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
 fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 	mut status := Quant_parse_state.start
 	mut i := in_i
@ -748,6 +767,104 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
 	return ERR_SYNTAX_ERROR, i, 0, false
 }

+//
+// Groups
+//
+enum Group_parse_state {
+	start,
+	q_mark,      // (?
+	q_mark1,     // (?:|P  checking
+	p_status,    // (?P
+	p_start,     // (?P<
+	p_end,       // (?P<...>
+	p_in_name,   // (?P<...	
+	finish
+}
+
+// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index)
+fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
+	mut status := Group_parse_state.start
+	mut i := in_i
+	mut name := ''
+
+	for i < in_txt.len && status != .finish {
+
+		// get our char
+		char_tmp,char_len := re.get_char(in_txt,i)
+		ch := byte(char_tmp)
+
+		// start
+		if status == .start && ch == `(` {
+			status = .q_mark
+			i += char_len
+			continue
+		}
+
+		// check for question marks
+		if status == .q_mark && ch == `?` {
+			status = .q_mark1
+			i += char_len
+			continue
+		}
+
+		// non capturing group
+		if status == .q_mark1 && ch == `:` {
+			i += char_len
+			return 0, false, name, i
+		}
+
+		// enter in P section
+		if status == .q_mark1 && ch == `P` {
+			status = .p_status
+			i += char_len
+			continue
+		}
+
+		// not a valid q mark found
+		if status == .q_mark1 {
+			//println("NO VALID Q MARK")
+			return -2 , true, name, i
+		}
+
+		if status == .p_status && ch == `<` {
+			status = .p_start
+			i += char_len
+			continue
+		}
+
+		if status == .p_start && ch != `>` {
+			status = .p_in_name
+			name += "${ch:1c}" // TODO: manage utf8 chars
+			i += char_len
+			continue
+		}
+
+		// colect name
+		if status == .p_in_name && ch != `>` && is_alnum(ch) {
+			name += "${ch:1c}" // TODO: manage utf8 chars
+			i += char_len
+			continue
+		}
+
+		// end name
+		if status == .p_in_name && ch == `>` {
+			i += char_len
+			return 0, true, name, i
+		}
+
+		// error on name group
+		if status == .p_in_name {
+			return -2 , true, name, i
+		}
+
+		// normal group, nothig to do, exit
+		return  0 , true, name, i
+	}
+	/* UNREACHABLE */
+	//println("ERROR!! NOT MEANT TO BE HERE!!1")
+	return -2 , true, name, i
+}
+
 //
 // main compiler
 //
@ -795,7 +912,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
 			if group_count > re.group_max {
 				return ERR_GROUPS_OVERFLOW,i+1
 			}
-			
 			group_stack_index++

 			// check max nested groups allowed
@ -803,17 +919,50 @@ pub fn (re mut RE) compile(in_txt string) (int,int) {
 				return ERR_GROUPS_MAX_NESTED,i+1
 			}

+			tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i)
+			
+			// manage question mark format error
+			if tmp_res < -1 {
+				return ERR_GROUP_QM_NOTATION,next_i
+			}
+
+			//println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
+			i = next_i
+
+			if cgroup_flag == true {
 				group_count++
+			}
+
+			// calculate the group id
+			// if it is a named group, recycle the group id
+			// NOTE: **** the group index is +1 because map return 0 when not found!! ****
+			mut group_id := group_count
+			if cgroup_name.len > 0 {
+				//println("GROUP NAME: ${cgroup_name}")
+				if cgroup_name in re.group_map{
+					group_id = re.group_map[cgroup_name]-1
+					group_count--
+				} else {
+					re.group_map[cgroup_name] = group_id+1
+				}
+			}

 			group_stack_txt_index[group_stack_index] = i
 			group_stack[group_stack_index] = pc

 			re.prog[pc].ist = u32(0) | IST_GROUP_START
-			re.prog[pc].group_id = group_count
 			re.prog[pc].rep_min = 1
 			re.prog[pc].rep_max = 1
+			
+			// set the group id
+			if cgroup_flag == false {
+				//println("NO CAPTURE GROUP")
+				re.prog[pc].group_id = -1 
+			} else {
+				re.prog[pc].group_id = group_id
+			}
+
 			pc = pc + 1
-			i = i + char_len
 			continue

 		}
@ -1099,6 +1248,16 @@ pub fn (re RE) get_code() string {
 				res.write(".        DOT_CHAR")
 			} else if ist == IST_GROUP_START {
 				res.write("(        GROUP_START #:${tk.group_id}")
+				if tk.group_id == -1 {
+					res.write(" ?:")
+				} else {
+					for x in re.group_map.keys() {
+						if re.group_map[x] == (tk.group_id+1) {
+							res.write(" ?P<${x}>")
+							break
+						}
+					}
+				}
 			} else if ist == IST_GROUP_END {
 				res.write(")        GROUP_END   #:${tk.group_id}")
 			} else if ist == IST_SIMPLE_CHAR {
@ -1145,9 +1304,21 @@ pub fn (re RE) get_query() string {
 		if ch == IST_GROUP_START {
 			if re.debug == 0 {
 				res.write("(")
+			} else {
+				if tk.group_id == -1 {
+					res.write("(?:")   // non capturing group
 				} else {
 					res.write("#${tk.group_id}(")
 				}
+			}
+			
+			for x in re.group_map.keys() {
+				if re.group_map[x] == (tk.group_id+1) {
+					res.write("?P<${x}>")
+					break
+				}
+			}
+
 			i++
 			continue
 		}
@ -1400,7 +1571,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 						re.prog[tmp_pc].group_rep
 					)
 					*/
-					if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min{
+					if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
 						start_i   := group_stack[group_index]
 	 					group_stack[group_index]=-1

@ -1420,7 +1591,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 								// incrment counter
 								re.group_csave[0]++
 								// save the record  
-								re.group_csave[re.group_csave_index++] = g_index               // group id
+								re.group_csave[re.group_csave_index++] = g_index >> 1          // group id
 								re.group_csave[re.group_csave_index++] = re.groups[g_index]    // start
 								re.group_csave[re.group_csave_index++] = re.groups[g_index+1]  // end
 							}
@ -1545,7 +1716,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 					// restore txt index stack and save the group data
 					
 					//C.printf("g.id: %d group_index: %d\n", re.prog[pc].group_id, group_index)
-					if group_index >= 0 {
+					if group_index >= 0 && re.prog[pc].group_id >= 0 {
 	 					start_i   := group_stack[group_index]
 	 					//group_stack[group_index]=-1

@ -1566,7 +1737,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 								// incrment counter
 								re.group_csave[0]++
 								// save the record  
-								re.group_csave[re.group_csave_index++] = g_index               // group id
+								re.group_csave[re.group_csave_index++] = g_index >> 1          // group id
 								re.group_csave[re.group_csave_index++] = re.groups[g_index]    // start
 								re.group_csave[re.group_csave_index++] = re.groups[g_index+1]  // end
 							}
@ -1709,7 +1880,7 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
 				if re.prog[pc].ch == ch
 				{
 					state.match_flag = true
-					l_ist = u32(IST_SIMPLE_CHAR)
+					l_ist = IST_SIMPLE_CHAR
 					
 					if first_match < 0 {
 						first_match = i
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@ -72,6 +72,7 @@ match_test_suite = [
 	TestItem{" pippo pera",r"\s(.*)pe(.*)",0,11},
 	TestItem{" abb",r"\s(.*)",0,4},

+
 	// negative
 	TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0},
 	TestItem{"this is a good.",r"thes",-1,0},
@ -81,7 +82,6 @@ match_test_suite = [
 	TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0},
 	TestItem{"cpapaz ole. pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0},
 	
-
 	// check unicode
 	TestItem{"this is a Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r".*a [Ⅰ-Ⅵ ]+",0,34},
 	TestItem{"123Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r"[Ⅰ-Ⅴ\s]+",3,23},
@ -94,11 +94,10 @@ struct TestItemFa {
 	r []int
 }

+
 const (
 match_test_suite_fa = [
-
 	// find_all tests
-
 	TestItemFa{
 		"oggi pippo è andato a casa di pluto ed ha trovato pippo",
 		r"p[iplut]+o",
@ -119,12 +118,9 @@ struct TestItemRe {
 	rep string
 	r string
 }
-
 const (
 match_test_suite_re = [
-
 	// replace tests
-
 	TestItemRe{
 		"oggi pibao è andato a casa di pbababao ed ha trovato pibabababao",
 		r"(pi?(ba)+o)",
@ -140,7 +136,88 @@ match_test_suite_re = [
 ]
 )

+struct TestItemCGroup {
+	src string
+	q string
+	s int = 0
+	e int = 0
+	cg []int
+	cgn map[string]int
+}
+const (
+cgroups_test_suite = [
+	TestItemCGroup{
+		"http://www.ciao.mondo/hello/pippo12_/pera.html",
+		r"(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+",0,46,
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		{'format':0,'token':1}
+	},
+	TestItemCGroup{
+		"http://www.ciao.mondo/hello/pippo12_/pera.html",
+		r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		{'format':0,'token':1}
+	},
+	TestItemCGroup{
+		"http://www.ciao.mondo/hello/pippo12_/pera.html",
+		r"(?P<format>https?)|(?P<format>ftps?)://([\w_]+.)+",0,46,
+		[8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46],
+		{'format':0}
+	},
+]
+)
+
 fn test_regex(){
+	// check capturing groups
+	for c,to in cgroups_test_suite {
+		// debug print
+		//println("#$c [$to.src] q[$to.q] ($to.s, $to.e)")
+
+		mut re, re_err, err_pos := regex.regex(to.q)
+		re.group_csave = [-1].repeat(3*20+1)
+
+		if re_err == regex.COMPILE_OK {
+			start, end := re.match_string(to.src)
+
+			mut tmp_str := ""
+			if start >= 0 && end  > start{
+				tmp_str = to.src[start..end]
+			}
+			
+			if start != to.s || end != to.e {
+				println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")	
+				println("ERROR!")
+				//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
+				assert false
+				break
+			}
+
+			// check cgroups
+			if re.group_csave.len == 0 || re.group_csave[0] != to.cg[0] {
+				println("Capturing group len error!")
+				assert false
+			}
+
+			// check captured groups
+			mut ln := re.group_csave[0]*3
+			for ln > 0 {
+				if re.group_csave[ln] != to.cg[ln] {
+					assert false
+				}
+				ln--
+			}
+
+			// check named captured groups
+			for k in to.cgn.keys() {
+				if to.cgn[k] != (re.group_map[k]-1) { // we have -1 because the map not found is 0, in groups we start from 0 and we store using +1
+					println("Named capturing group error! [$k]")
+					assert false
+				}
+			}
+
+		}
+	}
+
 	// check find_all
 	for c,to in match_test_suite_fa{
 		// debug print