From 15a63b5bcb8a1a46171bc7346e0a5279e094e95f Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Sat, 25 Jan 2020 19:12:23 +0100 Subject: [PATCH] regex fixes --- vlib/regex/README.md | 85 +++++++++++++++++++++++++++++ vlib/regex/regex.v | 126 +++++++++++++++++++++++++------------------ 2 files changed, 159 insertions(+), 52 deletions(-) diff --git a/vlib/regex/README.md b/vlib/regex/README.md index 8fc2dbf29e..7d09db7a48 100644 --- a/vlib/regex/README.md +++ b/vlib/regex/README.md @@ -159,6 +159,91 @@ for gi < re.groups.len { **note:** *to show the `group id number` in the result of the `get_query()` the flag `debug` of the RE object must be `1` or `2`* +### Groups Continuous saving + +In particular situations it is useful to have a continuous save of the groups; this is possible by initializing the saving array field in the `RE` struct: `group_csave`. + +This feature allows collecting data in a continuous way. + +In the example we pass a text followed by an integer list that we want to collect. +To achieve this task we can use the continuous saving of the group, which saves each captured group in an array that we set with: `re.group_csave = [-1].repeat(3*20+1)`. + +The array will be filled with the following logic: + +`re.group_csave[0]` number of total saved records + +`re.group_csave[1+n*3]` id of the saved group +`re.group_csave[1+n*3+1]` start index in the source string of the saved group +`re.group_csave[1+n*3+2]` end index in the source string of the saved group + +The regex saves records until the match finishes or the array has no more space. If the space runs out no error is raised; further records will not be saved. 
+ +```v +fn example2() { + test_regex() + + text := "tst: 01,23,45 ,56, 78" + query:= r".*:(\s*\d+[\s,]*)+" + + mut re := regex.new_regex() + //re.debug = 2 + re.group_csave = [-1].repeat(3*20+1) // we expect max 20 records + + re_err, err_pos := re.compile(query) + if re_err == regex.COMPILE_OK { + q_str := re.get_query() + println("Query: $q_str") + + start, end := re.match_string(text) + if start < 0 { + println("ERROR : ${re.get_parse_error_string(start)}, $start") + } else { + println("found in [$start, $end] => [${text[start..end]}]") + } + + // groups capture + mut gi := 0 + for gi < re.groups.len { + if re.groups[gi] >= 0 { + println("${gi/2} ${re.groups[gi]},${re.groups[gi+1]} :[${text[re.groups[gi]..re.groups[gi+1]]}]") + } + gi += 2 + } + + // continuous saving + gi = 0 + println("num: ${re.group_csave[0]}") + for gi < re.group_csave[0] { + id := re.group_csave[1+gi*3] + st := re.group_csave[1+gi*3+1] + en := re.group_csave[1+gi*3+2] + println("cg id: ${id} [${st}, ${en}] => [${text[st..en]}]") + gi++ + } + } else { + println("query: $query") + lc := "-".repeat(err_pos) + println("err : $lc^") + err_str := re.get_parse_error_string(re_err) + println("ERROR: $err_str") + } +} +``` + +The output will be: + +``` +Query: .*:(\s*\d+[\s,]*)+ +found in [0, 21] => [tst: 01,23,45 ,56, 78] +0 19,21 :[78] +num: 5 +cg id: 0 [4, 8] => [ 01,] +cg id: 0 [8, 11] => [23,] +cg id: 0 [11, 15] => [45 ,] +cg id: 0 [15, 19] => [56, ] +cg id: 0 [19, 21] => [78] +``` + ## Flags It is possible to set some flags in the regex parser that change the behavior of the parser itself. diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index ef53842c51..e40f1a4e27 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -266,23 +266,20 @@ fn (tok mut Token) reset() { * ******************************************************************************/ pub const ( - //F_FND = 0x00000001 // check until the end of the input string, it act like a "find first match", not efficient!! 
- //F_PM = 0x00000004 // partial match: if the source text finish and the match is positive until then return true + F_NL = 0x00000001 // end the match when find a new line symbol + F_MS = 0x00000002 // match true only if the match is at the start of the string + F_ME = 0x00000004 // match true only if the match is at the end of the string - F_NL = 0x00000002 // end the match when find a new line symbol - F_MS = 0x00000008 // match true only if the match is at the start of the string - F_ME = 0x00000010 // match true only if the match is at the end of the string - - F_EFM = 0x01000000 // exit on first token matched, used by search - F_BIN = 0x02000000 // work only on bytes, ignore utf-8 + F_EFM = 0x00000100 // exit on first token matched, used by search + F_BIN = 0x00000200 // work only on bytes, ignore utf-8 ) struct StateDotObj{ mut: i int = -1 // char index in the input buffer - pc int = -1 // program counter saved - mi int = -1 // match_index saved - group_stack_index int = -1 // group index stack pointer saved + pc int = -1 // program counter saved + mi int = -1 // match_index saved + group_stack_index int = -1 // continuous save on capturing groups } pub @@ -305,6 +302,9 @@ pub mut: group_max_nested int = 3 // max nested group group_max int = 8 // max allowed number of different groups + group_csave []int = []int // groups continuous save array + group_csave_index int= -1 // groups continuous save index + // flags flag int = 0 // flag for optional parameters @@ -328,6 +328,12 @@ fn (re mut RE) reset(){ re.groups = [-1].repeat(re.group_count*2) re.state_stack_index = -1 + + // reset group_csave + if re.group_csave.len > 0 { + re.group_csave_index = 1 + re.group_csave[0] = 0 // reset the capture count + } } /****************************************************************************** @@ -734,8 +740,6 @@ fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) { } } - - // not a {} quantifier, exit return ERR_SYNTAX_ERROR, i, 0, false } @@ 
-997,7 +1001,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) { // Post processing //****************************************** - // count IST_DOT_CHAR to set the size of the state stack mut pc1 := 0 mut tmp_count := 0 @@ -1054,7 +1057,6 @@ pub fn (re mut RE) compile(in_txt string) (int,int) { pc1++ } - //****************************************** // DEBUG PRINT REGEX GENERATED CODE //****************************************** @@ -1075,14 +1077,15 @@ pub fn (re RE) get_code() string { mut stop_flag := false for pc1 <= re.prog.len { + tk := re.prog[pc1] res.write("PC:${pc1:3d}") res.write(" ist: ") - res.write("${re.prog[pc1].ist:8x}".replace(" ","0") ) + res.write("${tk.ist:8x}".replace(" ","0") ) res.write(" ") - ist :=re.prog[pc1].ist + ist :=tk.ist if ist == IST_BSLS_CHAR { - res.write("[\\${re.prog[pc1].ch:1c}] BSLS") + res.write("[\\${tk.ch:1c}] BSLS") } else if ist == IST_PROG_END { res.write("PROG_END") stop_flag = true @@ -1095,22 +1098,22 @@ pub fn (re RE) get_code() string { } else if ist == IST_DOT_CHAR { res.write(". 
DOT_CHAR") } else if ist == IST_GROUP_START { - res.write("( GROUP_START #:${re.prog[pc1].group_id}") + res.write("( GROUP_START #:${tk.group_id}") } else if ist == IST_GROUP_END { - res.write(") GROUP_END #:${re.prog[pc1].group_id}") + res.write(") GROUP_END #:${tk.group_id}") } else if ist == IST_SIMPLE_CHAR { - res.write("[${re.prog[pc1].ch:1c}] query_ch") + res.write("[${tk.ch:1c}] query_ch") } - if re.prog[pc1].rep_max == MAX_QUANTIFIER { - res.write(" {${re.prog[pc1].rep_min:3d},MAX}") + if tk.rep_max == MAX_QUANTIFIER { + res.write(" {${tk.rep_min:3d},MAX}") }else{ if ist == IST_OR_BRANCH { - res.write(" if false go: ${re.prog[pc1].rep_min:3d} if true go: ${re.prog[pc1].rep_max:3d}") + res.write(" if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}") } else { - res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}") + res.write(" {${tk.rep_min:3d},${tk.rep_max:3d}}") } - if re.prog[pc1].greedy == true { + if tk.greedy == true { res.write("?") } } @@ -1123,11 +1126,9 @@ pub fn (re RE) get_code() string { res.write("========================================\n") return res.str() - } // get_query return a string with a reconstruction of the query starting from the regex program code - pub fn (re RE) get_query() string { mut res := strings.new_builder(re.query.len*2) @@ -1137,14 +1138,15 @@ pub fn (re RE) get_query() string { mut i := 0 for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{ - ch := re.prog[i].ist + tk := &re.prog[i] + ch := tk.ist // GROUP start if ch == IST_GROUP_START { if re.debug == 0 { res.write("(") } else { - res.write("#${re.prog[i].group_id}(") + res.write("#${tk.group_id}(") } i++ continue @@ -1159,7 +1161,7 @@ pub fn (re RE) get_query() string { if ch == IST_OR_BRANCH { res.write("|") if re.debug > 0 { - res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}") + res.write("{${tk.rep_min},${tk.rep_max}}") } i++ continue @@ -1177,7 +1179,7 @@ pub fn (re RE) get_query() string { // bsls char if 
ch == IST_BSLS_CHAR { - res.write("\\${re.prog[i].ch:1c}") + res.write("\\${tk.ch:1c}") } // IST_DOT_CHAR @@ -1190,29 +1192,28 @@ pub fn (re RE) get_query() string { if byte(ch) in BSLS_ESCAPE_LIST { res.write("\\") } - res.write("${re.prog[i].ch:c}") + res.write("${tk.ch:c}") } // quantifier - if !(re.prog[i].rep_min == 1 && re.prog[i].rep_max == 1) { - if re.prog[i].rep_min == 0 && re.prog[i].rep_max == 1 { + if !(tk.rep_min == 1 && tk.rep_max == 1) { + if tk.rep_min == 0 && tk.rep_max == 1 { res.write("?") - } else if re.prog[i].rep_min == 1 && re.prog[i].rep_max == MAX_QUANTIFIER { + } else if tk.rep_min == 1 && tk.rep_max == MAX_QUANTIFIER { res.write("+") - } else if re.prog[i].rep_min == 0 && re.prog[i].rep_max == MAX_QUANTIFIER { + } else if tk.rep_min == 0 && tk.rep_max == MAX_QUANTIFIER { res.write("*") } else { - if re.prog[i].rep_max == MAX_QUANTIFIER { - res.write("{${re.prog[i].rep_min},MAX}") + if tk.rep_max == MAX_QUANTIFIER { + res.write("{${tk.rep_min},MAX}") } else { - res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}") + res.write("{${tk.rep_min},${tk.rep_max}}") } - if re.prog[i].greedy == true { + if tk.greedy == true { res.write("?") } } } - i++ } if (re.flag & F_ME) != 0 { @@ -1411,6 +1412,20 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { re.groups[g_index] = 0 } re.groups[g_index+1] = i + + // continuous save, save until we have space + if re.group_csave_index > 0 { + // check if we have space to save the record + if (re.group_csave_index + 3) < re.group_csave.len { + // incrment counter + re.group_csave[0]++ + // save the record + re.group_csave[re.group_csave_index++] = g_index // group id + re.group_csave[re.group_csave_index++] = re.groups[g_index] // start + re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end + } + } + } group_index-- @@ -1543,6 +1558,19 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { } re.groups[g_index+1] = i //C.printf("GROUP %d END 
[%d, %d]\n", re.prog[pc].group_id, re.groups[g_index], re.groups[g_index+1]) + + // continuous save, save until we have space + if re.group_csave_index > 0 { + // check if we have space to save the record + if (re.group_csave_index + 3) < re.group_csave.len { + // incrment counter + re.group_csave[0]++ + // save the record + re.group_csave[re.group_csave_index++] = g_index // group id + re.group_csave[re.group_csave_index++] = re.groups[g_index] // start + re.group_csave[re.group_csave_index++] = re.groups[g_index+1] // end + } + } } re.prog[pc].group_rep++ // increase repetitions @@ -1796,8 +1824,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { if rep < re.prog[tmp_pc].rep_min { //C.printf("ist_quant_pg UNDER RANGE\n") pc = re.prog[tmp_pc].goto_pc - //group_index-- - m_state = .ist_next continue } @@ -1841,12 +1867,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { m_state = .ist_next // go to next ist continue } - - // match failed - else if rep == 0 && re.prog[pc].rep_min > 0 { - //C.printf("ist_quant_n NO MATCH\n") - // dummy - } // match + or * else if rep >= re.prog[pc].rep_min { //C.printf("ist_quant_n MATCH RANGE\n") @@ -1902,7 +1922,6 @@ pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) { m_state = .ist_next continue } - m_state = .ist_load continue } @@ -1981,6 +2000,9 @@ pub fn (re mut RE) match_string(in_txt string) (int,int) { return NO_MATCH_FOUND, 0 } if (re.flag & F_ME) != 0 && end < in_txt.len { + if in_txt[end] in NEW_LINE_LIST { + return start, end + } return NO_MATCH_FOUND, 0 } return start, end @@ -2002,7 +2024,7 @@ pub fn (re mut RE) find(in_txt string) (int,int) { for i < in_txt.len { // test only the first part of the query string - re.flag &= F_EFM // set to exit on the first token match + re.flag |= F_EFM // set to exit on the first token match mut tmp_end := i+re.query.len if tmp_end > in_txt.len { tmp_end = in_txt.len } tmp_txt := string{ str: 
in_txt.str+i, len: tmp_end-i }