/**********************************************************************
*
* regex 0.9c
*
* Copyright (c) 2019 Dario Deledda. All rights reserved.
* Use of this source code is governed by an MIT license
* that can be found in the LICENSE file.
*
* This file contains regex module
*
* Known limitations:
* - max 8 stacked groups
* - find is implemented in a trivial way
*
*
**********************************************************************/
module regex
import strings

pub const(
	V_REGEX_VERSION = "0.9c"      // regex module version

	MAX_CODE_LEN    = 256         // default small base code len for the regex programs
	MAX_QUANTIFIER  = 1073741824  // default max repetitions allowed for the quantifiers = 2^30

	// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
	SPACES = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
	// new line chars: line feed and carriage return
	NEW_LINE_LIST = [`\n`,`\r`]

	// Results
	NO_MATCH_FOUND          = -1

	// Errors
	COMPILE_OK              =  0   // the regex string compiled, all ok
	ERR_CHAR_UNKNOWN        = -2   // the char used is unknown to the system
	ERR_UNDEFINED           = -3   // the compiler symbol is undefined
	ERR_INTERNAL_ERROR      = -4   // Bug in the regex system!!
	ERR_CC_ALLOC_OVERFLOW   = -5   // memory for char class full!!
	ERR_SYNTAX_ERROR        = -6   // syntax error in regex compiling
	ERR_GROUPS_OVERFLOW     = -7   // max number of groups reached
	ERR_GROUPS_MAX_NESTED   = -8   // max number of nested group reached
	ERR_GROUP_NOT_BALANCED  = -9   // group not balanced
)

const(
	//*************************************
	// regex program instructions
	//*************************************
	IST_SIMPLE_CHAR      = u32(0x7FFFFFFF)   // single char instruction, 31 bit available to char

	// char class 11 0100 AA xxxxxxxx
	// AA = 00 regular class
	// AA = 01 Negated class ^ char
	IST_CHAR_CLASS       = 0xD1000000   // MASK
	IST_CHAR_CLASS_POS   = 0xD0000000   // char class normal [abc]
	IST_CHAR_CLASS_NEG   = 0xD1000000   // char class negate [^abc]

	// dot char 10 0110 xx xxxxxxxx
	IST_DOT_CHAR         = 0x98000000   // match any char except \n

	// backslash chars 10 0100 xx xxxxxxxx
	IST_BSLS_CHAR        = 0x90000000   // backslash char

	// OR | 10 010Y xx xxxxxxxx
	IST_OR_BRANCH        = 0x91000000   // OR case

	// groups 10 010Y xx xxxxxxxx
	IST_GROUP_START      = 0x92000000   // group start (
	IST_GROUP_END        = 0x94000000   // group end )

	// control instructions
	IST_PROG_END         = u32(0x88000000)   //10 0010 xx xxxxxxxx
	//*************************************
)

/******************************************************************************
*
* General Utilities
*
******************************************************************************/
// utf8util_char_len calculate the length in bytes of a utf8 char
// from its first byte `b` (result is in the range 1..4)
[inline]
fn utf8util_char_len(b byte) int {
	return (( 0xe5000000 >> (( b >> 3 ) & 0x1e )) & 3 ) + 1
}

// get_char get a char from position i of a string.
// Returns (ch, len): for a single byte char (or always, when F_BIN is set)
// ch is the byte itself and len is 1; otherwise ch holds the raw utf8 bytes
// of the sequence packed most-significant-first and len is the sequence length.
[inline]
fn (re RE) get_char(in_txt string, i int) (u32,int) {
	// ascii 8 bit
	if (re.flag & F_BIN) !=0 || in_txt.str[i] & 0x80 == 0 {
		return u32(in_txt.str[i]), 1
	}
	// multi byte char: pack the bytes of the sequence
	char_len := utf8util_char_len(in_txt.str[i])
	mut tmp := 0
	mut ch := u32(0)
	for tmp < char_len {
		ch = (ch << 8) | in_txt.str[i+tmp]
		tmp++
	}
	return ch,char_len
}

// get_charb same as get_char but it reads from a raw byte pointer.
// No length is available here, the caller must guarantee that the
// whole sequence starting at i is readable.
[inline]
fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
	// ascii 8 bit
	if (re.flag & F_BIN) !=0 || in_txt[i] & 0x80 == 0 {
		return u32(in_txt[i]), 1
	}
	// multi byte char: pack the bytes of the sequence
	char_len := utf8util_char_len(in_txt[i])
	mut tmp := 0
	mut ch := u32(0)
	for tmp < char_len {
		ch = (ch << 8) | in_txt[i+tmp]
		tmp++
	}
	return ch,char_len
}

// is_alnum return true for the ASCII letters and digits
[inline]
fn is_alnum(in_char byte) bool {
	if in_char >= `A` && in_char <= `Z` {
		return true
	}
	if in_char >= `a` && in_char <= `z` {
		return true
	}
	return in_char >= `0` && in_char <= `9`
}

[inline]
fn is_not_alnum(in_char byte) bool {
	return !is_alnum(in_char)
}

// is_space return true if the char is one of the SPACES list
[inline]
fn is_space(in_char byte) bool {
	return in_char in SPACES
}

[inline]
fn is_not_space(in_char byte) bool {
	return !is_space(in_char)
}

// is_digit return true for the ASCII digits 0..9
[inline]
fn is_digit(in_char byte) bool {
	return in_char >= `0` && in_char <= `9`
}

[inline]
fn is_not_digit(in_char byte) bool {
	return !is_digit(in_char)
}

// is_wordchar return true for ASCII letters, digits and the underscore
[inline]
fn is_wordchar(in_char byte) bool {
	return is_alnum(in_char) || in_char == `_`
}

// is_not_wordchar is the exact negation of is_wordchar,
// the underscore is a word char and must not be reported here
[inline]
fn is_not_wordchar(in_char byte) bool {
	return !is_wordchar(in_char)
}

// is_lower return true for the ASCII lower case letters
[inline]
fn is_lower(in_char byte) bool {
	return in_char >= `a` && in_char <= `z`
}

// is_upper return true for the ASCII upper case letters
[inline]
fn is_upper(in_char byte) bool {
	return in_char >= `A` && in_char <= `Z`
}

// get_parse_error_string return the symbolic name of a result/error code,
// "ERR_UNKNOWN" is returned for every value that is not a known code
pub fn (re RE) get_parse_error_string(err int) string {
	match err {
		COMPILE_OK             { return "COMPILE_OK" }
		NO_MATCH_FOUND         { return "NO_MATCH_FOUND" }
		ERR_CHAR_UNKNOWN       { return "ERR_CHAR_UNKNOWN" }
		ERR_UNDEFINED          { return "ERR_UNDEFINED" }
		ERR_INTERNAL_ERROR     { return "ERR_INTERNAL_ERROR" }
		ERR_CC_ALLOC_OVERFLOW  { return "ERR_CC_ALLOC_OVERFLOW" }
		ERR_SYNTAX_ERROR       { return "ERR_SYNTAX_ERROR" }
		ERR_GROUPS_OVERFLOW    { return "ERR_GROUPS_OVERFLOW"}
		ERR_GROUPS_MAX_NESTED  { return "ERR_GROUPS_MAX_NESTED"}
		ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED"}
		else { return "ERR_UNKNOWN" }
	}
}

// utf8_str convert a packed utf8 sequence to a printable string
[inline]
fn utf8_str(ch u32) string {
	// walk the four bytes of ch from the most significant one down,
	// the zero bytes are padding and are not emitted
	mut out := ""
	mut shift := 24
	for shift >= 0 {
		b := byte((ch >> shift) & 0xFF)
		if b != 0 {
			out += "${b:1c}"
		}
		shift -= 8
	}
	return out
}

// simple_log is the default log function: it prints the text as it is
fn simple_log(txt string) {
	print(txt)
}

/******************************************************************************
*
* Token Structs
*
******************************************************************************/
// Token is one instruction of the compiled regex program
struct Token{
mut:
	ist u32 = u32(0)                 // instruction code (one of the IST_* values)

	// char
	ch u32 = u32(0)                  // char of the token if any
	ch_len byte = byte(0)            // char len

	// Quantifiers / branch
	rep_min int = 0                  // min repetitions, in an OR branch it is the pc to jump to on [no match]
	rep_max int = 0                  // max repetitions, in an OR branch it is the pc to jump to on [ match]
	greedy bool = false              // greedy quantifier flag

	// Char class
	cc_index int = -1                // index of the first CharClass item of this token

	// counters for quantifier check (repetitions)
	rep int = 0

	// validator function pointer and control char
	validator fn (byte) bool

	// groups variables
	group_rep int = 0                // repetition of the group
	group_id int = -1                // id of the group
	goto_pc int = -1                 // jump to this PC if is needed

	// OR flag for the token
	next_is_or bool = false          // true if the next token is an OR
}

// reset clear the repetition counter of the token
fn (tok mut Token) reset() {
	tok.rep = 0
}

/******************************************************************************
*
* Regex struct
*
******************************************************************************/
pub const (
	//F_FND = 0x00000001 // check until the end of the input string, it act like a "find first match", not efficient!!
//F_PM = 0x00000004 // partial match: if the source text finish and the match is positive until then return true
	F_NL  = 0x00000002  // end the match when find a new line symbol
	F_MS  = 0x00000008  // match true only if the match is at the start of the string
	F_ME  = 0x00000010  // match true only if the match is at the end of the string
	F_EFM = 0x01000000  // exit on first token matched, used by search
	F_BIN = 0x02000000  // work only on bytes, ignore utf-8
)

// StateDotObj is the matcher state saved when a dot char instruction is met
struct StateDotObj{
mut:
	i int = 0                    // char index in the input buffer
	pc int = 0                   // program counter saved
	mi int = 0                   // match_index saved
	group_stack_index int = -1   // group index stack pointer saved
}

// RE is the regex object: compiled program, char classes, groups and flags
pub struct RE {
pub mut:
	prog []Token

	// char classes storage
	cc []CharClass               // char class list
	cc_index int = 0             // index

	// state index
	state_stack_index int= -1
	state_stack []StateDotObj

	// groups
	group_count int = 0          // number of groups in this regex struct
	groups []int                 // groups index results
	group_max_nested int = 3     // max nested group
	group_max int = 8            // max allowed number of different groups

	// flags
	flag int = 0                 // flag for optional parameters

	// Debug/log
	debug int = 0                // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
	log_func fn (string) = simple_log // log function, can be customized by the user
	query string = ""            // query string
}

// Reset RE object: clear the repetition counters of every token,
// the groups results and the state stack. The compiled program is kept.
fn (re mut RE) reset(){
	//re.group_count = 0
	re.cc_index = 0

	mut i := 0
	for i < re.prog.len {
		re.prog[i].group_rep = 0 // clear repetition of the group
		re.prog[i].rep = 0       // clear repetition of the token
		i++
	}
	// two slots for each group, -1 means not set
	re.groups = [-1].repeat(re.group_count*2)

	re.state_stack_index = -1
}

/******************************************************************************
*
* Backslashes chars
*
******************************************************************************/
struct BslsStruct {
	ch u32                       // meta char
	validator fn (byte) bool     // validator function pointer
}

const(
	BSLS_VALIDATOR_ARRAY = [
		BslsStruct{`w`, is_alnum},
		BslsStruct{`W`, is_not_alnum},
		BslsStruct{`s`, is_space},
		BslsStruct{`S`, is_not_space},
		BslsStruct{`d`, is_digit},
		BslsStruct{`D`, is_not_digit},
		BslsStruct{`a`, is_lower},
		BslsStruct{`A`, is_upper},
	]

	// these chars are escape if preceded by a \
	BSLS_ESCAPE_LIST = [ `\\`,`|`,`.`,`*`,`+`,`{`,`}`,`[`,`]` ]
)

enum BSLS_parse_state {
	start,
	bsls_found,
	bsls_char,
	normal_char
}

// parse_bsls parse the backslash sequence that starts at in_i.
// Returns (index, str_len):
// - index >= 0 is the position in BSLS_VALIDATOR_ARRAY of the meta char found
// - NO_MATCH_FOUND means that the char after the \ is one of BSLS_ESCAPE_LIST
// - ERR_SYNTAX_ERROR for everything else
fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
	mut status := BSLS_parse_state.start
	mut i := in_i

	for i < in_txt.len {
		// get our char
		char_tmp,char_len := re.get_char(in_txt,i)
		ch := byte(char_tmp)

		if status == .start && ch == `\\` {
			status = .bsls_found
			i += char_len
			continue
		}

		// check if is our bsls char, for now only one length sequence
		if status == .bsls_found {
			for c,x in BSLS_VALIDATOR_ARRAY {
				if x.ch == ch {
					return c,i-in_i+1
				}
			}
			// not a meta char, look at it again as an escaped char
			status = .normal_char
			continue
		}

		// no BSLS validator, manage as normal escape char
		if status == .normal_char {
			if ch in BSLS_ESCAPE_LIST {
				return NO_MATCH_FOUND,i-in_i+1
			}
			return ERR_SYNTAX_ERROR,i-in_i+1
		}

		// at the present time we manage only one char after the \
		break
	}
	// not our bsls return KO
	return ERR_SYNTAX_ERROR, i
}

/******************************************************************************
*
* Char class
*
******************************************************************************/
const(
	CC_NULL = 0    // empty cc token
	CC_CHAR = 1    // simple char: a
	CC_INT  = 2    // char interval: a-z
	CC_BSLS = 3    // backslash char
	CC_END  = 4    // cc sequence terminator
)

struct CharClass {
mut:
	cc_type int = CC_NULL        // type of cc token
	ch0 u32 = u32(0)             // first char of the interval a-b a in this case
	ch1 u32 = u32(0)             // second char of the interval a-b b in this case
	validator fn (byte) bool     // validator function pointer
}

enum CharClass_parse_state {
	start,
	in_char,
	in_bsls,
	separator,
	finish,
}

// get_char_class return the text of the char class used by the token at pc,
// without the enclosing [ ] and without the ^ of the negated classes.
// The text is built with a string builder: every item can need up to 9 bytes
// (4 + `-` + 4), more than a buffer sized with the number of items can hold.
fn (re RE) get_char_class(pc int) string {
	mut res := strings.new_builder(re.cc.len * 9 + 1)
	mut cc_i := re.prog[pc].cc_index

	for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != CC_END {
		if re.cc[cc_i].cc_type == CC_BSLS {
			// meta char like \w
			res.write("\\")
			res.write(utf8_str(re.cc[cc_i].ch0))
		}
		else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 {
			// single char
			res.write(utf8_str(re.cc[cc_i].ch0))
		}
		else {
			// interval a-b
			res.write(utf8_str(re.cc[cc_i].ch0))
			res.write("-")
			res.write(utf8_str(re.cc[cc_i].ch1))
		}
		cc_i++
	}
	return res.str()
}

// check_char_class return true if ch is accepted by at least one item
// of the char class used by the token at pc
fn (re RE) check_char_class(pc int, ch u32) bool {
	mut cc_i := re.prog[pc].cc_index
	for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != CC_END {
		if re.cc[cc_i].cc_type == CC_BSLS {
			// the validators work on a single byte
			if re.cc[cc_i].validator(byte(ch)) {
				return true
			}
		}
		else if ch >= re.cc[cc_i].ch0 && ch <= re.cc[cc_i].ch1 {
			return true
		}
		cc_i++
	}
	return false
}

// parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char
// index is the position of the first item stored in re.cc, a negative index is an error code:
// ERR_CC_ALLOC_OVERFLOW if re.cc is full, ERR_SYNTAX_ERROR if the class is not closed
fn (re mut RE) parse_char_class(in_txt string, in_i int) (int, int, u32) {
	mut status := CharClass_parse_state.start
	mut i := in_i

	mut tmp_index := re.cc_index
	res_index := re.cc_index

	mut cc_type := u32(IST_CHAR_CLASS_POS)

	for i < in_txt.len {

		// check if we are out of memory for char classes
		if tmp_index >= re.cc.len {
			return ERR_CC_ALLOC_OVERFLOW,0,u32(0)
		}

		// get our char
		char_tmp,char_len := re.get_char(in_txt,i)
		ch := byte(char_tmp)

		// negation
		if status == .start && ch == `^` {
			cc_type = u32(IST_CHAR_CLASS_NEG)
			i += char_len
			continue
		}

		// bsls
		if (status == .start || status == .in_char) && ch == `\\` {
			status = .in_bsls
			i += char_len
			continue
		}

		if status == .in_bsls {
			for c,x in BSLS_VALIDATOR_ARRAY {
				if x.ch == ch {
					re.cc[tmp_index].cc_type   = CC_BSLS
					re.cc[tmp_index].ch0       = BSLS_VALIDATOR_ARRAY[c].ch
					re.cc[tmp_index].ch1       = BSLS_VALIDATOR_ARRAY[c].ch
					re.cc[tmp_index].validator = BSLS_VALIDATOR_ARRAY[c].validator
					i += char_len
					tmp_index++
					status = .in_char
					break
				}
			}
			if status == .in_bsls {
				// not a meta char: fall through, it is managed as a plain char
				status = .in_char
			}else {
				continue
			}
		}

		// simple char
		if (status == .start || status == .in_char) && ch != `-` && ch != `]` {
			status = .in_char

			re.cc[tmp_index].cc_type = CC_CHAR
			re.cc[tmp_index].ch0     = char_tmp
			re.cc[tmp_index].ch1     = char_tmp

			i += char_len
			tmp_index++
			continue
		}

		// check range separator
		if status == .in_char && ch == `-` {
			status = .separator
			i += char_len
			continue
		}

		// check range end: the previous item becomes an interval
		if status == .separator && ch != `]` && ch != `-` {
			status = .in_char
			re.cc[tmp_index-1].cc_type = CC_INT
			re.cc[tmp_index-1].ch1     = char_tmp
			i += char_len
			continue
		}

		// char class end
		if status == .in_char && ch == `]` {
			re.cc[tmp_index].cc_type = CC_END
			re.cc[tmp_index].ch0     = 0
			re.cc[tmp_index].ch1     = 0
			re.cc_index = tmp_index+1

			return res_index, i-in_i+2, cc_type
		}

		i++
	}
	return ERR_SYNTAX_ERROR,0,u32(0)
}

/******************************************************************************
*
* Re Compiler
*
******************************************************************************/
//
// Quantifier
//
enum Quant_parse_state {
	start,
	min_parse,
	comma_checked,
	max_parse,
	greedy,
	gredy_parse,
	finish
}

// parse_quantifier return (min, max, str_len, greedy) of a {min,max}?
// quantifier starting after the { char
// - min, max: the parsed limits, {n} gives min == max, {n,} gives max == MAX_QUANTIFIER,
//   {,n} gives min == 1
// - str_len: how many chars the caller has to skip starting from the { char
// - greedy flag: true if the quantifier is followed by a ?
// On error the first value is ERR_SYNTAX_ERROR (negative).
fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
	mut status := Quant_parse_state.start
	mut i := in_i

	mut q_min := 0 // starts from 0, the digits are accumulated here
	mut q_max := 0 // starts from 0, set to MAX_QUANTIFIER when the max is omitted

	mut ch := byte(0)

	for i < in_txt.len {
		ch = in_txt.str[i]

		// exit on no compatible char with {} quantifier
		if utf8util_char_len(ch) != 1 {
			return ERR_SYNTAX_ERROR,i,0,false
		}

		// min parsing skip if comma present
		if status == .start && ch == `,` {
			q_min = 1 // default min in a {} quantifier is 1
			status = .comma_checked
			i++
			continue
		}

		// digits of the min value
		if (status == .start || status == .min_parse) && is_digit( ch ) {
			status = .min_parse
			q_min *= 10
			q_min += int(ch - `0`)
			i++
			continue
		}

		// we have parsed the min, now check the max
		if status == .min_parse && ch == `,` {
			status = .comma_checked
			i++
			continue
		}

		// single value {4}
		if status == .min_parse && ch == `}` {
			q_max = q_min
			status = .greedy
			continue
		}

		// end without max
		if status == .comma_checked && ch == `}` {
			q_max = MAX_QUANTIFIER
			status = .greedy
			continue
		}

		// digits of the max value
		if (status == .comma_checked || status == .max_parse) && is_digit( ch ) {
			status = .max_parse
			q_max *= 10
			q_max += int(ch - `0`)
			i++
			continue
		}

		// finished the quantifier
		if status == .max_parse && ch == `}` {
			status = .greedy
			continue
		}

		// here i is on the closing }: only peek at the next byte for the ? flag.
		// The next char is not part of the quantifier unless it is a ?, so it must
		// not pass through the single byte check at the top of the loop
		// (a multi byte char after the } is legal).
		if status == .greedy {
			if i+1 < in_txt.len && in_txt.str[i+1] == `?` {
				return q_min, q_max, i-in_i+3, true
			}
			return q_min, q_max, i-in_i+2, false
		}

		// not a {} quantifier, exit
		return ERR_SYNTAX_ERROR, i, 0, false
	}

	// not a conform {} quantifier
	return ERR_SYNTAX_ERROR, i, 0, false
}

//
// main compiler
//
// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
// The program is written in re.prog, the char classes in re.cc: both must be already allocated.
// NOTE(review): pc is never checked against re.prog.len here, a query longer
// than the allocated program is not detected - confirm how the caller sizes re.prog.
pub fn (re mut RE) compile(in_txt string) (int,int) {
	mut i := 0    // input string index
	mut pc := 0   // program counter

	// group management variables
	mut group_count := -1
	mut group_stack := [0 ].repeat(re.group_max_nested)
	mut group_stack_txt_index := [-1].repeat(re.group_max_nested)
	mut group_stack_index := -1

	re.query = in_txt // save the query string

	i = 0
	for i < in_txt.len {
		mut char_tmp := u32(0)
		mut char_len := 0

		char_tmp,char_len = re.get_char(in_txt,i)

		//
		// check special cases: $ ^
		// the anchors are added to the flags, the other flags must be kept
		//
		if char_len == 1 && i == 0 && byte(char_tmp) == `^` {
			re.flag = re.flag | F_MS
			i = i + char_len
			continue
		}
		if char_len == 1 && i == (in_txt.len-1) && byte(char_tmp) == `$` {
			re.flag = re.flag | F_ME
			i = i + char_len
			continue
		}

		// IST_GROUP_START
		if char_len == 1 && pc >= 0 && byte(char_tmp) == `(` {

			//check max groups allowed
			if group_count > re.group_max {
				return ERR_GROUPS_OVERFLOW,i+1
			}
			group_stack_index++

			// check max nested groups allowed,
			// the stacks have exactly group_max_nested slots
			if group_stack_index >= re.group_max_nested {
				return ERR_GROUPS_MAX_NESTED,i+1
			}

			group_count++

			group_stack_txt_index[group_stack_index] = i
			group_stack[group_stack_index] = pc

			re.prog[pc].ist = u32(0) | IST_GROUP_START
			re.prog[pc].group_id = group_count
			re.prog[pc].rep_min = 1
			re.prog[pc].rep_max = 1
			pc = pc + 1
			i = i + char_len
			continue
		}

		// IST_GROUP_END
		if char_len==1 && pc > 0 && byte(char_tmp) == `)` {
			if group_stack_index < 0 {
				return ERR_GROUP_NOT_BALANCED,i+1
			}

			goto_pc := group_stack[group_stack_index]
			group_stack_index--

			re.prog[pc].ist = u32(0) | IST_GROUP_END
			re.prog[pc].rep_min = 1
			re.prog[pc].rep_max = 1

			re.prog[pc].goto_pc = goto_pc                     // PC where to jump if a group need
			re.prog[pc].group_id = re.prog[goto_pc].group_id  // id of this group, used for storing data

			re.prog[goto_pc].goto_pc = pc                     // start goto point to the end group pc

			pc = pc + 1
			i = i + char_len
			continue
		}

		// IST_DOT_CHAR match any char except the following token
		if char_len==1 && pc >= 0 && byte(char_tmp) == `.` {
			re.prog[pc].ist = u32(0) | IST_DOT_CHAR
			re.prog[pc].rep_min = 1
			re.prog[pc].rep_max = 1
			pc = pc + 1
			i = i + char_len
			continue
		}

		// OR branch
		if char_len==1 && pc > 0 && byte(char_tmp) == `|` {
			// two consecutive OR are an error
			if pc > 0 && re.prog[pc-1].ist == IST_OR_BRANCH {
				return ERR_SYNTAX_ERROR,i
			}
			re.prog[pc].ist = u32(0) | IST_OR_BRANCH
			pc = pc + 1
			i = i + char_len
			continue
		}

		// Quantifiers, they modify the previous token
		if char_len==1 && pc > 0{
			mut quant_flag := true
			match byte(char_tmp) {
				`?` {
					re.prog[pc-1].rep_min = 0
					re.prog[pc-1].rep_max = 1
				}

				`+` {
					re.prog[pc-1].rep_min = 1
					re.prog[pc-1].rep_max = MAX_QUANTIFIER
				}

				`*` {
					re.prog[pc-1].rep_min = 0
					re.prog[pc-1].rep_max = MAX_QUANTIFIER
				}

				`{` {
					min, max, tmp, greedy := re.parse_quantifier(in_txt, i+1)
					// it is a quantifier
					if min >= 0 {
						i = i + tmp
						re.prog[pc-1].rep_min = min
						re.prog[pc-1].rep_max = max
						re.prog[pc-1].greedy = greedy
						continue
					}
					else {
						return min,i
					}
					// TODO: decide if the open bracket can be conform without the close bracket
				}

				else{
					quant_flag = false
				}
			}

			if quant_flag {
				i = i + char_len
				continue
			}
		}

		// IST_CHAR_CLASS_*
		if char_len==1 && pc >= 0{
			if byte(char_tmp) == `[` {
				cc_index,tmp,cc_type := re.parse_char_class(in_txt, i+1)
				if cc_index >= 0 {
					i = i + tmp
					re.prog[pc].ist = u32(0) | cc_type
					re.prog[pc].cc_index = cc_index
					re.prog[pc].rep_min = 1
					re.prog[pc].rep_max = 1
					pc = pc + 1
					continue
				}
				// a negative index is the error code of the char class parser
				else if cc_index < 0 {
					return cc_index, i
				}
			}
		}

		// IST_BSLS_CHAR
		if char_len==1 && pc >= 0{
			if byte(char_tmp) == `\\` {
				bsls_index,tmp := re.parse_bsls(in_txt,i)
				if bsls_index >= 0 {
					i = i + tmp
					re.prog[pc].ist = u32(0) | IST_BSLS_CHAR
					re.prog[pc].rep_min = 1
					re.prog[pc].rep_max = 1
					re.prog[pc].validator = BSLS_VALIDATOR_ARRAY[bsls_index].validator
					re.prog[pc].ch = BSLS_VALIDATOR_ARRAY[bsls_index].ch
					pc = pc + 1
					continue
				}
				// this is an escape char, skip the bsls and continue as a normal char
				else if bsls_index == NO_MATCH_FOUND {
					i += char_len
					char_tmp,char_len = re.get_char(in_txt,i)
					// continue as simple char
				}
				// if not an escape or a bsls char then it is an error (at least for now!)
				else {
					return bsls_index,i+tmp
				}
			}
		}

		// IST_SIMPLE_CHAR
		re.prog[pc].ist = IST_SIMPLE_CHAR
		re.prog[pc].ch = char_tmp
		re.prog[pc].ch_len = char_len
		re.prog[pc].rep_min = 1
		re.prog[pc].rep_max = 1
		pc = pc +1
		i+=char_len
	}

	// add end of the program
	re.prog[pc].ist = IST_PROG_END

	// check for unbalanced groups
	if group_stack_index != -1 {
		return ERR_GROUP_NOT_BALANCED, group_stack_txt_index[group_stack_index]+1
	}

	// check for OR at the end of the program
	if pc > 0 && re.prog[pc-1].ist == IST_OR_BRANCH {
		return ERR_SYNTAX_ERROR,in_txt.len
	}

	// store the number of groups in the query
	re.group_count = group_count+1

	//******************************************
	// Post processing
	//******************************************

	// count IST_DOT_CHAR to set the size of the state stack
	mut pc1 := 0
	mut tmp_count := 0
	for pc1 < pc {
		if re.prog[pc1].ist == IST_DOT_CHAR {
			tmp_count++
		}
		pc1++
	}

	// init the state stack
	re.state_stack = [StateDotObj{}].repeat(tmp_count+1)

	// OR branch
	// a|b|cd
	// d exit point
	// a,b,c branches
	// set the jump in the right places
	pc1 = 0
	for pc1 < pc-2 {
		// two consecutive OR are a syntax error
		if re.prog[pc1+1].ist == IST_OR_BRANCH && re.prog[pc1+2].ist == IST_OR_BRANCH {
			return ERR_SYNTAX_ERROR, i
		}

		// manage a|b chains like a|(b)|c|d...
		// standard solution
		if re.prog[pc1].ist != IST_OR_BRANCH &&
			re.prog[pc1+1].ist == IST_OR_BRANCH &&
			re.prog[pc1+2].ist != IST_OR_BRANCH
		{
			re.prog[pc1].next_is_or = true   // set that the next token is an OR
			re.prog[pc1+1].rep_min = pc1+2   // failed match jump

			// match jump, if an OR chain the next token will be an OR token
			mut pc2 := pc1+2
			for pc2 < pc-1 {
				ist := re.prog[pc2].ist
				if ist == IST_GROUP_START {
					// jump after the whole group
					re.prog[pc1+1].rep_max = re.prog[pc2].goto_pc + 1
					break
				}
				if ist != IST_OR_BRANCH {
					re.prog[pc1+1].rep_max = pc2 + 1
					break
				}
				pc2++
			}
			//C.printf("Compile OR postproc. 
// [%d,OR %d,%d]\n",pc1,pc1+1,pc2)
			pc1 = pc2
			continue
		}

		pc1++
	}

	//******************************************
	// DEBUG PRINT REGEX GENERATED CODE
	//******************************************
	if re.debug > 0 {
		re.log_func(re.get_code())
	}
	//******************************************

	return COMPILE_OK, 0
}

// get_code return the compiled code as regex string, note: may be different from the source!
// One line is written for each instruction, the listing stops after PROG_END
// or at the end of the program buffer, whichever comes first.
pub fn (re RE) get_code() string {
	mut pc1 := 0
	mut res := strings.new_builder(re.cc.len*2*re.prog.len)
	res.write("========================================\nv RegEx compiler v $V_REGEX_VERSION output:\n")

	mut stop_flag := false

	// pc1 must stay inside the program: re.prog[re.prog.len] does not exist
	for pc1 < re.prog.len {
		res.write("PC:${pc1:3d}")

		res.write(" ist: ")
		res.write("${re.prog[pc1].ist:8x}".replace(" ","0") )
		res.write(" ")
		ist :=re.prog[pc1].ist
		if ist == IST_BSLS_CHAR {
			res.write("[\\${re.prog[pc1].ch:1c}] BSLS")
		} else if ist == IST_PROG_END {
			res.write("PROG_END")
			stop_flag = true
		} else if ist == IST_OR_BRANCH {
			res.write("OR ")
		} else if ist == IST_CHAR_CLASS_POS {
			res.write("[${re.get_char_class(pc1)}] CHAR_CLASS_POS")
		} else if ist == IST_CHAR_CLASS_NEG {
			res.write("[^${re.get_char_class(pc1)}] CHAR_CLASS_NEG")
		} else if ist == IST_DOT_CHAR {
			res.write(". DOT_CHAR")
		} else if ist == IST_GROUP_START {
			res.write("( GROUP_START #:${re.prog[pc1].group_id}")
		} else if ist == IST_GROUP_END {
			res.write(") GROUP_END #:${re.prog[pc1].group_id}")
		} else if ist == IST_SIMPLE_CHAR {
			res.write("[${re.prog[pc1].ch:1c}] query_ch")
		}

		if re.prog[pc1].rep_max == MAX_QUANTIFIER {
			res.write(" {${re.prog[pc1].rep_min:3d},MAX}")
		}else{
			if ist == IST_OR_BRANCH {
				// in the OR token rep_min/rep_max are the jump targets
				res.write(" if false go: ${re.prog[pc1].rep_min:3d} if true go: ${re.prog[pc1].rep_max:3d}")
			} else {
				res.write(" {${re.prog[pc1].rep_min:3d},${re.prog[pc1].rep_max:3d}}")
			}
			if re.prog[pc1].greedy == true {
				res.write("?")
			}
		}
		res.write("\n")
		if stop_flag {
			break
		}
		pc1++
	}

	res.write("========================================\n")
	return res.str()
}

// get_query return a string with a reconstruction of the query starting from the regex program code
// When re.debug is greater than 0 the group ids and the OR jumps are written too.
pub fn (re RE) get_query() string {
	mut res := strings.new_builder(re.query.len*2)

	if (re.flag & F_MS) != 0 {
		res.write("^")
	}

	mut i := 0
	for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{
		ch := re.prog[i].ist

		// GROUP start
		if ch == IST_GROUP_START {
			if re.debug == 0 {
				res.write("(")
			} else {
				res.write("#${re.prog[i].group_id}(")
			}
			i++
			continue
		}

		// GROUP end
		if ch == IST_GROUP_END {
			res.write(")")
		}

		// OR branch
		if ch == IST_OR_BRANCH {
			res.write("|")
			if re.debug > 0 {
				res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
			}
			i++
			continue
		}

		// char class
		if ch == IST_CHAR_CLASS_NEG || ch == IST_CHAR_CLASS_POS {
			res.write("[")
			if ch == IST_CHAR_CLASS_NEG {
				res.write("^")
			}
			res.write("${re.get_char_class(i)}")
			res.write("]")
		}

		// bsls char
		if ch == IST_BSLS_CHAR {
			res.write("\\${re.prog[i].ch:1c}")
		}

		// IST_DOT_CHAR
		if ch == IST_DOT_CHAR {
			res.write(".")
		}

		// char alone
		if ch == IST_SIMPLE_CHAR {
			// the escape test is on the char of the token: ch here is the
			// instruction code, the same for every simple char
			if byte(re.prog[i].ch) in BSLS_ESCAPE_LIST {
				res.write("\\")
			}
			res.write("${re.prog[i].ch:c}")
		}

		// quantifier
		if !(re.prog[i].rep_min == 1 && re.prog[i].rep_max == 1) {
			if re.prog[i].rep_min == 0 && re.prog[i].rep_max == 1 {
				res.write("?")
			} else if re.prog[i].rep_min == 1 && re.prog[i].rep_max == MAX_QUANTIFIER {
				res.write("+")
			} else if re.prog[i].rep_min == 0 && re.prog[i].rep_max == MAX_QUANTIFIER {
				res.write("*")
			} else {
				if re.prog[i].rep_max == MAX_QUANTIFIER {
					res.write("{${re.prog[i].rep_min},MAX}")
				} else {
					res.write("{${re.prog[i].rep_min},${re.prog[i].rep_max}}")
				}
				if re.prog[i].greedy == true {
					res.write("?")
				}
			}
		}

		i++
	}
	if (re.flag & F_ME) != 0 {
		res.write("$")
	}

	return res.str()
}

/******************************************************************************
*
* Matching
*
******************************************************************************/
enum match_state{
	start = 0,
	stop,
	end,
	new_line,

	ist_load,     // load and execute instruction
	ist_next,     // go to next instruction
	ist_next_ks,  // go to next instruction without cleaning the state
	ist_quant_p,  // match positive, quantifier check
	ist_quant_n,  // match negative, quantifier check
	ist_quant_pg, // match positive, group quantifier check
	ist_quant_ng, // match negative, group quantifier check
}

// state_str return the printable name of a matcher state, used by the debug log
fn state_str(s match_state) string {
	match s{
		.start        { return "start" }
		.stop         { return "stop" }
		.end          { return "end" }
		.new_line     { return "new line" }

		.ist_load     { return "ist_load" }
		.ist_next     { return "ist_next" }
		.ist_next_ks  { return "ist_next_ks" }
		.ist_quant_p  { return "ist_quant_p" }
		.ist_quant_n  { return "ist_quant_n" }
		.ist_quant_pg { return "ist_quant_pg" }
		.ist_quant_ng { return "ist_quant_ng" }
		else { return "UNKN" }
	}
}

struct StateObj {
pub mut:
	match_flag bool = false
	match_index int = -1
	match_first int = -1
}

pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
	// result status
	mut result := NO_MATCH_FOUND     // function return
	mut first_match := -1            //index of the first match

	mut i := 0                       // source string index
	mut ch := u32(0)                 // examinated char
	mut char_len := 0                // utf8 examinated char len
	mut m_state := match_state.start // start point for the matcher FSM

	mut pc := -1                     // program counter
mut state := StateObj{} // actual state mut ist := u32(0) // Program Counter mut group_stack := [-1].repeat(re.group_max) mut group_data := [-1].repeat(re.group_max) mut group_index := -1 // group id used to know how many groups are open mut step_count := 0 // stats for debug mut dbg_line := 0 // count debug line printed re.reset() if re.debug>0 { // print header mut h_buf := strings.new_builder(32) h_buf.write("flags: ") h_buf.write("${re.flag:8x}".replace(" ","0")) h_buf.write("\n") re.log_func(h_buf.str()) } for m_state != .end { if pc >= 0 && pc < re.prog.len { ist = re.prog[pc].ist }else if pc >= re.prog.len { //C.printf("ERROR!! PC overflow!!\n") return ERR_INTERNAL_ERROR, i } //****************************************** // DEBUG LOG //****************************************** if re.debug>0 { mut buf2 := strings.new_builder(re.cc.len+128) // print all the instructions // end of the input text if i >= in_txt_len { buf2.write("# ${step_count:3d} END OF INPUT TEXT\n") re.log_func(buf2.str()) }else{ // print only the exe instruction if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 { if ist == IST_PROG_END { buf2.write("# ${step_count:3d} PROG_END\n") } else if ist == 0 || m_state in [.start,.ist_next,.stop] { buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n") }else{ ch, char_len = re.get_charb(in_txt,i) buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${pc:3d}=>") buf2.write("${ist:8x}".replace(" ","0")) buf2.write(" i,ch,len:[${i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${first_match:3d},${state.match_index:3d}] ") if ist == IST_SIMPLE_CHAR { buf2.write("query_ch: [${re.prog[pc].ch:1c}]") } else { if ist == IST_BSLS_CHAR { buf2.write("BSLS [\\${re.prog[pc].ch:1c}]") } else if ist == IST_PROG_END { buf2.write("PROG_END") } else if ist == IST_OR_BRANCH { buf2.write("OR") } else if ist == IST_CHAR_CLASS_POS { buf2.write("CHAR_CLASS_POS[${re.get_char_class(pc)}]") } else if ist == IST_CHAR_CLASS_NEG { 
buf2.write("CHAR_CLASS_NEG[${re.get_char_class(pc)}]") } else if ist == IST_DOT_CHAR { buf2.write("DOT_CHAR") } else if ist == IST_GROUP_START { tmp_gi :=re.prog[pc].group_id tmp_gr := re.prog[re.prog[pc].goto_pc].group_rep buf2.write("GROUP_START #:${tmp_gi} rep:${tmp_gr} ") } else if ist == IST_GROUP_END { buf2.write("GROUP_END #:${re.prog[pc].group_id} deep:${group_index} ") } } if re.prog[pc].rep_max == MAX_QUANTIFIER { buf2.write("{${re.prog[pc].rep_min},MAX}:${re.prog[pc].rep}") } else { buf2.write("{${re.prog[pc].rep_min},${re.prog[pc].rep_max}}:${re.prog[pc].rep}") } if re.prog[pc].greedy == true { buf2.write("?") } buf2.write(" (#${group_index})\n") } re.log_func(buf2.str()) } } step_count++ dbg_line++ } //****************************************** // we're out of text, manage it if i >= in_txt_len || m_state == .new_line { // manage groups if group_index >= 0 && state.match_index >= 0 { //C.printf("End text with open groups!\n") // close the groups for group_index >= 0 { tmp_pc := group_data[group_index] re.prog[tmp_pc].group_rep++ /* C.printf("Closing group %d {%d,%d}:%d\n", group_index, re.prog[tmp_pc].rep_min, re.prog[tmp_pc].rep_max, re.prog[tmp_pc].group_rep ) */ if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min{ start_i := group_stack[group_index] group_stack[group_index]=-1 // save group results g_index := re.prog[tmp_pc].group_id*2 if start_i >= 0 { re.groups[g_index] = start_i } else { re.groups[g_index] = 0 } re.groups[g_index+1] = i } group_index-- } } // manage IST_DOT_CHAR if re.state_stack_index >= 0 { //C.printf("DOT CHAR text end management!\n") // if DOT CHAR is not the last instruction and we are still going, then no match!! 
if pc < re.prog.len && re.prog[pc+1].ist != IST_PROG_END { return NO_MATCH_FOUND,0 } } m_state == .end break return NO_MATCH_FOUND,0 } // starting and init if m_state == .start { pc = -1 i = 0 m_state = .ist_next continue } // ist_next, next instruction reseting its state if m_state == .ist_next { pc = pc + 1 re.prog[pc].reset() // check if we are in the program bounds if pc < 0 || pc > re.prog.len { //C.printf("ERROR!! PC overflow!!\n") return ERR_INTERNAL_ERROR, i } m_state = .ist_load continue } // ist_next_ks, next instruction keeping its state if m_state == .ist_next_ks { pc = pc + 1 // check if we are in the program bounds if pc < 0 || pc > re.prog.len { //C.printf("ERROR!! PC overflow!!\n") return ERR_INTERNAL_ERROR, i } m_state = .ist_load continue } // load the char ch, char_len = re.get_charb(in_txt,i) // check new line if flag F_NL enabled if (re.flag & F_NL) != 0 && char_len == 1 && byte(ch) in NEW_LINE_LIST { m_state = .new_line continue } // check if stop if m_state == .stop { // if we are in restore state ,do it and restart if re.state_stack_index >= 0 { i = re.state_stack[re.state_stack_index].i pc = re.state_stack[re.state_stack_index].pc state.match_index = re.state_stack[re.state_stack_index].mi group_index = re.state_stack[re.state_stack_index].group_stack_index m_state = .ist_load continue } if ist == IST_PROG_END { return first_match,i } // exit on no match return result,0 } // ist_load if m_state == .ist_load { // program end if ist == IST_PROG_END { // if we are in match exit well if group_index >= 0 && state.match_index >= 0 { group_index = -1 } m_state = .stop continue } // check GROUP start, no quantifier is checkd for this token!! 
else if ist == IST_GROUP_START { group_index++ group_data[group_index] = re.prog[pc].goto_pc // save where is IST_GROUP_END, we will use it for escape group_stack[group_index]=i // index where we start to manage //C.printf("group_index %d rep %d\n", group_index, re.prog[re.prog[pc].goto_pc].group_rep) m_state = .ist_next continue } // check GROUP end else if ist == IST_GROUP_END { // we are in matching streak if state.match_index >= 0 { // restore txt index stack and save the group data //C.printf("g.id: %d group_index: %d\n", re.prog[pc].group_id, group_index) if group_index >= 0 { start_i := group_stack[group_index] group_stack[group_index]=-1 // save group results g_index := re.prog[pc].group_id*2 if start_i >= 0 { re.groups[g_index] = start_i } else { re.groups[g_index] = 0 } re.groups[g_index+1] = i } re.prog[pc].group_rep++ // increase repetitions //C.printf("GROUP %d END %d\n", group_index, re.prog[pc].group_rep) m_state = .ist_quant_pg continue } m_state = .ist_quant_ng continue } // check OR else if ist == IST_OR_BRANCH { if state.match_index >= 0 { pc = re.prog[pc].rep_max //C.printf("IST_OR_BRANCH True pc: %d\n", pc) }else{ pc = re.prog[pc].rep_min //C.printf("IST_OR_BRANCH False pc: %d\n", pc) } re.prog[pc].reset() m_state == .ist_load continue } // check IST_DOT_CHAR else if ist == IST_DOT_CHAR { //C.printf("IST_DOT_CHAR rep: %d\n", re.prog[pc].rep) state.match_flag = true if first_match < 0 { first_match = i } state.match_index = i re.prog[pc].rep++ if re.prog[pc].rep == 1 { // save the state re.state_stack_index++ re.state_stack[re.state_stack_index].pc = pc re.state_stack[re.state_stack_index].mi = state.match_index re.state_stack[re.state_stack_index].group_stack_index = group_index } if re.prog[pc].rep >= 1 && re.state_stack_index >= 0 { re.state_stack[re.state_stack_index].i = i + char_len } // manage * and {0,} quantifier if re.prog[pc].rep_min > 0 { i += char_len // next char } if re.prog[pc+1].ist != IST_GROUP_END { m_state = .ist_next 
continue } // IST_DOT_CHAR is the last instruction, get all else { //C.printf("We are the last one!\n") pc-- m_state = .ist_next_ks continue } } // char class IST else if ist == IST_CHAR_CLASS_POS || ist == IST_CHAR_CLASS_NEG { state.match_flag = false mut cc_neg := false if ist == IST_CHAR_CLASS_NEG { cc_neg = true } mut cc_res := re.check_char_class(pc,ch) if cc_neg { cc_res = !cc_res } if cc_res { state.match_flag = true if first_match < 0 { first_match = i } state.match_index = i re.prog[pc].rep++ // increase repetitions i += char_len // next char m_state = .ist_quant_p continue } m_state = .ist_quant_n continue } // check bsls else if ist == IST_BSLS_CHAR { state.match_flag = false tmp_res := re.prog[pc].validator(byte(ch)) //C.printf("BSLS in_ch: %c res: %d\n", ch, tmp_res) if tmp_res { state.match_flag = true if first_match < 0 { first_match = i } state.match_index = i re.prog[pc].rep++ // increase repetitions i += char_len // next char m_state = .ist_quant_p continue } m_state = .ist_quant_n continue } // simple char IST else if ist == IST_SIMPLE_CHAR { //C.printf("IST_SIMPLE_CHAR\n") state.match_flag = false if re.prog[pc].ch == ch { state.match_flag = true if first_match < 0 { first_match = i } //C.printf("state.match_index: %d\n", state.match_index) state.match_index = i re.prog[pc].rep++ // increase repetitions i += char_len // next char m_state = .ist_quant_p continue } m_state = .ist_quant_n continue } /* UNREACHABLE */ //C.printf("PANIC2!! 
state: %d\n", m_state) return ERR_INTERNAL_ERROR, i } /*********************************** * Quantifier management ***********************************/ // ist_quant_ng if m_state == .ist_quant_ng { // we are finished here if group_index < 0 { //C.printf("Early stop!\n") result = NO_MATCH_FOUND m_state = .stop continue } tmp_pc := group_data[group_index] // PC to the end of the group token rep := re.prog[tmp_pc].group_rep // use a temp variable re.prog[tmp_pc].group_rep = 0 // clear the repetitions //C.printf(".ist_quant_ng group_pc_end: %d rep: %d\n", tmp_pc,rep) if rep >= re.prog[tmp_pc].rep_min { //C.printf("ist_quant_ng GROUP CLOSED OK group_index: %d\n", group_index) i = group_stack[group_index] pc = tmp_pc group_index-- m_state = .ist_next continue } else if re.prog[tmp_pc].next_is_or { //C.printf("ist_quant_ng OR Negative branch\n") i = group_stack[group_index] pc = re.prog[tmp_pc+1].rep_min -1 group_index-- m_state = .ist_next continue } else if rep>0 && rep < re.prog[tmp_pc].rep_min { //C.printf("ist_quant_ng UNDER THE MINIMUM g.i: %d\n", group_index) // check if we are inside a group, if yes exit from the nested groups if group_index > 0{ group_index-- pc = tmp_pc m_state = .ist_quant_ng //.ist_next continue } if group_index == 0 { group_index-- pc = tmp_pc // TEST m_state = .ist_next continue } result = NO_MATCH_FOUND m_state = .stop continue } else if rep==0 && rep < re.prog[tmp_pc].rep_min { //C.printf("ist_quant_ng ZERO UNDER THE MINIMUM g.i: %d\n", group_index) if group_index > 0{ group_index-- pc = tmp_pc m_state = .ist_quant_ng //.ist_next continue } result = NO_MATCH_FOUND m_state = .stop continue } //C.printf("DO NOT STAY HERE!! 
{%d,%d}:%d\n", re.prog[tmp_pc].rep_min, re.prog[tmp_pc].rep_max, rep) /* UNREACHABLE */ return ERR_INTERNAL_ERROR, i } // ist_quant_pg else if m_state == .ist_quant_pg { //C.printf(".ist_quant_pg\n") mut tmp_pc := pc if group_index >= 0 { tmp_pc = group_data[group_index] } rep := re.prog[tmp_pc].group_rep if rep < re.prog[tmp_pc].rep_min { //C.printf("ist_quant_pg UNDER RANGE\n") pc = re.prog[tmp_pc].goto_pc //group_index-- m_state = .ist_next continue } else if rep == re.prog[tmp_pc].rep_max { //C.printf("ist_quant_pg MAX RANGE\n") re.prog[tmp_pc].group_rep = 0 // clear the repetitions group_index-- m_state = .ist_next continue } else if rep >= re.prog[tmp_pc].rep_min { //C.printf("ist_quant_pg IN RANGE group_index:%d\n", group_index) // check greedy flag, if true exit on minimum if re.prog[tmp_pc].greedy == true { re.prog[tmp_pc].group_rep = 0 // clear the repetitions group_index-- m_state = .ist_next continue } pc = re.prog[tmp_pc].goto_pc - 1 group_index-- m_state = .ist_next continue } /* UNREACHABLE */ //C.printf("PANIC3!! state: %d\n", m_state) return ERR_INTERNAL_ERROR, i } // ist_quant_n else if m_state == .ist_quant_n { rep := re.prog[pc].rep //C.printf("Here!! PC %d is_next_or: %d \n", pc, re.prog[pc].next_is_or) // zero quantifier * or ? 
if rep == 0 && re.prog[pc].rep_min == 0 { //C.printf("ist_quant_n ZERO RANGE MIN\n") m_state = .ist_next // go to next ist continue } // match failed else if rep == 0 && re.prog[pc].rep_min > 0 { //C.printf("ist_quant_n NO MATCH\n") // dummy } // match + or * else if rep >= re.prog[pc].rep_min { //C.printf("ist_quant_n MATCH RANGE\n") m_state = .ist_next continue } // check the OR if present if re.prog[pc].next_is_or { //C.printf("OR present on failing\n") state.match_index = -1 m_state = .ist_next continue } // we are in a group manage no match from here if group_index >= 0 { //C.printf("ist_quant_n FAILED insied a GROUP group_index:%d\n", group_index) m_state = .ist_quant_ng continue } // no other options //C.printf("NO_MATCH_FOUND\n") result = NO_MATCH_FOUND m_state = .stop continue //return NO_MATCH_FOUND, 0 } // ist_quant_p else if m_state == .ist_quant_p { // exit on first match if (re.flag & F_EFM) != 0 { return i,i+1 } rep := re.prog[pc].rep // clear the actual dot char capture state if re.state_stack_index >= 0 { //C.printf("Drop the DOT_CHAR state!\n") re.state_stack_index-- } // under range if rep > 0 && rep < re.prog[pc].rep_min { //C.printf("ist_quant_p UNDER RANGE\n") m_state = .ist_load // continue the loop continue } // range ok, continue loop else if rep >= re.prog[pc].rep_min && rep < re.prog[pc].rep_max { //C.printf("ist_quant_p IN RANGE\n") // check greedy flag, if true exit on minimum if re.prog[pc].greedy == true { m_state = .ist_next continue } m_state = .ist_load continue } // max reached else if rep == re.prog[pc].rep_max { //C.printf("ist_quant_p MAX RANGE\n") m_state = .ist_next continue } } /* UNREACHABLE */ //C.printf("PANIC4!! 
state: %d\n", m_state)
		return ERR_INTERNAL_ERROR, i
	}

	// Check the results
	if state.match_index >= 0 {
		if group_index < 0 {
			//C.printf("OK match,natural end [%d,%d]\n", first_match, i)
			return first_match, i
		} else {
			//C.printf("Skip last group\n")
			return first_match,group_stack[group_index--]
		}
	}

	//C.printf("NO_MATCH_FOUND, natural end\n")
	return NO_MATCH_FOUND, 0
}

/******************************************************************************
*
* Public functions
*
******************************************************************************/

//
// Inits
//

// regex creates a regex object sized on the query string and compiles the query.
// The program and the char class buffers are allocated with in_query.len+1 slots
// and the max number of nested groups is set to 8.
// It returns the RE object followed by the two values returned by re.compile
// (named re_err and err_pos here).
pub fn regex(in_query string) (RE,int,int){
	mut re := RE{}
	re.prog = [Token{}].repeat(in_query.len+1)
	re.cc = [CharClass{}].repeat(in_query.len+1)
	re.group_max_nested = 8

	re_err,err_pos := re.compile(in_query)
	return re, re_err, err_pos
}

// new_regex creates a RE of small size (scale factor 1), usually sufficient for ordinary use.
// The returned object is not compiled.
pub fn new_regex() RE {
	return new_regex_by_size(1)
}

// new_regex_by_size creates a RE whose buffers are scaled by mult:
// MAX_CODE_LEN*mult program instructions, MAX_CODE_LEN*mult char classes
// and 3*mult max nested groups. The returned object is not compiled.
pub fn new_regex_by_size(mult int) RE {
	mut re := RE{}
	re.prog = [Token{}].repeat(MAX_CODE_LEN*mult)       // max program length, default 256 istructions
	re.cc = [CharClass{}].repeat(MAX_CODE_LEN*mult)     // char class list
	re.group_max_nested = 3*mult                        // max nested group
	return re
}

//
// Matchers
//

// match_string runs the compiled program on in_txt, starting from its first byte.
// It returns the (start,end) pair given by match_base. When a non empty match is
// found and the F_MS flag is set the match must start at index 0, when the F_ME
// flag is set the match must end at in_txt.len; otherwise NO_MATCH_FOUND,0 is returned.
pub fn (re mut RE) match_string(in_txt string) (int,int) {
	start, end := re.match_base(in_txt.str,in_txt.len)
	if start >= 0 && end > start {
		if (re.flag & F_MS) != 0 && start > 0 {
			return NO_MATCH_FOUND, 0
		}
		if (re.flag & F_ME) != 0 && end < in_txt.len {
			return NO_MATCH_FOUND, 0
		}
		return start, end
	}
	return start, end
}

//
// Finders
//

// find tries to find the first match in the input string.
// For every start index it first runs a cheap pre-check on at most re.query.len
// bytes with the F_EFM flag added (exit on the first token match); only when
// the pre-check succeeds the complete match is executed on the rest of the text.
// It returns the (start,end) indexes relative to in_txt or NO_MATCH_FOUND,0.
// re.flag is always left as the caller set it.
pub fn (re mut RE) find(in_txt string) (int,int) {
	mut i := 0
	mut start := -1
	mut end := -1
	old_flag := re.flag

	for i < in_txt.len {
		// test only the first part of the query string
		// add F_EFM to the caller flags: `&=` here would clear every other flag
		// and never set F_EFM
		re.flag = old_flag | F_EFM // set to exit on the first token match
		mut tmp_end := i+re.query.len
		if tmp_end > in_txt.len {
			tmp_end = in_txt.len
		}
		tmp_txt := string{ str: in_txt.str+i, len: tmp_end-i }
		start, end = re.match_base(tmp_txt.str, tmp_txt.len)

		// restore the caller flags on every path, not only when the pre-check succeeds
		re.flag = old_flag

		if start >= 0 && end > start {
			// test a complete match
			tmp_txt1 := string{ str: in_txt.str+i , len: in_txt.len-i }
			start, end = re.match_base(tmp_txt1.str, tmp_txt1.len)
			if start >= 0 && end > start {
				if (old_flag & F_MS) != 0 && (i+start) > 0 {
					return NO_MATCH_FOUND, 0
				}
				if (old_flag & F_ME) != 0 && (i+end) < in_txt.len {
					return NO_MATCH_FOUND, 0
				}
				return i+start, i+end
			}
		}

		i++

		// with F_MS set only a match starting at index 0 is accepted (see the
		// check above), so there is no reason to scan the following indexes
		if (old_flag & F_MS) != 0 {
			return NO_MATCH_FOUND, 0
		}
	}
	return NO_MATCH_FOUND, 0
}

// find_all finds all the non overlapping occurrences of the match pattern.
// It returns a flat list of indexes relative to in_txt: [start0, end0, start1, end1, ...]
pub fn (re mut RE) find_all(in_txt string) []int {
	mut i := 0
	mut res := []int
	mut ls := -1   // start index of the last saved match
	for i < in_txt.len {
		// s,e are relative to the slice in_txt[i..]
		s,e := re.find(in_txt[i..])
		if s >= 0 && e > s && i+s > ls {
			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
			res << i+s
			res << i+e
			ls = i+s
			i = i+e   // restart the search after the end of this match
			continue
		} else {
			i++
		}
	}
	return res
}

// replace returns a string where the matches are replaced with the replace string.
// The matches are the ones returned by find_all; when there are no matches
// in_txt is returned unchanged.
pub fn (re mut RE) replace(in_txt string, repl string) string {
	pos := re.find_all(in_txt)
	if pos.len > 0 {
		mut res := ""
		mut i := 0
		mut s1 := 0            // start of the text not yet copied
		mut e1 := in_txt.len
		for i < pos.len {
			// pos[i] is the start of the match, pos[i+1] is its end
			e1 = pos[i]
			res += in_txt[s1..e1] + repl
			s1 = pos[i+1]
			i += 2
		}
		// copy the text after the last match
		res += in_txt[s1..]
		return res
	}
	return in_txt
}