v/vlib/regex/regex.v

/**********************************************************************
*
* regex 0.9d
*
* Copyright (c) 2019-2020 Dario Deledda. All rights reserved.
* Use of this source code is governed by an MIT license
* that can be found in the LICENSE file.
*
* This file contains regex module
*
* Know limitation:
* - find is implemented in a trivial way
* - not full compliant PCRE
* - not compliant POSIX ERE
*
*
**********************************************************************/
module regex
import strings

pub const(
	V_REGEX_VERSION = "0.9d"      // regex module version

	MAX_CODE_LEN     = 256        // default small base code len for the regex programs
	MAX_QUANTIFIER   = 1073741824 // default max repetitions allowed for the quantifiers = 2^30

	// spaces chars (here only westerns!!) TODO: manage all the spaces from unicode
	SPACES = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]
	// new line chars for now only '\n'
	NEW_LINE_LIST = [`\n`,`\r`]

	// Results
	NO_MATCH_FOUND          = -1

	// Errors
	COMPILE_OK              =  0   // the regex string compiled, all ok
	ERR_CHAR_UNKNOWN        = -2   // the char used is unknow to the system
	ERR_UNDEFINED           = -3   // the compiler symbol is undefined
	ERR_INTERNAL_ERROR      = -4   // Bug in the regex system!!
	ERR_CC_ALLOC_OVERFLOW   = -5   // memory for char class full!!
	ERR_SYNTAX_ERROR        = -6   // syntax error in regex compiling
	ERR_GROUPS_OVERFLOW     = -7   // max number of groups reached
	ERR_GROUPS_MAX_NESTED   = -8   // max number of nested group reached
	ERR_GROUP_NOT_BALANCED  = -9   // group not balanced
	ERR_GROUP_QM_NOTATION   = -10  // group invalid notation
)

const(
	//*************************************
	// regex program instructions
	//*************************************
	IST_SIMPLE_CHAR  = u32(0x7FFFFFFF)   // single char instruction, 31 bit available to char

	// char class 11 0100 AA xxxxxxxx
	// AA = 00  regular class
	// AA = 01  Negated class ^ char
	IST_CHAR_CLASS       = 0xD1000000   // MASK
	IST_CHAR_CLASS_POS   = 0xD0000000   // char class normal [abc]
	IST_CHAR_CLASS_NEG   = 0xD1000000   // char class negate [^abc]

	// dot char        10 0110 xx xxxxxxxx
	IST_DOT_CHAR         = 0x98000000   // match any char except \n

	// backslash chars 10 0100 xx xxxxxxxx
	IST_BSLS_CHAR        = 0x90000000   // backslash char

	// OR |            10 010Y xx xxxxxxxx
	IST_OR_BRANCH        = 0x91000000   // OR case

	// groups          10 010Y xx xxxxxxxx
	IST_GROUP_START      = 0x92000000   // group start (
	IST_GROUP_END        = 0x94000000   // group end   )

	// control instructions
	IST_PROG_END         = u32(0x88000000)      //10 0010 xx xxxxxxxx
	//*************************************
)

/******************************************************************************
*
* General Utilities
*
******************************************************************************/
// utf8util_char_len calculate the length in bytes of a utf8 char
[inline]
fn utf8util_char_len(b byte) int {
	return (( 0xe5000000 >> (( b >> 3 ) & 0x1e )) & 3 ) + 1
}

// get_char get a char from position i and return an u32 with the unicode code
[inline]
fn (re RE) get_char(in_txt string, i int) (u32,int) {
	// ascii 8 bit
	if (re.flag & F_BIN) !=0 ||
		in_txt.str[i] & 0x80 == 0
	{
		return u32(in_txt.str[i]), 1
	}
	// unicode char
	char_len := utf8util_char_len(in_txt.str[i])
	mut tmp := 0
	mut ch := u32(0)
	for tmp < char_len {
		ch = (ch << 8) | in_txt.str[i+tmp]
		tmp++
	}
	return ch,char_len
}

// get_charb get a char from position i and return an u32 with the unicode code
[inline]
fn (re RE) get_charb(in_txt byteptr, i int) (u32,int) {
	// ascii 8 bit
	if (re.flag & F_BIN) !=0 ||
		in_txt[i] & 0x80 == 0
	{
		return u32(in_txt[i]), 1
	}
	// unicode char
	char_len := utf8util_char_len(in_txt[i])
	mut tmp := 0
	mut ch := u32(0)
	for tmp < char_len {
		ch = (ch << 8) | in_txt[i+tmp]
		tmp++
	}
	return ch,char_len
}

[inline]
fn is_alnum(in_char byte) bool {
	mut tmp := in_char - `A`
	if tmp >= 0x00 && tmp <= 25 { return true }
	tmp = in_char - `a`
	if tmp >= 0x00 && tmp <= 25 { return true }
	tmp = in_char - `0`
	if tmp >= 0x00 && tmp <= 9  { return true }
	if tmp == `_` { return true }
	return false
}

[inline]
fn is_not_alnum(in_char byte) bool {
	return !is_alnum(in_char)
}

[inline]
fn is_space(in_char byte) bool {
	return in_char in SPACES
}

[inline]
fn is_not_space(in_char byte) bool {
	return !is_space(in_char)
}

[inline]
fn is_digit(in_char byte) bool {
	tmp := in_char - `0`
	return tmp <= 0x09 && tmp >= 0
}

[inline]
fn is_not_digit(in_char byte) bool {
	return !is_digit(in_char)
}

[inline]
fn is_wordchar(in_char byte) bool {
	return is_alnum(in_char) || in_char == `_`
}

[inline]
fn is_not_wordchar(in_char byte) bool {
	return !is_alnum(in_char)
}

[inline]
fn is_lower(in_char byte) bool {
	tmp := in_char - `a`
	return  tmp >= 0x00 && tmp <= 25
}

[inline]
fn is_upper(in_char byte) bool {
	tmp := in_char - `A`
	return  tmp >= 0x00 && tmp <= 25
}

pub fn (re RE) get_parse_error_string(err int) string {
	match err {
		COMPILE_OK             { return "COMPILE_OK" }
		NO_MATCH_FOUND         { return "NO_MATCH_FOUND" }
		ERR_CHAR_UNKNOWN       { return "ERR_CHAR_UNKNOWN" }
		ERR_UNDEFINED          { return "ERR_UNDEFINED" }
		ERR_INTERNAL_ERROR     { return "ERR_INTERNAL_ERROR" }
		ERR_CC_ALLOC_OVERFLOW  { return "ERR_CC_ALLOC_OVERFLOW" }
		ERR_SYNTAX_ERROR       { return "ERR_SYNTAX_ERROR" }
		ERR_GROUPS_OVERFLOW    { return "ERR_GROUPS_OVERFLOW" }
		ERR_GROUPS_MAX_NESTED  { return "ERR_GROUPS_MAX_NESTED" }
		ERR_GROUP_NOT_BALANCED { return "ERR_GROUP_NOT_BALANCED" }
		ERR_GROUP_QM_NOTATION  { return "ERR_GROUP_QM_NOTATION" }
		else { return "ERR_UNKNOWN" }
	}
}

// utf8_str convert and utf8 sequence to a printable string
[inline]
fn utf8_str(ch u32) string {
	mut i := 4
	mut res := ""
	for i > 0 {
		v := byte((ch >> ((i-1)*8)) & 0xFF)
		if v != 0{
			res += "${v:1c}"
		}
		i--
	}
	return res
}

// simple_log default log function
fn simple_log(txt string) {
	print(txt)
}

/******************************************************************************
*
* Token Structs
*
******************************************************************************/
struct Token{
mut:
	ist u32 = u32(0)

	// char
	ch u32                 = u32(0)  // char of the token if any
	ch_len byte            = byte(0) // char len

	// Quantifiers / branch
	rep_min         int    = 0     // used also for jump next in the OR branch [no match] pc jump
	rep_max         int    = 0     // used also for jump next in the OR branch [   match] pc jump
	greedy          bool   = false // greedy quantifier flag

	// Char class
	cc_index        int    = -1

	// counters for quantifier check (repetitions)
	rep             int    = 0

	// validator function pointer
	validator fn (byte) bool

	// groups variables
	group_rep          int = 0     // repetition of the group
	group_id           int = -1    // id of the group
	goto_pc            int = -1    // jump to this PC if is needed

	// OR flag for the token
	next_is_or bool        = false // true if the next token is an OR
}

[inline]
fn (tok mut Token) reset() {
	tok.rep = 0
}

/******************************************************************************
*
* Regex struct
*
******************************************************************************/
pub const (
	F_NL  = 0x00000001  // end the match when find a new line symbol
	F_MS  = 0x00000002  // match true only if the match is at the start of the string
	F_ME  = 0x00000004  // match true only if the match is at the end of the string

	F_EFM = 0x00000100  // exit on first token matched, used by search
	F_BIN = 0x00000200  // work only on bytes, ignore utf-8

	// behaviour modifier flags
	//F_OR  = 0x00010000  // the OR work with concatenation like PCRE
	F_SRC = 0x00020000  // search mode enabled
)

struct StateDotObj{
mut:
	i  int                = -1  // char index in the input buffer
	pc int                = -1  // program counter saved
	mi int                = -1  // match_index saved
	group_stack_index int = -1  // continuous save on capturing groups
}

pub
struct RE {
pub mut:
	prog []Token

	// char classes storage
	cc []CharClass             // char class list
	cc_index int         = 0   // index

	// state index
	state_stack_index int= -1
	state_stack []StateDotObj


	// groups
	group_count int      = 0   // number of groups in this regex struct
	groups []int               // groups index results
	group_max_nested int = 3   // max nested group
	group_max int        = 8   // max allowed number of different groups

	group_csave []int    = []int  // groups continuous save array
	group_csave_index int= -1     // groups continuous save index

	group_map map[string]int      // groups names map

	// flags
	flag int             = 0   // flag for optional parameters

	// Debug/log
	debug int            = 0   // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
	log_func fn (string) = simple_log  // log function, can be customized by the user
	query string         = ""  // query string
}

// Reset RE object
//[inline]
fn (re mut RE) reset(){
	re.cc_index         = 0

	mut i := 0
	for i < re.prog.len {
		re.prog[i].group_rep          = 0 // clear repetition of the group
		re.prog[i].rep                = 0 // clear repetition of the token
		i++
	}
	re.groups = [-1].repeat(re.group_count*2)

	re.state_stack_index = -1

	// reset group_csave
	if re.group_csave.len > 0 {
		re.group_csave_index = 1
		re.group_csave[0] = 0     // reset the capture count
	}
}

// reset for search mode fail
// gcc bug, dont use [inline] or go 5 time slower
fn (re mut RE) reset_src(){
	mut i := 0
	for i < re.prog.len {
		re.prog[i].group_rep          = 0 // clear repetition of the group
		re.prog[i].rep                = 0 // clear repetition of the token
		i++
	}
	re.state_stack_index = -1
}

pub fn (re RE) get_group(group_name string) (int, int) {
	if group_name in re.group_map {
		tmp_index := re.group_map[group_name]-1
		start := re.groups[tmp_index*2]
		end := re.groups[tmp_index*2+1]
		return start,end
	}
	return -1, -1
}

/******************************************************************************
*
* Backslashes chars
*
******************************************************************************/
struct BslsStruct {
	ch u32                   // meta char
	validator fn (byte) bool // validator function pointer
}

const(
	BSLS_VALIDATOR_ARRAY = [
		BslsStruct{`w`, is_alnum},
		BslsStruct{`W`, is_not_alnum},
		BslsStruct{`s`, is_space},
		BslsStruct{`S`, is_not_space},
		BslsStruct{`d`, is_digit},
		BslsStruct{`D`, is_not_digit},
		BslsStruct{`a`, is_lower},
		BslsStruct{`A`, is_upper},
	]

	// these chars are escape if preceded by a \
	BSLS_ESCAPE_LIST = [ `\\`,`|`,`.`,`*`,`+`,`{`,`}`,`[`,`]` ]
)

enum BSLS_parse_state {
		start
		bsls_found
		bsls_char
		normal_char
}

// parse_bsls return (index, str_len) BSLS_VALIDATOR_ARRAY index, len of the backslash sequence if present
fn (re RE) parse_bsls(in_txt string, in_i int) (int,int){
	mut status := BSLS_parse_state.start
	mut i := in_i

	for i < in_txt.len {
		// get our char
		char_tmp,char_len := re.get_char(in_txt,i)
		ch := byte(char_tmp)

		if status == .start && ch == `\\` {
			status = .bsls_found
			i += char_len
			continue
		}

		// check if is our bsls char, for now only one length sequence
		if status == .bsls_found {
			for c,x in BSLS_VALIDATOR_ARRAY {
				if x.ch == ch {
					return c,i-in_i+1
				}
			}
			status = .normal_char
			continue
		}

		// no BSLS validator, manage as normal escape char char
		if status == .normal_char {
			if ch in BSLS_ESCAPE_LIST {
				return NO_MATCH_FOUND,i-in_i+1
			}
			return ERR_SYNTAX_ERROR,i-in_i+1
		}

		// at the present time we manage only one char after the \
		break

	}
	// not our bsls return KO
	return ERR_SYNTAX_ERROR, i
}

/******************************************************************************
*
* Char class
*
******************************************************************************/
const(
	CC_NULL = 0    // empty cc token
	CC_CHAR = 1    // simple char: a
	CC_INT  = 2    // char interval: a-z
	CC_BSLS = 3    // backslash char
	CC_END  = 4    // cc sequence terminator
)

struct CharClass {
mut:
	cc_type int = CC_NULL      // type of cc token
	ch0 u32     = u32(0)       // first char of the interval a-b  a in this case
	ch1 u32     = u32(0)	   // second char of the interval a-b b in this case
	validator fn (byte) bool   // validator function pointer
}

enum CharClass_parse_state {
	start
	in_char
	in_bsls
	separator
	finish
}

fn (re RE) get_char_class(pc int) string {
	buf := [byte(0)].repeat(re.cc.len)
	mut buf_ptr := *byte(&buf)

	mut cc_i := re.prog[pc].cc_index
	mut i := 0
	mut tmp := 0
	for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != CC_END {

		if re.cc[cc_i].cc_type == CC_BSLS {
			buf_ptr[i++] = `\\`
			buf_ptr[i++] = byte(re.cc[cc_i].ch0)
		}
		else if re.cc[cc_i].ch0 == re.cc[cc_i].ch1 {
			tmp = 3
			for tmp >= 0 {
				x := byte((re.cc[cc_i].ch0 >> (tmp*8)) & 0xFF)
				if x != 0 {
					buf_ptr[i++] = x
				}
				tmp--
			}
		}
		else {
			tmp = 3
			for tmp >= 0 {
				x := byte((re.cc[cc_i].ch0 >> (tmp*8)) & 0xFF)
				if x != 0 {
					buf_ptr[i++] = x
				}
				tmp--
			}
			buf_ptr[i++] = `-`
			tmp = 3
			for tmp >= 0 {
				x := byte((re.cc[cc_i].ch1 >> (tmp*8)) & 0xFF)
				if x != 0 {
					buf_ptr[i++] = x
				}
				tmp--
			}
		}
		cc_i++
	}
	buf_ptr[i] = byte(0)

	return tos_clone( buf_ptr )
}

fn (re RE) check_char_class(pc int, ch u32) bool {
	mut cc_i := re.prog[pc].cc_index
	for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != CC_END {
		if re.cc[cc_i].cc_type == CC_BSLS {
			if re.cc[cc_i].validator(byte(ch)) {
				return true
			}
		}
		else if ch >= re.cc[cc_i].ch0 && ch <= re.cc[cc_i].ch1 {
			return true
		}
		cc_i++
	}
	return false
}

// parse_char_class return (index, str_len, cc_type) of a char class [abcm-p], char class start after the [ char
fn (re mut RE) parse_char_class(in_txt string, in_i int) (int, int, u32) {
	mut status := CharClass_parse_state.start
	mut i := in_i

	mut tmp_index := re.cc_index
	res_index := re.cc_index

	mut cc_type := u32(IST_CHAR_CLASS_POS)

	for i < in_txt.len {

		// check if we are out of memory for char classes
		if tmp_index >= re.cc.len {
			return ERR_CC_ALLOC_OVERFLOW,0,u32(0)
		}

		// get our char
		char_tmp,char_len := re.get_char(in_txt,i)
		ch := byte(char_tmp)

		//C.printf("CC #%3d ch: %c\n",i,ch)

		// negation
		if status == .start && ch == `^` {
			cc_type = u32(IST_CHAR_CLASS_NEG)
			i += char_len
			continue
		}

		// bsls
		if (status == .start || status == .in_char) && ch == `\\` {
			//C.printf("CC bsls.\n")
			status = .in_bsls
			i += char_len
			continue
		}

		if status == .in_bsls {
			//C.printf("CC bsls validation.\n")
			for c,x in BSLS_VALIDATOR_ARRAY {
				if x.ch == ch {
					//C.printf("CC bsls found \\%c.\n",ch)
					re.cc[tmp_index].cc_type   = CC_BSLS
					re.cc[tmp_index].ch0       = BSLS_VALIDATOR_ARRAY[c].ch
					re.cc[tmp_index].ch1       = BSLS_VALIDATOR_ARRAY[c].ch
					re.cc[tmp_index].validator = BSLS_VALIDATOR_ARRAY[c].validator
					i += char_len
					tmp_index++
					status = .in_char
					break
				}
			}
			if status == .in_bsls {
				//C.printf("CC bsls not found \\%c.\n",ch)
				status = .in_char
			}else {
				continue
			}
		}

		// simple char
		if (status == .start || status == .in_char) &&
			ch != `-` && ch != `]`
		{
			status = .in_char

			re.cc[tmp_index].cc_type = CC_CHAR
			re.cc[tmp_index].ch0     = char_tmp
			re.cc[tmp_index].ch1     = char_tmp

			i += char_len
			tmp_index++
			continue
		}

		// check range separator
		if status == .in_char && ch == `-` {
			status = .separator
			i += char_len
			continue
		}

		// check range end
		if status == .separator && ch != `]` && ch != `-` {
			status = .in_char
			re.cc[tmp_index-1].cc_type = CC_INT
			re.cc[tmp_index-1].ch1     = char_tmp
			i += char_len
			continue
		}

		// char class end
		if status == .in_char && ch == `]` {
			re.cc[tmp_index].cc_type = CC_END
			re.cc[tmp_index].ch0     = 0
			re.cc[tmp_index].ch1     = 0
			re.cc_index = tmp_index+1

			return res_index, i-in_i+2, cc_type
		}

		i++
	}
	return ERR_SYNTAX_ERROR,0,u32(0)
}

/******************************************************************************
*
* Re Compiler
*
******************************************************************************/
//
// Quantifier
//
enum Quant_parse_state {
	start
	min_parse
	comma_checked
	max_parse
	greedy
	gredy_parse
	finish
}

// parse_quantifier return (min, max, str_len, greedy_flag) of a {min,max}? quantifier starting after the { char
fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
	mut status := Quant_parse_state.start
	mut i := in_i

	mut q_min := 0 // default min in a {} quantifier is 1
	mut q_max := 0 // deafult max in a {} quantifier is MAX_QUANTIFIER

	mut ch := byte(0)

	for i < in_txt.len {
		ch = in_txt.str[i]

		//C.printf("%c status: %d\n",ch,status)

		// exit on no compatible char with {} quantifier
		if utf8util_char_len(ch) != 1 {
			return ERR_SYNTAX_ERROR,i,0,false
		}

		// min parsing skip if comma present
		if status == .start && ch == `,` {
			q_min = 0 // default min in a {} quantifier is 0
			status = .comma_checked
			i++
			continue
		}

		if status == .start && is_digit( ch ) {
			status = .min_parse
			q_min *= 10
			q_min += int(ch - `0`)
			i++
			continue
		}

		if status == .min_parse && is_digit( ch ) {
			q_min *= 10
			q_min += int(ch - `0`)
			i++
			continue
		}

		// we have parsed the min, now check the max
		if status == .min_parse && ch == `,` {
			status = .comma_checked
			i++
			continue
		}

		// single value {4}
		if status == .min_parse && ch == `}` {
			q_max = q_min

			status = .greedy
			continue
		}

		// end without max
		if status == .comma_checked && ch == `}` {
			q_max = MAX_QUANTIFIER

			status = .greedy
			continue
		}

		// start max parsing
		if status == .comma_checked && is_digit( ch ) {
			status = .max_parse
			q_max *= 10
			q_max += int(ch - `0`)
			i++
			continue
		}

		// parse the max
		if status == .max_parse && is_digit( ch ) {
			q_max *= 10
			q_max += int(ch - `0`)
			i++
			continue
		}

		// finished the quantifier
		if status == .max_parse && ch == `}` {
			status = .greedy
			continue
		}

		// check if greedy flag char ? is present
		if status == .greedy {
			if i+1 < in_txt.len {
				i++
				status = .gredy_parse
				continue
			}
			return q_min, q_max, i-in_i+2, false
		}

		// check the greedy flag
		if status == .gredy_parse {
			if ch == `?` {
				return q_min, q_max, i-in_i+2, true
			} else {
				i--
				return q_min, q_max, i-in_i+2, false
			}
		}

		// not  a {} quantifier, exit
		return ERR_SYNTAX_ERROR, i, 0, false
	}

	// not a conform {} quantifier
	return ERR_SYNTAX_ERROR, i, 0, false
}

//
// Groups
//
enum Group_parse_state {
	start
	q_mark      // (?
	q_mark1     // (?:|P  checking
	p_status    // (?P
	p_start     // (?P<
	p_end       // (?P<...>
	p_in_name   // (?P<...
	finish
}

// parse_groups parse a group for ? (question mark) syntax, if found, return (error, capture_flag, name_of_the_group, next_index)
fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, string, int) {
	mut status := Group_parse_state.start
	mut i := in_i
	mut name := ''

	for i < in_txt.len && status != .finish {

		// get our char
		char_tmp,char_len := re.get_char(in_txt,i)
		ch := byte(char_tmp)

		// start
		if status == .start && ch == `(` {
			status = .q_mark
			i += char_len
			continue
		}

		// check for question marks
		if status == .q_mark && ch == `?` {
			status = .q_mark1
			i += char_len
			continue
		}

		// non capturing group
		if status == .q_mark1 && ch == `:` {
			i += char_len
			return 0, false, name, i
		}

		// enter in P section
		if status == .q_mark1 && ch == `P` {
			status = .p_status
			i += char_len
			continue
		}

		// not a valid q mark found
		if status == .q_mark1 {
			//println("NO VALID Q MARK")
			return -2 , true, name, i
		}

		if status == .p_status && ch == `<` {
			status = .p_start
			i += char_len
			continue
		}

		if status == .p_start && ch != `>` {
			status = .p_in_name
			name += "${ch:1c}" // TODO: manage utf8 chars
			i += char_len
			continue
		}

		// colect name
		if status == .p_in_name && ch != `>` && is_alnum(ch) {
			name += "${ch:1c}" // TODO: manage utf8 chars
			i += char_len
			continue
		}

		// end name
		if status == .p_in_name && ch == `>` {
			i += char_len
			return 0, true, name, i
		}

		// error on name group
		if status == .p_in_name {
			return -2 , true, name, i
		}

		// normal group, nothig to do, exit
		return  0 , true, name, i
	}
	/* UNREACHABLE */
	//println("ERROR!! NOT MEANT TO BE HERE!!1")
	return -2 , true, name, i
}

//
// main compiler
//
// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
pub fn (re mut RE) compile(in_txt string) (int,int) {
	mut i        := 0      // input string index
	mut pc       := 0      // program counter
	mut tmp_code := u32(0)

	// group management variables
	mut group_count           := -1
	mut group_stack           := [0 ].repeat(re.group_max_nested)
	mut group_stack_txt_index := [-1].repeat(re.group_max_nested)
	mut group_stack_index     := -1

	re.query = in_txt      // save the query string

	i = 0
	for i < in_txt.len {
		tmp_code = u32(0)
		mut char_tmp := u32(0)
		mut char_len := 0
		//C.printf("i: %3d ch: %c\n", i, in_txt.str[i])

		char_tmp,char_len = re.get_char(in_txt,i)

		//
		// check special cases: $ ^
		//
		if char_len == 1 && i == 0 && byte(char_tmp) == `^` {
			re.flag = F_MS
			i = i + char_len
			continue
		}
		if char_len == 1 && i == (in_txt.len-1) && byte(char_tmp) == `$` {
			re.flag = F_ME
			i = i + char_len
			continue
		}

		// IST_GROUP_START
		if char_len == 1 && pc >= 0 && byte(char_tmp) == `(` {

			//check max groups allowed
			if group_count > re.group_max {
				return ERR_GROUPS_OVERFLOW,i+1
			}
			group_stack_index++

			// check max nested groups allowed
			if group_stack_index > re.group_max_nested {
				return ERR_GROUPS_MAX_NESTED,i+1
			}

			tmp_res, cgroup_flag, cgroup_name, next_i := re.parse_groups(in_txt,i)

			// manage question mark format error
			if tmp_res < -1 {
				return ERR_GROUP_QM_NOTATION,next_i
			}

			//println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
			i = next_i

			if cgroup_flag == true {
				group_count++
			}

			// calculate the group id
			// if it is a named group, recycle the group id
			// NOTE: **** the group index is +1 because map return 0 when not found!! ****
			mut group_id := group_count
			if cgroup_name.len > 0 {
				//println("GROUP NAME: ${cgroup_name}")
				if cgroup_name in re.group_map{
					group_id = re.group_map[cgroup_name]-1
					group_count--
				} else {
					re.group_map[cgroup_name] = group_id+1
				}
			}

			group_stack_txt_index[group_stack_index] = i
			group_stack[group_stack_index] = pc

			re.prog[pc].ist = u32(0) | IST_GROUP_START
			re.prog[pc].rep_min = 1
			re.prog[pc].rep_max = 1

			// set the group id
			if cgroup_flag == false {
				//println("NO CAPTURE GROUP")
				re.prog[pc].group_id = -1
			} else {
				re.prog[pc].group_id = group_id
			}

			pc = pc + 1
			continue

		}

		// IST_GROUP_END
		if char_len==1 && pc > 0 && byte(char_tmp) == `)` {
			if group_stack_index < 0 {
				return ERR_GROUP_NOT_BALANCED,i+1
			}

			goto_pc := group_stack[group_stack_index]
			group_stack_index--

			re.prog[pc].ist = u32(0) | IST_GROUP_END
			re.prog[pc].rep_min = 1
			re.prog[pc].rep_max = 1

			re.prog[pc].goto_pc = goto_pc			          // PC where to jump if a group need
			re.prog[pc].group_id = re.prog[goto_pc].group_id  // id of this group, used for storing data

			re.prog[goto_pc].goto_pc = pc                     // start goto point to the end group pc
			//re.prog[goto_pc].group_id = group_count         // id of this group, used for storing data

			pc = pc + 1
			i = i + char_len
			continue
		}

		// IST_DOT_CHAR match any char except the following token
		if char_len==1 && pc >= 0 && byte(char_tmp) == `.` {
			re.prog[pc].ist = u32(0) | IST_DOT_CHAR
			re.prog[pc].rep_min = 1
			re.prog[pc].rep_max = 1
			pc = pc + 1
			i = i + char_len
			continue
		}

		// OR branch
		if char_len==1 && pc > 0 && byte(char_tmp) == `|` {
			// two consecutive IST_DOT_CHAR are an error
			if pc > 0 && re.prog[pc-1].ist == IST_OR_BRANCH {
				return ERR_SYNTAX_ERROR,i
			}
			re.prog[pc].ist = u32(0) | IST_OR_BRANCH
			pc = pc + 1
			i = i + char_len
			continue
		}

		// Quantifiers
		if char_len==1 && pc > 0{
			mut quant_flag := true
			match byte(char_tmp) {
				`?` {
					//C.printf("q: %c\n",char_tmp)
					re.prog[pc-1].rep_min = 0
					re.prog[pc-1].rep_max = 1
				}

				`+` {
					//C.printf("q: %c\n",char_tmp)
					re.prog[pc-1].rep_min = 1
					re.prog[pc-1].rep_max = MAX_QUANTIFIER
				}

				`*` {
					//C.printf("q: %c\n",char_tmp)
					re.prog[pc-1].rep_min = 0
					re.prog[pc-1].rep_max = MAX_QUANTIFIER
				}

				`{` {
					min, max, tmp, greedy := re.parse_quantifier(in_txt, i+1)
					// it is a quantifier
					if min >= 0 {
						//C.printf("{%d,%d}\n str:[%s] greedy: %d\n", min, max, in_txt[i..i+tmp], greedy)
						i = i + tmp
						re.prog[pc-1].rep_min = min
						re.prog[pc-1].rep_max = max
						re.prog[pc-1].greedy  = greedy
						continue
					}
					else {
						return min,i
					}
					// TODO: decide if the open bracket can be conform without the close bracket
					/*
					// no conform, parse as normal char
					else {
						quant_flag = false
					}
					*/
				}
				else{
					quant_flag = false
				}
			}

			if quant_flag {
				i = i + char_len
				continue
			}
		}

		// IST_CHAR_CLASS_*
		if char_len==1 && pc >= 0{
			if byte(char_tmp) == `[` {
				cc_index,tmp,cc_type := re.parse_char_class(in_txt, i+1)
				if cc_index >= 0 {
					//C.printf("index: %d str:%s\n",cc_index,in_txt[i..i+tmp])
					i = i + tmp
					re.prog[pc].ist      = u32(0) | cc_type
					re.prog[pc].cc_index = cc_index
					re.prog[pc].rep_min  = 1
					re.prog[pc].rep_max  = 1
					pc = pc + 1
					continue
				}

				// cc_class vector memory full
				else if cc_index < 0 {
					return cc_index, i
				}
			}
		}

		// IST_BSLS_CHAR
		if char_len==1 && pc >= 0{
			if byte(char_tmp) == `\\` {
				bsls_index,tmp := re.parse_bsls(in_txt,i)
				//C.printf("index: %d str:%s\n",bsls_index,in_txt[i..i+tmp])
				if bsls_index >= 0 {
					i = i + tmp
					re.prog[pc].ist       = u32(0) | IST_BSLS_CHAR
					re.prog[pc].rep_min   = 1
					re.prog[pc].rep_max   = 1
					re.prog[pc].validator = BSLS_VALIDATOR_ARRAY[bsls_index].validator
					re.prog[pc].ch      = BSLS_VALIDATOR_ARRAY[bsls_index].ch
					pc = pc + 1
					continue
				}
				// this is an escape char, skip the bsls and continue as a normal char
				else if bsls_index == NO_MATCH_FOUND {
					i += char_len
					char_tmp,char_len = re.get_char(in_txt,i)
					// continue as simple char
				}
				// if not an escape or a bsls char then it is an error (at least for now!)
				else {
					return bsls_index,i+tmp
				}
			}
		}

		// IST_SIMPLE_CHAR
		re.prog[pc].ist     = IST_SIMPLE_CHAR
		re.prog[pc].ch      = char_tmp
		re.prog[pc].ch_len  = char_len
		re.prog[pc].rep_min = 1
		re.prog[pc].rep_max = 1
		//C.printf("char: %c\n",char_tmp)
		pc = pc +1

		i+=char_len
	}

	// add end of the program
	re.prog[pc].ist = IST_PROG_END

	// check for unbalanced groups
	if group_stack_index != -1 {
		return ERR_GROUP_NOT_BALANCED, group_stack_txt_index[group_stack_index]+1
	}

	// check for OR at the end of the program
	if pc > 0 && re.prog[pc-1].ist == IST_OR_BRANCH {
		return ERR_SYNTAX_ERROR,in_txt.len
	}

	// store the number of groups in the query
	re.group_count = group_count+1

	//******************************************
	// Post processing
	//******************************************

	// count IST_DOT_CHAR to set the size of the state stack
	mut pc1 := 0
	mut tmp_count := 0
	for pc1 < pc {
		if re.prog[pc1].ist == IST_DOT_CHAR {
			tmp_count++
		}
		pc1++
	}

	// init the state stack
	re.state_stack = [StateDotObj{}].repeat(tmp_count+1)

	// OR branch
	// a|b|cd
	// d exit point
	// a,b,c branches
	// set the jump in the right places
	pc1 = 0
	for pc1 < pc-2 {
		// two consecutive OR are a syntax error
		if re.prog[pc1+1].ist == IST_OR_BRANCH && re.prog[pc1+2].ist == IST_OR_BRANCH {
			return ERR_SYNTAX_ERROR, i
		}

		// manange a|b chains like a|(b)|c|d...
		// standard solution
		if re.prog[pc1].ist != IST_OR_BRANCH &&
			re.prog[pc1+1].ist == IST_OR_BRANCH &&
			re.prog[pc1+2].ist != IST_OR_BRANCH
		{
			re.prog[pc1].next_is_or = true   // set that the next token is an  OR
			re.prog[pc1+1].rep_min = pc1+2   // failed match jump

			// match jump, if an OR chain the next token will be an OR token
			mut pc2 := pc1+2
			for pc2 < pc-1 {
				ist := re.prog[pc2].ist
				if  ist == IST_GROUP_START {
					re.prog[pc1+1].rep_max = re.prog[pc2].goto_pc + 1
					break
				}
				if ist != IST_OR_BRANCH {
					re.prog[pc1+1].rep_max = pc2 + 1
					break
				}
				pc2++
			}
			//C.printf("Compile OR postproc. [%d,OR %d,%d]\n",pc1,pc1+1,pc2)
			pc1 = pc2
			continue
		}

		pc1++
	}

	//******************************************
	// DEBUG PRINT REGEX GENERATED CODE
	//******************************************
	if re.debug > 0 {
		re.log_func(re.get_code())
	}
	//******************************************

	return COMPILE_OK, 0
}

// get_code return the compiled code as regex string, note: may be different from the source!
pub fn (re RE) get_code() string {
		mut pc1 := 0
		mut res := strings.new_builder(re.cc.len*2*re.prog.len)
		res.write("========================================\nv RegEx compiler v $V_REGEX_VERSION output:\n")

		mut stop_flag := false

		for pc1 <= re.prog.len {
			tk := re.prog[pc1]
			res.write("PC:${pc1:3d}")

		    res.write(" ist: ")
		    res.write("${tk.ist:8x}".replace(" ","0") )
		    res.write(" ")
			ist :=tk.ist
			if ist == IST_BSLS_CHAR {
				res.write("[\\${tk.ch:1c}]     BSLS")
			} else if ist == IST_PROG_END {
				res.write("PROG_END")
				stop_flag = true
			} else if ist == IST_OR_BRANCH {
				res.write("OR      ")
			} else if ist == IST_CHAR_CLASS_POS {
				res.write("[${re.get_char_class(pc1)}]     CHAR_CLASS_POS")
			} else if ist == IST_CHAR_CLASS_NEG {
				res.write("[^${re.get_char_class(pc1)}]    CHAR_CLASS_NEG")
			} else if ist == IST_DOT_CHAR {
				res.write(".        DOT_CHAR")
			} else if ist == IST_GROUP_START {
				res.write("(        GROUP_START #:${tk.group_id}")
				if tk.group_id == -1 {
					res.write(" ?:")
				} else {
					for x in re.group_map.keys() {
						if re.group_map[x] == (tk.group_id+1) {
							res.write(" ?P<${x}>")
							break
						}
					}
				}
			} else if ist == IST_GROUP_END {
				res.write(")        GROUP_END   #:${tk.group_id}")
			} else if ist == IST_SIMPLE_CHAR {
				res.write("[${tk.ch:1c}]      query_ch")
			}

			if tk.rep_max == MAX_QUANTIFIER {
				res.write(" {${tk.rep_min:3d},MAX}")
			}else{
				if ist == IST_OR_BRANCH {
					res.write(" if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}")
				} else {
					res.write(" {${tk.rep_min:3d},${tk.rep_max:3d}}")
				}
				if tk.greedy == true {
					res.write("?")
				}
			}
			res.write("\n")
			if stop_flag {
				break
			}
			pc1++
		}

		res.write("========================================\n")
		return res.str()
}

// get_query return a string with a reconstruction of the query starting from the regex program code
pub fn (re RE) get_query() string {
	mut res := strings.new_builder(re.query.len*2)

	if (re.flag & F_MS) != 0 {
		res.write("^")
	}

	mut i := 0
	for i < re.prog.len && re.prog[i].ist != IST_PROG_END && re.prog[i].ist != 0{
		tk := &re.prog[i]
		ch := tk.ist

		// GROUP start
		if ch == IST_GROUP_START {
			if re.debug == 0 {
				res.write("(")
			} else {
				if tk.group_id == -1 {
					res.write("(?:")   // non capturing group
				} else {
					res.write("#${tk.group_id}(")
				}
			}

			for x in re.group_map.keys() {
				if re.group_map[x] == (tk.group_id+1) {
					res.write("?P<${x}>")
					break
				}
			}

			i++
			continue
		}

		// GROUP end
		if ch == IST_GROUP_END {
			res.write(")")
		}

		// OR branch
		if ch == IST_OR_BRANCH {
			res.write("|")
			if re.debug > 0 {
				res.write("{${tk.rep_min},${tk.rep_max}}")
			}
			i++
			continue
		}

		// char class
		if ch == IST_CHAR_CLASS_NEG || ch == IST_CHAR_CLASS_POS {
			res.write("[")
			if ch == IST_CHAR_CLASS_NEG {
				res.write("^")
			}
			res.write("${re.get_char_class(i)}")
			res.write("]")
		}

		// bsls char
		if ch == IST_BSLS_CHAR {
			res.write("\\${tk.ch:1c}")
		}

		// IST_DOT_CHAR
		if ch == IST_DOT_CHAR {
			res.write(".")
		}

		// char alone
		if ch == IST_SIMPLE_CHAR {
			if byte(ch) in BSLS_ESCAPE_LIST {
				res.write("\\")
			}
			res.write("${tk.ch:c}")
		}

		// quantifier
		if !(tk.rep_min == 1 && tk.rep_max == 1) {
			if tk.rep_min == 0 && tk.rep_max == 1 {
				res.write("?")
			} else if tk.rep_min == 1 && tk.rep_max == MAX_QUANTIFIER {
				res.write("+")
			} else if tk.rep_min == 0 && tk.rep_max == MAX_QUANTIFIER {
				res.write("*")
			} else {
				if tk.rep_max == MAX_QUANTIFIER {
					res.write("{${tk.rep_min},MAX}")
				} else {
					res.write("{${tk.rep_min},${tk.rep_max}}")
				}
				if tk.greedy == true {
					res.write("?")
				}
			}
		}
		i++
	}
	if (re.flag & F_ME) != 0 {
		res.write("$")
	}

	return res.str()
}

/******************************************************************************
*
* Matching
*
******************************************************************************/
enum Match_state{
	start = 0
	stop
	end
	new_line

	ist_load     // load and execute instruction
	ist_next     // go to next instruction
	ist_next_ks  // go to next instruction without clenaning the state
	ist_quant_p  // match positive ,quantifier check
	ist_quant_n  // match negative, quantifier check
	ist_quant_pg // match positive ,group quantifier check
	ist_quant_ng // match negative ,group quantifier check
}

fn state_str(s Match_state) string {
	match s{
		.start        { return "start" }
		.stop         { return "stop" }
		.end          { return "end" }
		.new_line     { return "new line" }

		.ist_load     { return "ist_load" }
		.ist_next     { return "ist_next" }
		.ist_next_ks  { return "ist_next_ks" }
		.ist_quant_p  { return "ist_quant_p" }
		.ist_quant_n  { return "ist_quant_n" }
		.ist_quant_pg { return "ist_quant_pg" }
		.ist_quant_ng { return "ist_quant_ng" }
		else { return "UNKN" }
	}
}

struct StateObj {
pub mut:
	match_flag bool = false
	match_index int = -1
	match_first int = -1
}

pub fn (re mut RE) match_base(in_txt byteptr, in_txt_len int ) (int,int) {
	// result status
	mut result := NO_MATCH_FOUND     // function return
	mut first_match := -1             //index of the first match

	mut i := 0                       // source string index
	mut ch := u32(0)                 // examinated char
	mut char_len := 0                // utf8 examinated char len
	mut m_state := Match_state.start // start point for the matcher FSM

	mut pc := -1                     // program counter
	mut state := StateObj{}          // actual state
	mut ist := u32(0)                // actual instruction
	mut l_ist := u32(0)              // last matched instruction

	mut group_stack      := [-1].repeat(re.group_max)
	mut group_data       := [-1].repeat(re.group_max)

	mut group_index := -1            // group id used to know how many groups are open

	mut step_count := 0              // stats for debug
	mut dbg_line   := 0              // count debug line printed

	re.reset()

	if re.debug>0 {
		// print header
		mut h_buf := strings.new_builder(32)
		h_buf.write("flags: ")
		h_buf.write("${re.flag:8x}".replace(" ","0"))
		h_buf.write("\n")
		re.log_func(h_buf.str())
	}

	for m_state != .end {

		if pc >= 0 && pc < re.prog.len {
			ist = re.prog[pc].ist
		}else if pc >= re.prog.len {
			//C.printf("ERROR!! PC overflow!!\n")
			return ERR_INTERNAL_ERROR, i
		}

		//******************************************
		// DEBUG LOG
		//******************************************
		if re.debug>0 {
			mut buf2 := strings.new_builder(re.cc.len+128)

			// print all the instructions

			// end of the input text
			if i >= in_txt_len {
				buf2.write("# ${step_count:3d} END OF INPUT TEXT\n")
				re.log_func(buf2.str())
			}else{

				// print only the exe instruction
				if (re.debug == 1 && m_state == .ist_load) ||
					re.debug == 2
				{
					if ist == IST_PROG_END {
						buf2.write("# ${step_count:3d} PROG_END\n")
					}
					else if ist == 0 || m_state in [.start,.ist_next,.stop] {
						buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n")
					}else{
						ch, char_len = re.get_charb(in_txt,i)

						buf2.write("# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${pc:3d}=>")
						buf2.write("${ist:8x}".replace(" ","0"))
						buf2.write(" i,ch,len:[${i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${first_match:3d},${state.match_index:3d}] ")

						if ist == IST_SIMPLE_CHAR {
							buf2.write("query_ch: [${re.prog[pc].ch:1c}]")
						} else {
							if ist == IST_BSLS_CHAR {
								buf2.write("BSLS [\\${re.prog[pc].ch:1c}]")
							} else if ist == IST_PROG_END {
								buf2.write("PROG_END")
							} else if ist == IST_OR_BRANCH {
								buf2.write("OR")
							} else if ist == IST_CHAR_CLASS_POS {
								buf2.write("CHAR_CLASS_POS[${re.get_char_class(pc)}]")
							} else if ist == IST_CHAR_CLASS_NEG {
								buf2.write("CHAR_CLASS_NEG[${re.get_char_class(pc)}]")
							} else if ist == IST_DOT_CHAR {
								buf2.write("DOT_CHAR")
							} else if ist == IST_GROUP_START {
								tmp_gi :=re.prog[pc].group_id
								tmp_gr := re.prog[re.prog[pc].goto_pc].group_rep
								buf2.write("GROUP_START #:${tmp_gi} rep:${tmp_gr} ")
							} else if ist == IST_GROUP_END {
								buf2.write("GROUP_END   #:${re.prog[pc].group_id} deep:${group_index}")
							}
						}
						if re.prog[pc].rep_max == MAX_QUANTIFIER {
							buf2.write("{${re.prog[pc].rep_min},MAX}:${re.prog[pc].rep}")
						} else {
							buf2.write("{${re.prog[pc].rep_min},${re.prog[pc].rep_max}}:${re.prog[pc].rep}")
						}
						if re.prog[pc].greedy == true {
							buf2.write("?")
						}
						buf2.write(" (#${group_index})\n")
					}
					re.log_func(buf2.str())
				}
			}
			step_count++
			dbg_line++
		}
		//******************************************

		// we're out of text, manage it
		if i >= in_txt_len || m_state == .new_line {

			// manage groups
			if group_index >= 0 && state.match_index >= 0 {
				//C.printf("End text with open groups!\n")
				// close the groups
				for group_index >= 0 {
					tmp_pc := group_data[group_index]
					re.prog[tmp_pc].group_rep++
					/*
					C.printf("Closing group %d {%d,%d}:%d\n",
						group_index,
						re.prog[tmp_pc].rep_min,
						re.prog[tmp_pc].rep_max,
						re.prog[tmp_pc].group_rep
					)
					*/
					if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min && re.prog[tmp_pc].group_id >= 0{
						start_i   := group_stack[group_index]
	 					group_stack[group_index]=-1

	 					// save group results
						g_index := re.prog[tmp_pc].group_id*2
						if start_i >= 0 {
							re.groups[g_index] = start_i
						} else {
							re.groups[g_index] = 0
						}
						re.groups[g_index+1] = i

						// continuous save, save until we have space
						if re.group_csave_index > 0 {
							// check if we have space to save the record
							if (re.group_csave_index + 3) < re.group_csave.len {
								// incrment counter
								re.group_csave[0]++
								// save the record
								re.group_csave[re.group_csave_index++] = g_index >> 1          // group id
								re.group_csave[re.group_csave_index++] = re.groups[g_index]    // start
								re.group_csave[re.group_csave_index++] = re.groups[g_index+1]  // end
							}
						}

 					}

					group_index--
				}
			}

			// manage IST_DOT_CHAR

			m_state == .end
			break
			//return NO_MATCH_FOUND,0
		}

		// starting and init
		if m_state == .start {
			pc = -1
			i = 0
			m_state = .ist_next
			continue
		}

		// ist_next, next instruction reseting its state
		if m_state == .ist_next {
			pc = pc + 1
			re.prog[pc].reset()
			// check if we are in the program bounds
			if pc < 0 || pc > re.prog.len {
				//C.printf("ERROR!! PC overflow!!\n")
				return ERR_INTERNAL_ERROR, i
			}
			m_state = .ist_load
			continue
		}

		// ist_next_ks, next instruction keeping its state
		if m_state == .ist_next_ks {
			pc = pc + 1
			// check if we are in the program bounds
			if pc < 0 || pc > re.prog.len {
				//C.printf("ERROR!! PC overflow!!\n")
				return ERR_INTERNAL_ERROR, i
			}
			m_state = .ist_load
			continue
		}

		// load the char
		ch, char_len = re.get_charb(in_txt,i)

		// check new line if flag F_NL enabled
		if (re.flag & F_NL) != 0 && char_len == 1 && byte(ch) in NEW_LINE_LIST {
			m_state = .new_line
			continue
		}

		// check if stop
		if m_state == .stop {

			// we are in search mode, don't exit until the end
			if re.flag & F_SRC != 0 && ist != IST_PROG_END {
				pc = -1
				i += char_len
				m_state = .ist_next
				re.reset_src()
				state.match_index = -1
				first_match = -1
				continue
			}

			// if we are in restore state ,do it and restart
			//C.printf("re.state_stack_index %d\n",re.state_stack_index )
			if re.state_stack_index >=0 && re.state_stack[re.state_stack_index].pc >= 0 {
				i = re.state_stack[re.state_stack_index].i
				pc = re.state_stack[re.state_stack_index].pc
				state.match_index =	re.state_stack[re.state_stack_index].mi
				group_index = re.state_stack[re.state_stack_index].group_stack_index

				m_state = .ist_load
				continue
			}

			if ist == IST_PROG_END {
				return first_match,i
			}

			// exit on no match
			return result,0
		}

		// ist_load
		if m_state == .ist_load {

			// program end
			if ist == IST_PROG_END {
				// if we are in match exit well

				if group_index >= 0 && state.match_index >= 0 {
					group_index = -1
				}

				// we have a DOT MATCH on going
				//C.printf("IST_PROG_END l_ist: %08x\n", l_ist)
				if re.state_stack_index>=0 && l_ist == IST_DOT_CHAR {
					m_state = .stop
					continue
				}

				re.state_stack_index = -1
				m_state = .stop
				continue

			}

			// check GROUP start, no quantifier is checkd for this token!!
			else if ist == IST_GROUP_START {
				group_index++
				group_data[group_index] = re.prog[pc].goto_pc  // save where is IST_GROUP_END, we will use it for escape
				group_stack[group_index]=i                     // index where we start to manage
				//C.printf("group_index %d rep %d\n", group_index, re.prog[re.prog[pc].goto_pc].group_rep)

				m_state = .ist_next
				continue
			}

			// check GROUP end
			else if ist == IST_GROUP_END {
				// we are in matching streak
				if state.match_index >= 0 {
					// restore txt index stack and save the group data

					//C.printf("g.id: %d group_index: %d\n", re.prog[pc].group_id, group_index)
					if group_index >= 0 && re.prog[pc].group_id >= 0 {
	 					start_i   := group_stack[group_index]
	 					//group_stack[group_index]=-1

	 					// save group results
						g_index := re.prog[pc].group_id*2
						if start_i >= 0 {
							re.groups[g_index] = start_i
						} else {
							re.groups[g_index] = 0
						}
						re.groups[g_index+1] = i
						//C.printf("GROUP %d END [%d, %d]\n", re.prog[pc].group_id, re.groups[g_index], re.groups[g_index+1])

						// continuous save, save until we have space
						if re.group_csave_index > 0 {
							// check if we have space to save the record
							if (re.group_csave_index + 3) < re.group_csave.len {
								// incrment counter
								re.group_csave[0]++
								// save the record
								re.group_csave[re.group_csave_index++] = g_index >> 1          // group id
								re.group_csave[re.group_csave_index++] = re.groups[g_index]    // start
								re.group_csave[re.group_csave_index++] = re.groups[g_index+1]  // end
							}
						}
					}

					re.prog[pc].group_rep++ // increase repetitions
					//C.printf("GROUP %d END %d\n", group_index, re.prog[pc].group_rep)
					m_state = .ist_quant_pg
					continue

				}

				m_state = .ist_quant_ng
				continue
			}

			// check OR
			else if ist == IST_OR_BRANCH {
				if state.match_index >= 0 {
					pc = re.prog[pc].rep_max
					//C.printf("IST_OR_BRANCH True pc: %d\n", pc)
				}else{
					pc = re.prog[pc].rep_min
					//C.printf("IST_OR_BRANCH False pc: %d\n", pc)
				}
				re.prog[pc].reset()
				m_state == .ist_load
				continue
			}

			// check IST_DOT_CHAR
			else if ist == IST_DOT_CHAR {
				//C.printf("IST_DOT_CHAR rep: %d\n", re.prog[pc].rep)
				state.match_flag = true
				l_ist = u32(IST_DOT_CHAR)

				if first_match < 0 {
					first_match = i
				}
				state.match_index = i
				re.prog[pc].rep++

				//if re.prog[pc].rep >= re.prog[pc].rep_min && re.prog[pc].rep <= re.prog[pc].rep_max {
				if re.prog[pc].rep >= 0 && re.prog[pc].rep <= re.prog[pc].rep_max {
					//C.printf("DOT CHAR save state : %d\n", re.state_stack_index)
					// save the state

					// manage first dot char
					if re.state_stack_index < 0 {
						re.state_stack_index++
					}

					re.state_stack[re.state_stack_index].pc = pc
					re.state_stack[re.state_stack_index].mi = state.match_index
					re.state_stack[re.state_stack_index].group_stack_index = group_index
				} else {
					re.state_stack[re.state_stack_index].pc = -1
					re.state_stack[re.state_stack_index].mi = -1
					re.state_stack[re.state_stack_index].group_stack_index = -1
				}

				if re.prog[pc].rep >= 1 && re.state_stack_index >= 0 {
					re.state_stack[re.state_stack_index].i  = i + char_len
				}

				// manage * and {0,} quantifier
				if re.prog[pc].rep_min > 0 {
					i += char_len // next char
					l_ist = u32(IST_DOT_CHAR)
				}

				m_state = .ist_next
				continue

			}

			// char class IST
			else if ist == IST_CHAR_CLASS_POS || ist == IST_CHAR_CLASS_NEG {
				state.match_flag = false
				mut cc_neg := false

				if ist == IST_CHAR_CLASS_NEG {
					cc_neg = true
				}
				mut cc_res := re.check_char_class(pc,ch)

				if cc_neg {
					cc_res = !cc_res
				}

				if cc_res {
					state.match_flag = true
					l_ist = u32(IST_CHAR_CLASS_POS)

					if first_match < 0 {
						first_match = i
					}

					state.match_index = i

					re.prog[pc].rep++ // increase repetitions
					i += char_len // next char
					m_state = .ist_quant_p
					continue
				}
				m_state = .ist_quant_n
				continue
			}

			// check bsls
			else if ist == IST_BSLS_CHAR {
				state.match_flag = false
				tmp_res := re.prog[pc].validator(byte(ch))
				//C.printf("BSLS in_ch: %c res: %d\n", ch, tmp_res)
				if tmp_res {
					state.match_flag = true
					l_ist = u32(IST_BSLS_CHAR)

					if first_match < 0 {
						first_match = i
					}

					state.match_index = i

					re.prog[pc].rep++ // increase repetitions
					i += char_len // next char
					m_state = .ist_quant_p
					continue
				}
				m_state = .ist_quant_n
				continue
			}

			// simple char IST
			else if ist == IST_SIMPLE_CHAR {
				//C.printf("IST_SIMPLE_CHAR\n")
				state.match_flag = false

				if re.prog[pc].ch == ch
				{
					state.match_flag = true
					l_ist = IST_SIMPLE_CHAR

					if first_match < 0 {
						first_match = i
					}
					//C.printf("state.match_index: %d\n", state.match_index)
					state.match_index = i

					re.prog[pc].rep++ // increase repetitions
					i += char_len // next char
					m_state = .ist_quant_p
					continue
				}
				m_state = .ist_quant_n
				continue
			}
			/* UNREACHABLE */
			//C.printf("PANIC2!! state: %d\n", m_state)
			return ERR_INTERNAL_ERROR, i

		}

		/***********************************
		* Quantifier management
		***********************************/
		// ist_quant_ng
		if m_state == .ist_quant_ng {

			// we are finished here
			if group_index < 0 {
				//C.printf("Early stop!\n")
				result = NO_MATCH_FOUND
				m_state = .stop
				continue
			}

			tmp_pc := group_data[group_index]    // PC to the end of the group token
			rep    := re.prog[tmp_pc].group_rep  // use a temp variable
			re.prog[tmp_pc].group_rep = 0        // clear the repetitions

			//C.printf(".ist_quant_ng group_pc_end: %d rep: %d\n", tmp_pc,rep)

			if rep >= re.prog[tmp_pc].rep_min {
				//C.printf("ist_quant_ng GROUP CLOSED OK group_index: %d\n", group_index)

				i = group_stack[group_index]
				pc = tmp_pc
				group_index--
				m_state = .ist_next
				continue
			}
			else if re.prog[tmp_pc].next_is_or {
				//C.printf("ist_quant_ng OR Negative branch\n")

				i = group_stack[group_index]
				pc = re.prog[tmp_pc+1].rep_min -1
				group_index--
				m_state = .ist_next
				continue
			}
			else if rep>0 && rep < re.prog[tmp_pc].rep_min {
				//C.printf("ist_quant_ng UNDER THE MINIMUM g.i: %d\n", group_index)

				// check if we are inside a group, if yes exit from the nested groups
				if group_index > 0{
					group_index--
					pc = tmp_pc
					m_state = .ist_quant_ng //.ist_next
					continue
				}

				if group_index == 0 {
					group_index--
					pc = tmp_pc // TEST
					m_state = .ist_next
					continue
				}

				result = NO_MATCH_FOUND
				m_state = .stop
				continue
			}
			else if rep==0 && rep < re.prog[tmp_pc].rep_min {
				//C.printf("ist_quant_ng ZERO UNDER THE MINIMUM g.i: %d\n", group_index)

				if group_index > 0{
					group_index--
					pc = tmp_pc
					m_state = .ist_quant_ng //.ist_next
					continue
				}

				result = NO_MATCH_FOUND
				m_state = .stop
				continue
			}

			//C.printf("DO NOT STAY HERE!! {%d,%d}:%d\n", re.prog[tmp_pc].rep_min, re.prog[tmp_pc].rep_max, rep)
			/* UNREACHABLE */
			return ERR_INTERNAL_ERROR, i

		}
		// ist_quant_pg
		else if m_state == .ist_quant_pg {
			//C.printf(".ist_quant_pg\n")
			mut tmp_pc := pc
			if group_index >= 0 {
				tmp_pc = group_data[group_index]
			}

			rep := re.prog[tmp_pc].group_rep

			if rep < re.prog[tmp_pc].rep_min {
				//C.printf("ist_quant_pg UNDER RANGE\n")
				pc = re.prog[tmp_pc].goto_pc
				m_state = .ist_next
				continue
			}
			else if rep == re.prog[tmp_pc].rep_max {
				//C.printf("ist_quant_pg MAX RANGE\n")
				re.prog[tmp_pc].group_rep = 0 // clear the repetitions
				group_index--
				m_state = .ist_next
				continue
			}
			else if rep >= re.prog[tmp_pc].rep_min {
				//C.printf("ist_quant_pg IN RANGE group_index:%d\n", group_index)

				// check greedy flag, if true exit on minimum
				if re.prog[tmp_pc].greedy == true {
					re.prog[tmp_pc].group_rep = 0 // clear the repetitions
					group_index--
					m_state = .ist_next
					continue
				}

				pc = re.prog[tmp_pc].goto_pc - 1
				group_index--
				m_state = .ist_next
				continue
			}

			/* UNREACHABLE */
			//C.printf("PANIC3!! state: %d\n", m_state)
			return ERR_INTERNAL_ERROR, i
		}

		// ist_quant_n
		else if m_state == .ist_quant_n {
			rep := re.prog[pc].rep
			//C.printf("Here!! PC %d is_next_or: %d \n", pc, re.prog[pc].next_is_or)

			// zero quantifier * or ?
			if rep == 0 && re.prog[pc].rep_min == 0 {
				//C.printf("ist_quant_n ZERO RANGE MIN\n")
				m_state = .ist_next // go to next ist
				continue
			}
			// match + or *
			else if rep >= re.prog[pc].rep_min {
				//C.printf("ist_quant_n MATCH RANGE\n")
				m_state = .ist_next
				continue
			}

			// check the OR if present
			if re.prog[pc].next_is_or {
				//C.printf("OR present on failing\n")
				state.match_index = -1
				m_state = .ist_next
				continue
			}

			// we are in a group manage no match from here
			if group_index >= 0 {
				//C.printf("ist_quant_n FAILED insied a GROUP group_index:%d\n", group_index)
				m_state = .ist_quant_ng
				continue
			}

			// no other options
			//C.printf("ist_quant_n NO_MATCH_FOUND\n")
			result = NO_MATCH_FOUND
			m_state = .stop
			continue
			//return NO_MATCH_FOUND, 0
		}

		// ist_quant_p
		else if m_state == .ist_quant_p {
			// exit on first match
			if (re.flag & F_EFM) != 0 {
				return i,i+1
			}

			rep := re.prog[pc].rep

			// under range
			if rep > 0 && rep < re.prog[pc].rep_min {
				//C.printf("ist_quant_p UNDER RANGE\n")
				m_state = .ist_load // continue the loop
				continue
			}

			// range ok, continue loop
			else if rep >= re.prog[pc].rep_min && rep < re.prog[pc].rep_max {
				//C.printf("ist_quant_p IN RANGE\n")

				// check greedy flag, if true exit on minimum
				if re.prog[pc].greedy == true {
					m_state = .ist_next
					continue
				}
				m_state = .ist_load
				continue
			}

			// max reached
			else if rep == re.prog[pc].rep_max {
				//C.printf("ist_quant_p MAX RANGE\n")
				m_state = .ist_next
				continue
			}

		}
		/* UNREACHABLE */
		//C.printf("PANIC4!! state: %d\n", m_state)
		return ERR_INTERNAL_ERROR, i
	}

	// Check the results
	if state.match_index >= 0 {
		if group_index < 0 {
			//C.printf("OK match,natural end [%d,%d]\n", first_match, i)
			return first_match, i
		} else {
			//C.printf("Skip last group\n")
			return first_match,group_stack[group_index--]
		}
	}
	//C.printf("NO_MATCH_FOUND, natural end\n")
	return NO_MATCH_FOUND, 0
}

/******************************************************************************
*
* Public functions
*
******************************************************************************/

//
// Inits
//

// regex create a regex object from the query string
pub fn regex(in_query string) (RE,int,int){
	mut re := RE{}
	re.prog = [Token{}].repeat(in_query.len+1)
	re.cc = [CharClass{}].repeat(in_query.len+1)
	re.group_max_nested = 8

	re_err,err_pos := re.compile(in_query)
	return re, re_err, err_pos
}

// new_regex create a REgex of small size, usually sufficient for ordinary use
pub fn new_regex() RE {
	return new_regex_by_size(1)
}

// new_regex_by_size create a REgex of large size, mult specify the scale factor of the memory that will be allocated
pub fn new_regex_by_size(mult int) RE {
	mut re := RE{}
	re.prog = [Token{}].repeat(MAX_CODE_LEN*mult)       // max program length, default 256 istructions
	re.cc = [CharClass{}].repeat(MAX_CODE_LEN*mult)     // char class list
	re.group_max_nested = 3*mult                        // max nested group

	return re
}

//
// Matchers
//

pub fn (re mut RE) match_string(in_txt string) (int,int) {
	start, end := re.match_base(in_txt.str,in_txt.len)
	if start >= 0 && end > start {
		if (re.flag & F_MS) != 0 && start > 0 {
			return NO_MATCH_FOUND, 0
		}
		if (re.flag & F_ME) != 0 && end < in_txt.len {
			if in_txt[end] in NEW_LINE_LIST {
				return start, end
			}
			return NO_MATCH_FOUND, 0
		}
		return start, end
	}
	return start, end
}

//
// Finders
//

// find try to find the first match in the input string
pub fn (re mut RE) find(in_txt string) (int,int) {
	old_flag := re.flag
	re.flag |= F_SRC  // enable search mode
	start, end := re.match_base(in_txt.str, in_txt.len)
	re.flag = old_flag
	if start >= 0 && end > start {
		return start,end
	}
	return NO_MATCH_FOUND, 0
}

// find all the non overlapping occurrences of the match pattern
pub fn (re mut RE) find_all(in_txt string) []int {
	mut i := 0
	mut res := []int
	mut ls := -1
	for i < in_txt.len {
		s,e := re.find(in_txt[i..])
		if s >= 0 && e > s && i+s > ls {
			//println("find match in: ${i+s},${i+e} [${in_txt[i+s..i+e]}] ls:$ls")
			res << i+s
			res << i+e
			ls = i+s
			i = i+e
			continue
		} else {
			i++
		}

	}
	return res
}

// replace return a string where the matches are replaced with the replace string
pub fn (re mut RE) replace(in_txt string, repl string) string {
	pos := re.find_all(in_txt)
	if pos.len > 0 {
		mut res := ""
		mut i := 0

		mut s1 := 0
		mut e1 := in_txt.len

		for i < pos.len {
			e1 = pos[i]
			res += in_txt[s1..e1] + repl
			s1 = pos[i+1]
			i += 2
		}

		res += in_txt[s1..]
		return res
	}
	return in_txt
}