1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00
v/vlib/regex/regex.v

2616 lines
68 KiB
V
Raw Normal View History

/*
regex 1.0 alpha
2023-03-28 23:55:57 +03:00
Copyright (c) 2019-2023 Dario Deledda. All rights reserved.
Use of this source code is governed by an MIT license
that can be found in the LICENSE file.
This file contains regex module
Known limitations:
- find is implemented in a trivial way
- not fully PCRE compliant
- not POSIX ERE compliant
*/
2020-01-13 15:30:41 +03:00
module regex
import strings
2020-01-13 15:30:41 +03:00
pub const (
	// version string of the regex module
	v_regex_version = '1.0 alpha'

	// sizing limits
	max_code_len    = 256 // default small base code len for the regex programs
	max_quantifier  = 1073741824 // default max repetitions allowed for the quantifiers = 2^30

	// space chars (western only!) TODO: manage all the spaces from unicode
	spaces = [` `, `\t`, `\n`, `\r`, `\v`, `\f`]

	// new line chars: line feed and carriage return
	new_line_list = [`\n`, `\r`]

	// Results
	no_match_found = -1

	// Errors
	compile_ok               = 0 // the regex string compiled, all ok
	err_char_unknown         = -2 // the char used is unknown to the system
	err_undefined            = -3 // the compiler symbol is undefined
	err_internal_error       = -4 // bug in the regex system!!
	err_cc_alloc_overflow    = -5 // memory for char classes is full!!
	err_syntax_error         = -6 // syntax error while compiling the regex
	err_groups_overflow      = -7 // max number of groups reached
	err_groups_max_nested    = -8 // max number of nested groups reached
	err_group_not_balanced   = -9 // group not balanced
	err_group_qm_notation    = -10 // invalid group notation
	err_invalid_or_with_cc   = -11 // invalid OR on two consecutive char classes
	err_neg_group_quantifier = -12 // negation groups can not have a quantifier
	err_consecutive_dots     = -13 // two consecutive dots are an error
)
const (
	//*************************************
	// regex program instructions
	//*************************************
	// single char instruction, 31 bit available to char
	ist_simple_char = u32(0x7FFFFFFF)

	// char class 11 0100 AA xxxxxxxx
	// AA = 00  regular class
	// AA = 01  negated class ^ char
	ist_char_class     = u32(0xD1000000) // MASK
	ist_char_class_pos = u32(0xD0000000) // char class normal [abc]
	ist_char_class_neg = u32(0xD1000000) // char class negate [^abc]

	// dot char        10 0110 xx xxxxxxxx
	ist_dot_char = u32(0x98000000) // match any char except \n

	// backslash chars 10 0100 xx xxxxxxxx
	ist_bsls_char = u32(0x90000000) // backslash char

	// OR |            10 010Y xx xxxxxxxx
	ist_or_branch = u32(0x91000000) // OR case

	// groups          10 010Y xx xxxxxxxx
	ist_group_start = u32(0x92000000) // group start (
	ist_group_end   = u32(0x94000000) // group end   )

	// control instructions
	ist_prog_end = u32(0x88000000) // 10 0010 xx xxxxxxxx
	//*************************************
)
/*
General Utilities
*/
2020-01-13 15:30:41 +03:00
// utf8util_char_len returns the length in bytes (1 to 4) of the utf8 char whose first byte is `b`.
// The constant 0xe5000000 works as a packed table of 2-bit `length - 1` entries;
// the top bits of `b` are turned into an even shift amount that selects the entry.
[inline]
fn utf8util_char_len(b u8) int {
	table_shift := (b >> 3) & 0x1e
	return ((0xe5000000 >> table_shift) & 3) + 1
}
// get_char reads the char that starts at byte position `i` of `in_txt`.
// It returns the char value and the number of bytes it occupies.
// A byte with the high bit clear, or any byte when the f_bin flag is set, is returned as is
// with length 1. Otherwise the bytes of the utf8 sequence are packed, first byte in the
// highest position, into the returned u32 (it is not the decoded code point).
// No bounds check is done on `i`.
[direct_array_access; inline]
fn (re RE) get_char(in_txt string, i int) (u32, int) {
	first := unsafe { in_txt.str[i] }
	binary_mode := (re.flag & regex.f_bin) != 0
	// ascii 8 bit
	if binary_mode || first & 0x80 == 0 {
		return u32(first), 1
	}
	// utf8 char: pack all its bytes
	n_bytes := utf8util_char_len(first)
	mut packed := u32(0)
	for k in 0 .. n_bytes {
		packed = (packed << 8) | unsafe { in_txt.str[i + k] }
	}
	return packed, n_bytes
}
// get_charb is the raw pointer variant of get_char: it reads the char that starts at
// byte position `i` of the buffer `in_txt`.
// It returns the char value and the number of bytes it occupies.
// A byte with the high bit clear, or any byte when the f_bin flag is set, is returned as is
// with length 1. Otherwise the bytes of the utf8 sequence are packed, first byte in the
// highest position, into the returned u32 (it is not the decoded code point).
// No bounds check is done on `i`.
[direct_array_access; inline]
fn (re RE) get_charb(in_txt &u8, i int) (u32, int) {
	first := unsafe { in_txt[i] }
	binary_mode := (re.flag & regex.f_bin) != 0
	// ascii 8 bit
	if binary_mode || first & 0x80 == 0 {
		return u32(first), 1
	}
	// utf8 char: pack all its bytes
	n_bytes := utf8util_char_len(first)
	mut packed := u32(0)
	for k in 0 .. n_bytes {
		packed = (packed << 8) | unsafe { in_txt[i + k] }
	}
	return packed, n_bytes
}
// is_alnum returns true if `in_char` is an ascii letter, an ascii digit or the underscore
[inline]
fn is_alnum(in_char u8) bool {
	return match in_char {
		`A`...`Z`, `a`...`z`, `0`...`9`, `_` { true }
		else { false }
	}
}
// is_not_alnum is the negation of is_alnum
[inline]
fn is_not_alnum(in_char u8) bool {
	return is_alnum(in_char) == false
}
// is_space returns true if `in_char` is one of the chars listed in `spaces`
[inline]
fn is_space(in_char u8) bool {
	listed := in_char in regex.spaces
	return listed
}
// is_not_space is the negation of is_space
[inline]
fn is_not_space(in_char u8) bool {
	return is_space(in_char) == false
}
// is_digit returns true if `in_char` is an ascii decimal digit
[inline]
fn is_digit(in_char u8) bool {
	return in_char >= `0` && in_char <= `9`
}
// is_not_digit is the negation of is_digit
[inline]
fn is_not_digit(in_char u8) bool {
	return is_digit(in_char) == false
}
/*
2020-01-13 15:30:41 +03:00
[inline]
fn is_wordchar(in_char byte) bool {
return is_alnum(in_char) || in_char == `_`
}
[inline]
fn is_not_wordchar(in_char byte) bool {
return !is_alnum(in_char)
}
*/
2020-01-13 15:30:41 +03:00
// is_lower returns true if `in_char` is an ascii lowercase letter
[inline]
fn is_lower(in_char u8) bool {
	return in_char >= `a` && in_char <= `z`
}
// is_upper returns true if `in_char` is an ascii uppercase letter
[inline]
fn is_upper(in_char u8) bool {
	return in_char >= `A` && in_char <= `Z`
}
// get_parse_error_string returns the symbolic name of the result/error code `err`,
// or 'err_unknown' when the code is not one of the known ones
pub fn (re RE) get_parse_error_string(err int) string {
	return match err {
		regex.compile_ok { 'compile_ok' }
		regex.no_match_found { 'no_match_found' }
		regex.err_char_unknown { 'err_char_unknown' }
		regex.err_undefined { 'err_undefined' }
		regex.err_internal_error { 'err_internal_error' }
		regex.err_cc_alloc_overflow { 'err_cc_alloc_overflow' }
		regex.err_syntax_error { 'err_syntax_error' }
		regex.err_groups_overflow { 'err_groups_overflow' }
		regex.err_groups_max_nested { 'err_groups_max_nested' }
		regex.err_group_not_balanced { 'err_group_not_balanced' }
		regex.err_group_qm_notation { 'err_group_qm_notation' }
		regex.err_invalid_or_with_cc { 'err_invalid_or_with_cc' }
		regex.err_neg_group_quantifier { 'err_neg_group_quantifier' }
		regex.err_consecutive_dots { 'err_consecutive_dots' }
		else { 'err_unknown' }
	}
}
// utf8_str converts a packed utf8 sequence to a printable string.
// The 4 bytes of `ch` are emitted from the most significant to the least significant,
// bytes equal to zero are skipped.
[inline]
fn utf8_str(ch rune) string {
	mut out := ''
	for shift := 24; shift >= 0; shift -= 8 {
		b := u8((ch >> shift) & 0xFF)
		if b == 0 {
			continue
		}
		out += '${b:1c}'
	}
	return out
}
2020-01-13 15:30:41 +03:00
// simple_log is the default log function: it prints `txt` as is, without adding a new line
fn simple_log(txt string) {
	print(txt)
}
/******************************************************************************
*
* Token Structs
*
******************************************************************************/
2022-04-15 18:25:45 +03:00
// FnValidator is the prototype of the functions that test a single byte and return true if it is accepted
pub type FnValidator = fn (u8) bool
// Token is a single instruction of the compiled regex program
struct Token {
mut:
	ist rune // instruction code, one of the ist_* values
	// char
	ch     rune // char of the token if any
	ch_len u8   // char len
	// Quantifiers / branch
	rep_min int  // used also for jump next in the OR branch [no match] pc jump
	rep_max int  // used also for jump next in the OR branch [   match] pc jump
	greedy  bool // greedy quantifier flag
	// Char class
	cc_index int = -1
	// counters for quantifier check (repetitions)
	rep int
	// validator function pointer
	validator FnValidator
	// groups variables
	group_neg bool // negation flag for the group, false => no negation, true => negation
	group_rep int  // repetition of the group
	group_id  int = -1 // id of the group
	goto_pc   int = -1 // jump to this PC if it is needed
	// OR flag for the token
	next_is_or bool // true if the next token is an OR
	// dot_char token variables
	dot_check_pc  int = -1 // pc of the next token to check for dots
	bsls_check_pc int = -1 // pc of the next token to check for bsls
	cc_check_pc   int = -1 // pc of the next token to check for CC
	last_dot_flag bool // if true indicates that this is the last dot_char in the regex
	// debug fields
	source_index int
}
2020-01-31 04:29:54 +03:00
// reset clears the repetition counter of the token
[inline]
fn (mut tok Token) reset() {
	tok.rep = 0
}
/******************************************************************************
*
* Regex struct
*
******************************************************************************/
2020-01-13 15:30:41 +03:00
pub const (
	// matching flags
	f_nl  = 0x00000001 // end the match when a new line symbol is found
	f_ms  = 0x00000002 // match true only if the match is at the start of the string
	f_me  = 0x00000004 // match true only if the match is at the end of the string

	f_efm = 0x00000100 // exit on first token matched, used by search
	f_bin = 0x00000200 // work only on bytes, ignore utf-8

	// behaviour modifier flags
	f_src = 0x00020000 // search mode enabled
)
// FnLog is the prototype of the log functions: they receive the text to log as a string
pub type FnLog = fn (string)
2020-04-25 23:42:48 +03:00
// RE holds a compiled regex program together with the state used while running it
pub struct RE {
pub mut:
	prog     []Token
	prog_len int // regex program len
	// char classes storage
	cc       []CharClass // char class list
	cc_index int // index
	// groups
	group_count      int   // number of groups in this regex struct
	groups           []int // groups index results
	group_max_nested int = 3 // max nested group
	group_max        int = 8 // max allowed number of different groups

	state_list []StateObj

	group_csave_flag bool  // flag to enable continuous saving
	group_csave      []int //= []int{}  // groups continuous save list

	group_map map[string]int // groups names map

	group_stack []int
	group_data  []int
	// flags
	flag int // flag for optional parameters
	// Debug/log
	debug    int    // enable in order to have the unroll of the code 0 = NO_DEBUG, 1 = LIGHT 2 = VERBOSE
	log_func FnLog = simple_log // log function, can be customized by the user
	query    string // query string
}
2020-01-31 04:29:54 +03:00
// reset brings the RE object back to its initial state: repetition counters of every
// token, groups results, continuous save list, state list and group stack are cleared
[direct_array_access; inline]
pub fn (mut re RE) reset() {
	re.cc_index = 0

	// clear the repetition counters of groups and tokens
	for pc in 0 .. re.prog_len {
		re.prog[pc].group_rep = 0
		re.prog[pc].rep = 0
	}

	// init groups array
	if re.group_count > 0 {
		if re.groups.len == 0 {
			// first run: allocate two slots for each group
			re.groups = []int{len: re.group_count * 2, init: -1}
		} else {
			// subsequent executions: only clean up the memory
			for k in 0 .. re.groups.len {
				re.groups[k] = -1
			}
		}
	}

	// reset group_csave
	if re.group_csave_flag {
		re.group_csave.clear()
	}

	// reset state list
	re.state_list.clear()
	re.group_stack.clear()
}
2020-01-31 04:29:54 +03:00
// reset_src is the light reset used when search mode fails:
// only the repetition counters of groups and tokens are cleared.
// gcc bug, do not use [inline] here or it runs 5 times slower
//[inline]
[direct_array_access]
fn (mut re RE) reset_src() {
	for pc in 0 .. re.prog_len {
		re.prog[pc].group_rep = 0 // clear repetition of the group
		re.prog[pc].rep = 0 // clear repetition of the token
	}
}
/******************************************************************************
*
* Backslashes chars
*
******************************************************************************/
2020-01-13 15:30:41 +03:00
// BslsStruct binds a backslash meta char to the function that validates it
struct BslsStruct {
	ch        rune        // meta char
	validator FnValidator // validator function pointer
}
const (
	// backslash meta chars and their validators, e.g. \w is checked with is_alnum
	bsls_validator_array = [
		BslsStruct{`w`, is_alnum},
		BslsStruct{`W`, is_not_alnum},
		BslsStruct{`s`, is_space},
		BslsStruct{`S`, is_not_space},
		BslsStruct{`d`, is_digit},
		BslsStruct{`D`, is_not_digit},
		BslsStruct{`a`, is_lower},
		BslsStruct{`A`, is_upper},
	]

	// these chars are escaped if preceded by a \
	bsls_escape_list = [`\\`, `|`, `.`, `:`, `*`, `+`, `-`, `{`, `}`, `[`, `]`, `(`, `)`, `?`,
		`^`, `!`]
)
// BSLS_parse_state lists the states of the backslash sequence parser
enum BSLS_parse_state {
	start // nothing parsed yet
	bsls_found // the \ was read
	bsls_char
	normal_char // the char after the \ is not a meta char
}
// parse_bsls parses the backslash sequence that starts at `in_i` and returns (index, str_len):
// - index >= 0: index in bsls_validator_array of the meta char found (\w, \d, ...)
// - index == no_match_found: the char after the \ is a plain escaped char of bsls_escape_list
// - index == err_syntax_error: not a valid backslash sequence
// str_len is always the number of bytes examined counted from `in_i`.
fn (re RE) parse_bsls(in_txt string, in_i int) (int, int) {
	mut status := BSLS_parse_state.start
	mut i := in_i

	for i < in_txt.len {
		// get our char
		char_tmp, char_len := re.get_char(in_txt, i)
		ch := u8(char_tmp)

		if status == .start && ch == `\\` {
			status = .bsls_found
			i += char_len
			continue
		}

		// check if is our bsls char, for now only one length sequence
		if status == .bsls_found {
			for c, x in regex.bsls_validator_array {
				if x.ch == ch {
					return c, i - in_i + 1
				}
			}
			// not a meta char: the same char is examined again as an escaped char
			status = .normal_char
			continue
		}

		// no BSLS validator, manage as normal escape char
		if status == .normal_char {
			if ch in regex.bsls_escape_list {
				return regex.no_match_found, i - in_i + 1
			}
			return regex.err_syntax_error, i - in_i + 1
		}

		// at the present time we manage only one char after the \
		break
	}
	// not our bsls return KO.
	// The length is relative to in_i like in every other return: the absolute index
	// was returned here before, giving a wrong length when in_i > 0
	return regex.err_syntax_error, i - in_i
}
/******************************************************************************
*
* Char class
*
******************************************************************************/
const (
	// types of the elements stored in the char class list
	cc_null = 0 // empty cc token
	cc_char = 1 // simple char: a
	cc_int  = 2 // char interval: a-z
	cc_bsls = 3 // backslash char
	cc_end  = 4 // cc sequence terminator
)
// CharClass is one element of a char class: a char, an interval, a backslash char or the terminator
struct CharClass {
mut:
	cc_type   int         // type of cc token, one of the cc_* values
	ch0       rune        // first char of the interval a-b, a in this case
	ch1       rune        // second char of the interval a-b, b in this case
	validator FnValidator // validator function pointer
}
// CharClass_parse_state lists the states of the char class parser
enum CharClass_parse_state {
	start // nothing stored yet
	in_char // at least one element was stored
	in_bsls // a \ was read
	separator // the - of an interval was read
	finish
}
// cc_push_packed appends to `out` the non zero bytes of `packed`,
// from the most significant to the least significant
fn cc_push_packed(mut out []u8, packed rune) {
	for shift := 24; shift >= 0; shift -= 8 {
		b := u8((packed >> shift) & 0xFF)
		if b != 0 {
			out << b
		}
	}
}

// get_char_class returns the textual form of the char class used by the token at `pc`,
// without the enclosing brackets: backslash chars as `\x`, single chars as they are,
// intervals as `a-b`.
// The text is collected in a growable array: the fixed buffer used before was too small
// (one byte per re.cc element, while an element can need up to 9 bytes) and was written
// through the address of the array variable instead of the address of its data.
fn (re RE) get_char_class(pc int) string {
	mut out := []u8{cap: re.cc.len * 4}
	mut cc_i := re.prog[pc].cc_index
	for cc_i >= 0 && cc_i < re.cc.len && re.cc[cc_i].cc_type != regex.cc_end {
		item := re.cc[cc_i]
		if item.cc_type == regex.cc_bsls {
			out << u8(`\\`)
			out << u8(item.ch0)
		} else if item.ch0 == item.ch1 {
			// single char
			cc_push_packed(mut out, item.ch0)
		} else {
			// interval
			cc_push_packed(mut out, item.ch0)
			out << u8(`-`)
			cc_push_packed(mut out, item.ch1)
		}
		cc_i++
	}
	return out.bytestr()
}
2020-08-27 07:46:18 +03:00
// check_char_class returns true if `ch` is accepted by one of the elements of the char class
// used by the token at `pc`. Backslash elements test the low byte of `ch` with their
// validator, all the other elements test the ch0..ch1 interval (both ends included).
fn (re RE) check_char_class(pc int, ch rune) bool {
	mut idx := re.prog[pc].cc_index
	for idx >= 0 && idx < re.cc.len {
		item := re.cc[idx]
		if item.cc_type == regex.cc_end {
			break
		}
		accepted := if item.cc_type == regex.cc_bsls {
			item.validator(u8(ch))
		} else {
			ch >= item.ch0 && ch <= item.ch1
		}
		if accepted {
			return true
		}
		idx++
	}
	return false
}
// parse_char_class parses a char class like [abcm-p]; `in_i` is the index of the first char after the [
// It returns (index, str_len, cc_type):
// - index: index in re.cc of the first element of the class, or a negative error code
// - str_len: number of bytes of the class, both brackets included (0 on error)
// - cc_type: ist_char_class_pos or ist_char_class_neg (0 on error)
// The elements are written in re.cc starting from re.cc_index, which is moved after the
// terminator only when the class is complete.
fn (mut re RE) parse_char_class(in_txt string, in_i int) (int, int, rune) {
	mut status := CharClass_parse_state.start
	mut i := in_i

	mut slot := re.cc_index // next element of re.cc to write
	first_slot := re.cc_index

	mut class_ist := u32(regex.ist_char_class_pos)

	for i < in_txt.len {
		// check if we are out of memory for char classes
		if slot >= re.cc.len {
			return regex.err_cc_alloc_overflow, 0, u32(0)
		}

		// get our char
		char_tmp, char_len := re.get_char(in_txt, i)
		ch := u8(char_tmp)

		// negation
		if status == .start && ch == `^` {
			class_ist = u32(regex.ist_char_class_neg)
			i += char_len
			continue
		}

		// a minus before any other element is a plain char
		if status == .start && ch == `-` {
			re.cc[slot].cc_type = regex.cc_char
			re.cc[slot].ch0 = char_tmp
			re.cc[slot].ch1 = char_tmp
			i += char_len
			slot++
			continue
		}

		// bsls
		if (status == .start || status == .in_char) && ch == `\\` {
			status = .in_bsls
			i += char_len
			continue
		}

		// char after the bsls: a meta char if it is a known one, a simple char otherwise
		if status == .in_bsls {
			mut found := -1
			for k, entry in regex.bsls_validator_array {
				if entry.ch == ch {
					found = k
					break
				}
			}
			if found >= 0 {
				re.cc[slot].cc_type = regex.cc_bsls
				re.cc[slot].ch0 = regex.bsls_validator_array[found].ch
				re.cc[slot].ch1 = regex.bsls_validator_array[found].ch
				re.cc[slot].validator = regex.bsls_validator_array[found].validator
			} else {
				re.cc[slot].cc_type = regex.cc_char
				re.cc[slot].ch0 = char_tmp
				re.cc[slot].ch1 = char_tmp
			}
			i += char_len
			slot++
			status = .in_char
			continue
		}

		// simple char
		if (status == .start || status == .in_char) && ch != `-` && ch != `]` {
			status = .in_char
			re.cc[slot].cc_type = regex.cc_char
			re.cc[slot].ch0 = char_tmp
			re.cc[slot].ch1 = char_tmp
			i += char_len
			slot++
			continue
		}

		// check range separator
		if status == .in_char && ch == `-` {
			status = .separator
			i += char_len
			continue
		}

		// check range end: the previous element becomes an interval
		if status == .separator && ch != `]` && ch != `-` {
			status = .in_char
			re.cc[slot - 1].cc_type = regex.cc_int
			re.cc[slot - 1].ch1 = char_tmp
			i += char_len
			continue
		}

		// char class end: write the terminator
		if status == .in_char && ch == `]` {
			re.cc[slot].cc_type = regex.cc_end
			re.cc[slot].ch0 = 0
			re.cc[slot].ch1 = 0
			re.cc_index = slot + 1
			return first_slot, i - in_i + 2, class_ist
		}

		// no rule for this char in this state: skip one byte
		i++
	}
	return regex.err_syntax_error, 0, u32(0)
}
/******************************************************************************
*
* Re Compiler
*
******************************************************************************/
2020-01-13 15:30:41 +03:00
//
// Quantifier
//
// Quant_parse_state lists the states of the {min,max} quantifier parser
enum Quant_parse_state {
	start // nothing parsed yet
	min_parse // reading the digits of min
	comma_checked // the comma was read
	max_parse // reading the digits of max
	greedy // the closing } was read
	gredy_parse // examining the char after the }
	finish
}
// parse_quantifier parses a {min,max}? quantifier; `in_i` is the index of the first char after the {
// It returns (min, max, str_len, greedy_flag):
// - {n}   => min = max = n
// - {n,}  => min = n, max = max_quantifier
// - {,m}  => min = 0, max = m
// - {n,m} => min = n, max = m
// str_len is the number of bytes of the quantifier, from the { to the } (or to the ? if present);
// greedy_flag is true when the } is followed by a ?
// On error it returns (err_syntax_error, index of the error, 0, false).
// A min or max bigger than max_quantifier is a syntax error.
fn (re RE) parse_quantifier(in_txt string, in_i int) (int, int, int, bool) {
	mut status := Quant_parse_state.start
	mut i := in_i

	mut q_min := 0 // default min in a {} quantifier is 0
	mut q_max := 0 // set to min for {n} and to max_quantifier for {n,}

	mut ch := u8(0)

	for i < in_txt.len {
		unsafe {
			ch = in_txt.str[i]
		}

		// check the greedy flag.
		// The char after the } is not part of the quantifier: it can be any char, also a
		// multi byte one, so it must be examined before the single byte check below
		if status == .gredy_parse {
			if ch == `?` {
				return q_min, q_max, i - in_i + 2, true
			}
			i--
			return q_min, q_max, i - in_i + 2, false
		}

		// exit on no compatible char with {} quantifier
		if utf8util_char_len(ch) != 1 {
			return regex.err_syntax_error, i, 0, false
		}

		// min parsing skip if comma present
		if status == .start && ch == `,` {
			q_min = 0 // default min in a {} quantifier is 0
			status = .comma_checked
			i++
			continue
		}

		// parse the min
		if (status == .start || status == .min_parse) && is_digit(ch) {
			status = .min_parse
			digit := int(ch - `0`)
			// refuse the digit if the value would exceed max_quantifier, this also avoids the int overflow
			if q_min > (regex.max_quantifier - digit) / 10 {
				return regex.err_syntax_error, i, 0, false
			}
			q_min = q_min * 10 + digit
			i++
			continue
		}

		// we have parsed the min, now check the max
		if status == .min_parse && ch == `,` {
			status = .comma_checked
			i++
			continue
		}

		// single value {4}
		if status == .min_parse && ch == `}` {
			q_max = q_min
			status = .greedy
			continue
		}

		// end without max
		if status == .comma_checked && ch == `}` {
			q_max = regex.max_quantifier
			status = .greedy
			continue
		}

		// parse the max
		if (status == .comma_checked || status == .max_parse) && is_digit(ch) {
			status = .max_parse
			digit := int(ch - `0`)
			// refuse the digit if the value would exceed max_quantifier, this also avoids the int overflow
			if q_max > (regex.max_quantifier - digit) / 10 {
				return regex.err_syntax_error, i, 0, false
			}
			q_max = q_max * 10 + digit
			i++
			continue
		}

		// finished the quantifier
		if status == .max_parse && ch == `}` {
			status = .greedy
			continue
		}

		// check if greedy flag char ? is present: here i is still the index of the }
		if status == .greedy {
			if i + 1 < in_txt.len {
				i++
				status = .gredy_parse
				continue
			}
			return q_min, q_max, i - in_i + 2, false
		}

		// not a {} quantifier, exit
		return regex.err_syntax_error, i, 0, false
	}

	// not a conform {} quantifier
	return regex.err_syntax_error, i, 0, false
}
//
// Groups
//
// Group_parse_state lists the states of the group prefix parser
enum Group_parse_state {
	start
	q_mark // (?
	q_mark1 // (?:|P  checking
	p_status // (?P
	p_start // (?P<
	p_end // (?P<...>
	p_in_name // (?P<...
	finish
}
// parse_groups parses the start of a group looking for the ? (question mark) syntax.
// `in_i` is the index of the ( char.
// It returns (error, capture_flag, negate_flag, name_of_the_group, next_index):
// - (            => normal capturing group
// - (?:          => non capturing group, capture_flag is false
// - (?!          => negation group, capture_flag is false and negate_flag is true
// - (?P<name>    => named capturing group
// error is 0 when all is ok and -2 on an invalid notation;
// next_index is the index where the parsing of the query must go on.
fn (re RE) parse_groups(in_txt string, in_i int) (int, bool, bool, string, int) {
	mut status := Group_parse_state.start
	mut i := in_i
	mut name := ''

	for i < in_txt.len && status != .finish {
		// get our char
		char_tmp, char_len := re.get_char(in_txt, i)
		ch := u8(char_tmp)

		match status {
			.start {
				if ch == `(` {
					status = .q_mark
					i += char_len
					continue
				}
			}
			.q_mark {
				// check for question marks
				if ch == `?` {
					status = .q_mark1
					i += char_len
					continue
				}
			}
			.q_mark1 {
				// negate group
				if ch == `!` {
					i += char_len
					return 0, false, true, name, i
				}
				// non capturing group
				if ch == `:` {
					i += char_len
					return 0, false, false, name, i
				}
				// enter in P section
				if ch == `P` {
					status = .p_status
					i += char_len
					continue
				}
				// not a valid q mark found
				return -2, true, false, name, i
			}
			.p_status {
				if ch == `<` {
					status = .p_start
					i += char_len
					continue
				}
			}
			.p_start {
				// first char of the name
				if ch != `>` {
					status = .p_in_name
					name += '${ch:1c}' // TODO: manage utf8 chars
					i += char_len
					continue
				}
			}
			.p_in_name {
				// collect name
				if ch != `>` && is_alnum(ch) {
					name += '${ch:1c}' // TODO: manage utf8 chars
					i += char_len
					continue
				}
				// end name
				if ch == `>` {
					i += char_len
					return 0, true, false, name, i
				}
				// error on name group
				return -2, true, false, name, i
			}
			else {}
		}

		// normal group, nothing to do, exit
		return 0, true, false, name, i
	}
	// reached only when the query ends before the group prefix is complete
	return -2, true, false, name, i
}
2021-09-05 04:48:59 +03:00
const (
	// chars that start a quantifier, used by the compiler to refuse a quantifier
	// directly followed by another quantifier
	quntifier_chars = [rune(`+`), `*`, `?`, `{`]
)
2020-01-13 15:30:41 +03:00
//
// main compiler
//
// compile return (return code, index) where index is the index of the error in the query string if return code is an error code
fn (mut re RE) impl_compile(in_txt string) (int, int) {
mut i := 0 // input string index
mut pc := 0 // program counter
2020-01-13 15:30:41 +03:00
// group management variables
mut group_count := -1
mut group_stack := []int{len: re.group_max_nested, init: 0}
mut group_stack_txt_index := []int{len: re.group_max_nested, init: -1}
mut group_stack_index := -1
2020-01-13 15:30:41 +03:00
re.query = in_txt // save the query string
2020-01-13 15:30:41 +03:00
i = 0
for i < in_txt.len {
mut char_tmp := u32(0)
mut char_len := 0
// println("i: ${i:3d} ch: ${in_txt.str[i]:c}")
2020-01-13 15:30:41 +03:00
char_tmp, char_len = re.get_char(in_txt, i)
2020-01-13 15:30:41 +03:00
//
// check special cases: $ ^
//
2022-04-15 14:58:56 +03:00
if char_len == 1 && i == 0 && u8(char_tmp) == `^` {
re.flag = regex.f_ms
2020-01-13 15:30:41 +03:00
i = i + char_len
continue
}
2022-04-15 14:58:56 +03:00
if char_len == 1 && i == (in_txt.len - 1) && u8(char_tmp) == `$` {
re.flag = regex.f_me
2020-01-13 15:30:41 +03:00
i = i + char_len
continue
}
// ist_group_start
2022-04-15 14:58:56 +03:00
if char_len == 1 && pc >= 0 && u8(char_tmp) == `(` {
// check max groups allowed
2020-01-13 15:30:41 +03:00
if group_count > re.group_max {
return regex.err_groups_overflow, i + 1
2020-01-13 15:30:41 +03:00
}
group_stack_index++
// check max nested groups allowed
if group_stack_index > re.group_max_nested {
return regex.err_groups_max_nested, i + 1
2020-01-13 15:30:41 +03:00
}
tmp_res, cgroup_flag, negate_flag, cgroup_name, next_i := re.parse_groups(in_txt,
i)
2020-04-23 02:16:16 +03:00
// manage question mark format error
if tmp_res < -1 {
return regex.err_group_qm_notation, next_i
}
// println("Parse group: [$tmp_res, $cgroup_flag, ($i,$next_i), '${in_txt[i..next_i]}' ]")
i = next_i
if cgroup_flag == true {
group_count++
}
// calculate the group id
// if it is a named group, recycle the group id
// NOTE: **** the group index is +1 because map return 0 when not found!! ****
mut group_id := group_count
if cgroup_name.len > 0 {
// println("GROUP NAME: ${cgroup_name}")
if cgroup_name in re.group_map {
group_id = re.group_map[cgroup_name] - 1
group_count--
} else {
re.group_map[cgroup_name] = group_id + 1
}
}
2020-01-13 15:30:41 +03:00
group_stack_txt_index[group_stack_index] = i
group_stack[group_stack_index] = pc
re.prog[pc].ist = u32(0) | regex.ist_group_start
2020-01-13 15:30:41 +03:00
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
2020-04-23 02:16:16 +03:00
// manage negation groups
if negate_flag == true {
re.prog[pc].group_neg = true
re.prog[pc].rep_min = 0 // may be not catched, but it is ok
}
// set the group id
if cgroup_flag == false {
// println("NO CAPTURE GROUP")
2020-04-23 02:16:16 +03:00
re.prog[pc].group_id = -1
} else {
re.prog[pc].group_id = group_id
}
2020-01-13 15:30:41 +03:00
pc = pc + 1
continue
}
// ist_group_end
2022-04-15 14:58:56 +03:00
if char_len == 1 && pc > 0 && u8(char_tmp) == `)` {
2020-01-13 15:30:41 +03:00
if group_stack_index < 0 {
return regex.err_group_not_balanced, i + 1
2020-01-13 15:30:41 +03:00
}
goto_pc := group_stack[group_stack_index]
group_stack_index--
re.prog[pc].ist = u32(0) | regex.ist_group_end
2020-01-13 15:30:41 +03:00
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
re.prog[pc].goto_pc = goto_pc // PC where to jump if a group need
re.prog[pc].group_id = re.prog[goto_pc].group_id // id of this group, used for storing data
2020-04-23 02:16:16 +03:00
re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc
// re.prog[goto_pc].group_id = group_count // id of this group, used for storing data
2020-01-13 15:30:41 +03:00
// duplicate the negation group info and settings
if re.prog[goto_pc].group_neg == true {
re.prog[pc].group_neg = re.prog[goto_pc].group_neg
re.prog[pc].rep_min = re.prog[goto_pc].rep_min
}
2020-01-13 15:30:41 +03:00
pc = pc + 1
i = i + char_len
continue
}
// ist_dot_char match any char except the following token
2022-04-15 14:58:56 +03:00
if char_len == 1 && pc >= 0 && u8(char_tmp) == `.` {
// consecutive ist_dot_char is a syntax error
if pc > 0 && re.prog[pc - 1].ist == regex.ist_dot_char {
return regex.err_consecutive_dots, i
}
re.prog[pc].ist = u32(0) | regex.ist_dot_char
2020-01-13 15:30:41 +03:00
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
pc = pc + 1
i = i + char_len
continue
}
// OR branch
2022-04-15 14:58:56 +03:00
if char_len == 1 && pc > 0 && u8(char_tmp) == `|` {
if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
return regex.err_syntax_error, i
2020-01-13 15:30:41 +03:00
}
re.prog[pc].ist = u32(0) | regex.ist_or_branch
re.prog[pc].source_index = i
2020-01-13 15:30:41 +03:00
pc = pc + 1
i = i + char_len
continue
}
// Quantifiers
if char_len == 1 && pc > 0 {
2021-09-05 04:48:59 +03:00
mut char_next := rune(0)
mut char_next_len := 0
if (char_len + i) < in_txt.len {
char_next, char_next_len = re.get_char(in_txt, i + char_len)
}
2020-01-13 15:30:41 +03:00
mut quant_flag := true
// negation groups can not have quantifiers
if re.prog[pc - 1].group_neg == true && char_tmp in [`?`, `+`, `*`, `{`] {
return regex.err_neg_group_quantifier, i
}
2022-04-15 14:58:56 +03:00
match u8(char_tmp) {
2020-01-13 15:30:41 +03:00
`?` {
// println("q: ${char_tmp:c}")
2021-09-05 04:48:59 +03:00
// check illegal quantifier sequences
if char_next_len == 1 && char_next in regex.quntifier_chars {
return regex.err_syntax_error, i
}
re.prog[pc - 1].rep_min = 0
re.prog[pc - 1].rep_max = 1
2020-01-13 15:30:41 +03:00
}
`+` {
// println("q: ${char_tmp:c}")
2021-09-05 04:48:59 +03:00
// check illegal quantifier sequences
if char_next_len == 1 && char_next in regex.quntifier_chars {
return regex.err_syntax_error, i
}
re.prog[pc - 1].rep_min = 1
re.prog[pc - 1].rep_max = regex.max_quantifier
2020-01-13 15:30:41 +03:00
}
`*` {
// println("q: ${char_tmp:c}")
2021-09-05 04:48:59 +03:00
// check illegal quantifier sequences
if char_next_len == 1 && char_next in regex.quntifier_chars {
return regex.err_syntax_error, i
}
re.prog[pc - 1].rep_min = 0
re.prog[pc - 1].rep_max = regex.max_quantifier
2020-01-13 15:30:41 +03:00
}
`{` {
min, max, tmp, greedy := re.parse_quantifier(in_txt, i + 1)
2020-01-13 15:30:41 +03:00
// it is a quantifier
if min >= 0 {
// println("{$min,$max}\n str:[${in_txt[i..i+tmp]}] greedy:$greedy")
2020-01-13 15:30:41 +03:00
i = i + tmp
re.prog[pc - 1].rep_min = min
re.prog[pc - 1].rep_max = max
re.prog[pc - 1].greedy = greedy
2021-09-05 04:48:59 +03:00
// check illegal quantifier sequences
if i <= in_txt.len {
char_next, char_next_len = re.get_char(in_txt, i)
if char_next_len == 1 && char_next in regex.quntifier_chars {
return regex.err_syntax_error, i
}
}
2020-01-13 15:30:41 +03:00
continue
} else {
return min, i
2020-01-13 15:30:41 +03:00
}
2021-09-05 04:48:59 +03:00
2020-01-13 15:30:41 +03:00
// TODO: decide if the open bracket can be conform without the close bracket
/*
// no conform, parse as normal char
else {
quant_flag = false
}
*/
}
else {
2020-01-13 15:30:41 +03:00
quant_flag = false
}
}
if quant_flag {
i = i + char_len
continue
}
}
2020-01-16 02:39:33 +03:00
// IST_CHAR_CLASS_*
if char_len == 1 && pc >= 0 {
2022-04-15 14:58:56 +03:00
if u8(char_tmp) == `[` {
cc_index, tmp, cc_type := re.parse_char_class(in_txt, i + 1)
2020-01-13 15:30:41 +03:00
if cc_index >= 0 {
// println("index: $cc_index str:${in_txt[i..i+tmp]}")
2020-01-13 15:30:41 +03:00
i = i + tmp
re.prog[pc].ist = u32(0) | cc_type
2020-01-13 15:30:41 +03:00
re.prog[pc].cc_index = cc_index
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
2020-01-13 15:30:41 +03:00
pc = pc + 1
continue
}
// cc_class vector memory full
else if cc_index < 0 {
return cc_index, i
}
}
}
2020-04-23 02:16:16 +03:00
// ist_bsls_char
if char_len == 1 && pc >= 0 {
2022-04-15 14:58:56 +03:00
if u8(char_tmp) == `\\` {
bsls_index, tmp := re.parse_bsls(in_txt, i)
// println("index: $bsls_index str:${in_txt[i..i+tmp]}")
2020-01-13 15:30:41 +03:00
if bsls_index >= 0 {
i = i + tmp
re.prog[pc].ist = u32(0) | regex.ist_bsls_char
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
re.prog[pc].validator = regex.bsls_validator_array[bsls_index].validator
re.prog[pc].ch = regex.bsls_validator_array[bsls_index].ch
2020-01-13 15:30:41 +03:00
pc = pc + 1
continue
2020-04-23 02:16:16 +03:00
}
2020-01-13 15:30:41 +03:00
// this is an escape char, skip the bsls and continue as a normal char
else if bsls_index == regex.no_match_found {
2020-01-13 15:30:41 +03:00
i += char_len
char_tmp, char_len = re.get_char(in_txt, i)
2020-01-13 15:30:41 +03:00
// continue as simple char
}
// if not an escape or a bsls char then it is an error (at least for now!)
else {
return bsls_index, i + tmp
2020-01-13 15:30:41 +03:00
}
}
}
// ist_simple_char
re.prog[pc].ist = regex.ist_simple_char
re.prog[pc].ch = char_tmp
2022-04-15 14:58:56 +03:00
re.prog[pc].ch_len = u8(char_len)
2020-01-13 15:30:41 +03:00
re.prog[pc].rep_min = 1
re.prog[pc].rep_max = 1
// println("char: ${char_tmp:c}")
pc = pc + 1
2020-01-13 15:30:41 +03:00
i += char_len
2020-01-13 15:30:41 +03:00
}
// add end of the program
re.prog[pc].ist = regex.ist_prog_end
2020-12-22 19:42:32 +03:00
re.prog_len = pc
2020-01-13 15:30:41 +03:00
// check for unbalanced groups
if group_stack_index != -1 {
return regex.err_group_not_balanced, group_stack_txt_index[group_stack_index] + 1
2020-01-13 15:30:41 +03:00
}
// check for OR at the end of the program
if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
return regex.err_syntax_error, in_txt.len - 1
2020-01-13 15:30:41 +03:00
}
2020-04-23 02:16:16 +03:00
2020-01-13 15:30:41 +03:00
// store the number of groups in the query
re.group_count = group_count + 1
2020-01-13 15:30:41 +03:00
//******************************************
// Post processing
//******************************************
//
// manage ist_dot_char
//
2020-12-18 07:57:31 +03:00
// find the checks for dot chars, if any...
2020-01-13 15:30:41 +03:00
mut pc1 := 0
2020-12-18 07:57:31 +03:00
mut dot_char_count := 0
mut last_dot_char_pc := -1
2020-01-13 15:30:41 +03:00
for pc1 < pc {
if re.prog[pc1].ist == regex.ist_dot_char {
// println("Dot_char pc: $pc1")
2020-12-18 07:57:31 +03:00
last_dot_char_pc = pc1
dot_char_count++
mut pc2 := pc1 + 1
for pc2 < pc {
// consecutive dot chars is an error
if re.prog[pc2].ist == regex.ist_dot_char {
return regex.err_syntax_error, 0
2020-12-18 07:57:31 +03:00
}
if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
regex.ist_group_start] {
// println("Next dot char check is PC: ${pc2}")
2020-12-18 07:57:31 +03:00
re.prog[pc1].dot_check_pc = pc2
break
}
pc2++
}
2020-01-13 15:30:41 +03:00
}
pc1++
}
2020-01-18 09:38:00 +03:00
// println("last_dot_char_pc: ${last_dot_char_pc}")
2020-12-18 07:57:31 +03:00
if last_dot_char_pc >= 0 {
pc1 = last_dot_char_pc + 1
mut is_last_dot := true
for pc1 < pc {
if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
2020-12-18 07:57:31 +03:00
is_last_dot = false
break
}
2020-12-18 07:57:31 +03:00
pc1++
}
if is_last_dot {
re.prog[last_dot_char_pc].last_dot_flag = true
}
}
//
// manage bsls_char
//
// find the checks for bsls, if any...
pc1 = 0
mut bsls_char_count := 0
mut last_bsls_char_pc := -1
for pc1 < pc {
if re.prog[pc1].ist == regex.ist_bsls_char {
// println("bsls_char pc: $pc1")
last_bsls_char_pc = pc1
bsls_char_count++
mut pc2 := pc1 + 1
for pc2 < pc {
if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
regex.ist_group_start] {
// println("Next bsls check is PC: ${pc2}")
re.prog[pc1].bsls_check_pc = pc2
break
}
pc2++
}
}
pc1++
}
// println('last_bsls_char_pc: ${last_bsls_char_pc}')
if last_bsls_char_pc >= 0 {
pc1 = last_bsls_char_pc + 1
mut is_last_bsls := true
for pc1 < pc {
if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
is_last_bsls = false
break
}
pc1++
}
if is_last_bsls {
re.prog[last_bsls_char_pc].last_dot_flag = true
}
}
//
// manage CC
//
pc1 = 0
mut cc_char_count := 0
mut last_cc_char_pc := -1
for pc1 < pc {
if re.prog[pc1].ist in [rune(regex.ist_char_class_pos), regex.ist_char_class_neg] {
last_cc_char_pc = pc1
cc_char_count++
mut pc2 := pc1 + 1
for pc2 < pc {
if re.prog[pc2].ist !in [rune(regex.ist_prog_end), regex.ist_group_end,
regex.ist_group_start] {
// println("Next CC check is PC: ${pc2}")
re.prog[pc1].cc_check_pc = pc2
break
}
pc2++
}
}
pc1++
}
// println('last_cc_char_pc: ${last_cc_char_pc}')
if last_cc_char_pc >= 0 {
pc1 = last_cc_char_pc + 1
mut is_last_cc := true
for pc1 < pc {
if re.prog[pc1].ist !in [rune(regex.ist_prog_end), regex.ist_group_end] {
is_last_cc = false
break
}
pc1++
}
if is_last_cc {
re.prog[last_cc_char_pc].last_dot_flag = true
}
}
//******************************************
2020-01-13 15:30:41 +03:00
// OR branch
// a|b|cd
// d exit point
// a,b,c branches
// set the jump in the right places
pc1 = 0
for pc1 < pc - 2 {
// println("Here $pc1 ${pc-2}")
// println("source index: ${pc1 + 1} => ${re.prog[pc1+1].source_index}")
if re.prog[pc1 + 1].ist == regex.ist_or_branch {
// two consecutive OR are a syntax error
if re.prog[pc1 + 2].ist == regex.ist_or_branch {
return regex.err_syntax_error, i
}
// check for []|[] errors
if re.prog[pc1].ist == regex.ist_char_class_pos
&& re.prog[pc1 + 2].ist == regex.ist_char_class_pos {
return regex.err_invalid_or_with_cc, re.prog[pc1 + 1].source_index
}
2020-01-13 15:30:41 +03:00
}
// manage a|b chains like a|(b)|c|d...
// standard solution
if re.prog[pc1].ist != regex.ist_or_branch && re.prog[pc1 + 1].ist == regex.ist_or_branch
&& re.prog[pc1 + 2].ist != regex.ist_or_branch {
re.prog[pc1].next_is_or = true // set that the next token is an OR
re.prog[pc1 + 1].rep_min = pc1 + 2 // failed match jump
2020-04-23 02:16:16 +03:00
2020-01-13 15:30:41 +03:00
// match jump, if an OR chain the next token will be an OR token
mut pc2 := pc1 + 2
for pc2 < pc - 1 {
2020-01-13 15:30:41 +03:00
ist := re.prog[pc2].ist
if ist == regex.ist_group_start {
re.prog[pc1 + 1].rep_max = re.prog[pc2].goto_pc + 1
2020-01-13 15:30:41 +03:00
break
}
if ist != regex.ist_or_branch {
re.prog[pc1 + 1].rep_max = pc2 + 1
2020-01-13 15:30:41 +03:00
break
}
2020-01-13 15:30:41 +03:00
pc2++
}
// special case query of few chars, the true can't go on the first instruction
if re.prog[pc1 + 1].rep_max == pc1 {
re.prog[pc1 + 1].rep_max = 3
2020-12-05 03:51:48 +03:00
}
// println("Compile OR postproc. [$pc1,OR ${pc1+1},$pc2]")
2020-04-23 02:16:16 +03:00
pc1 = pc2
2020-01-13 15:30:41 +03:00
continue
}
2020-04-23 02:16:16 +03:00
2020-01-13 15:30:41 +03:00
pc1++
}
//******************************************
// DEBUG PRINT REGEX GENERATED CODE
//******************************************
if re.debug > 0 {
2020-04-25 23:42:48 +03:00
gc := re.get_code()
re.log_func(gc)
2020-01-13 15:30:41 +03:00
}
//******************************************
return regex.compile_ok, 0
2020-01-13 15:30:41 +03:00
}
// get_code returns a human readable listing of the compiled regex program,
// one line per instruction; note: it may be different from the source query!
// The listing starts at PC 0 and stops right after the first PROG_END
// instruction or, when none is found, at the end of the program buffer.
pub fn (re RE) get_code() string {
	mut pc1 := 0
	mut res := strings.new_builder(re.cc.len * 2 * re.prog.len)
	res.write_string('========================================\nv RegEx compiler v ${regex.v_regex_version} output:\n')

	mut stop_flag := false

	// the bound must be strictly lower than len: with `<=` the loop indexed
	// re.prog[re.prog.len] when the buffer contained no PROG_END instruction
	for pc1 < re.prog.len {
		tk := re.prog[pc1]
		res.write_string('PC:${pc1:3d}')

		// instruction code as 8 hex digits, zero padded
		res.write_string(' ist: ')
		res.write_string('${tk.ist:8x}'.replace(' ', '0'))
		res.write_string(' ')
		ist := tk.ist
		if ist == regex.ist_bsls_char {
			res.write_string('[\\${tk.ch:1c}]     BSLS')
			if tk.last_dot_flag {
				res.write_string(' last!')
			}
		} else if ist == regex.ist_prog_end {
			res.write_string('PROG_END')
			// finish this line, then leave the loop
			stop_flag = true
		} else if ist == regex.ist_or_branch {
			res.write_string('OR      ')
		} else if ist == regex.ist_char_class_pos {
			res.write_string('[${re.get_char_class(pc1)}]     CHAR_CLASS_POS')
			if tk.last_dot_flag {
				res.write_string(' last!')
			}
		} else if ist == regex.ist_char_class_neg {
			res.write_string('[^${re.get_char_class(pc1)}]    CHAR_CLASS_NEG')
			if tk.last_dot_flag {
				res.write_string(' last!')
			}
		} else if ist == regex.ist_dot_char {
			res.write_string('.        DOT_CHAR nx chk: ${tk.dot_check_pc}')
			if tk.last_dot_flag {
				res.write_string(' last!')
			}
		} else if ist == regex.ist_group_start {
			res.write_string('(        GROUP_START #:${tk.group_id}')
			if tk.group_id == -1 {
				res.write_string(' ?:')
			} else {
				// group_map stores group_id + 1 under each group name
				for x in re.group_map.keys() {
					if re.group_map[x] == (tk.group_id + 1) {
						res.write_string(' ?P<${x}>')
						break
					}
				}
			}
		} else if ist == regex.ist_group_end {
			res.write_string(')        GROUP_END   #:${tk.group_id}')
		} else if ist == regex.ist_simple_char {
			res.write_string('[${tk.ch:1c}]      query_ch')
		}

		// quantifier, or the two jump targets for an OR instruction
		if tk.rep_max == regex.max_quantifier {
			res.write_string(' {${tk.rep_min:3d},MAX}')
		} else {
			if ist == regex.ist_or_branch {
				res.write_string(' if false go: ${tk.rep_min:3d} if true go: ${tk.rep_max:3d}')
			} else {
				res.write_string(' {${tk.rep_min:3d},${tk.rep_max:3d}}')
			}
			if tk.greedy {
				res.write_string('?')
			}
		}

		res.write_string('\n')
		if stop_flag {
			break
		}
		pc1++
	}

	res.write_string('========================================\n')
	return res.str()
}
// get_query returns a string with a reconstruction of the query, starting
// from the regex program code. The walk stops at the first PROG_END
// instruction, at the first instruction whose code is 0, or at the end of
// the program buffer, whichever comes first.
pub fn (re RE) get_query() string {
	mut res := strings.new_builder(re.query.len * 2)

	if (re.flag & regex.f_ms) != 0 {
		res.write_string('^')
	}

	mut i := 0
	for i < re.prog.len && re.prog[i].ist != regex.ist_prog_end && re.prog[i].ist != 0 {
		tk := unsafe { &re.prog[i] }
		ist := tk.ist

		// GROUP start
		if ist == regex.ist_group_start {
			if re.debug > 0 {
				res.write_string('#${tk.group_id}')
			}
			res.write_string('(')

			if tk.group_neg == true {
				res.write_string('?!') // negation group
			} else if tk.group_id == -1 {
				res.write_string('?:') // non capturing group
			}

			// named group: group_map stores group_id + 1 under each name
			for x in re.group_map.keys() {
				if re.group_map[x] == (tk.group_id + 1) {
					res.write_string('?P<${x}>')
					break
				}
			}

			i++
			continue
		}

		// GROUP end, no `continue` here: its quantifier is written below
		if ist == regex.ist_group_end {
			res.write_string(')')
		}

		// OR branch
		if ist == regex.ist_or_branch {
			res.write_string('|')
			if re.debug > 0 {
				// for an OR instruction rep_min/rep_max hold the jump targets
				res.write_string('{${tk.rep_min},${tk.rep_max}}')
			}
			i++
			continue
		}

		// char class
		if ist == regex.ist_char_class_neg || ist == regex.ist_char_class_pos {
			res.write_string('[')
			if ist == regex.ist_char_class_neg {
				res.write_string('^')
			}
			res.write_string('${re.get_char_class(i)}')
			res.write_string(']')
		}

		// bsls char
		if ist == regex.ist_bsls_char {
			res.write_string('\\${tk.ch:1c}')
		}

		// ist_dot_char
		if ist == regex.ist_dot_char {
			res.write_string('.')
		}

		// char alone
		if ist == regex.ist_simple_char {
			// The escape test must look at the stored character: the previous
			// code tested the instruction code itself, so the result never
			// depended on which char was stored.
			// Only one byte chars are tested, to not truncate wider chars.
			if tk.ch_len == 1 && u8(tk.ch) in regex.bsls_escape_list {
				res.write_string('\\')
			}
			res.write_string('${tk.ch:c}')
		}

		// quantifier
		if !(tk.rep_min == 1 && tk.rep_max == 1) && tk.group_neg == false {
			if tk.rep_min == 0 && tk.rep_max == 1 {
				res.write_string('?')
			} else if tk.rep_min == 1 && tk.rep_max == regex.max_quantifier {
				res.write_string('+')
			} else if tk.rep_min == 0 && tk.rep_max == regex.max_quantifier {
				res.write_string('*')
			} else {
				if tk.rep_max == regex.max_quantifier {
					res.write_string('{${tk.rep_min},MAX}')
				} else {
					res.write_string('{${tk.rep_min},${tk.rep_max}}')
				}
				if tk.greedy == true {
					res.write_string('?')
				}
			}
		}
		i++
	}

	if (re.flag & regex.f_me) != 0 {
		res.write_string('$')
	}

	return res.str()
}
/******************************************************************************
*
* Groups saving utilities
*
******************************************************************************/
2020-12-22 19:42:32 +03:00
[direct_array_access]
fn (mut re RE) group_continuous_save(g_index int) {
	// nothing to do when the continuous save is not enabled
	if !re.group_csave_flag {
		return
	}

	// slot 0 of group_csave is the records counter, create it on first use
	if re.group_csave.len == 0 {
		re.group_csave << 0
	}

	group_id := g_index >> 1
	span_start := re.groups[g_index]
	span_end := re.groups[g_index + 1]

	// the last record is the triple [group id, start, end] at the list tail:
	// when it is the same group with the same start, only move its end
	n := re.group_csave.len
	if n >= 4 && re.group_csave[n - 3] == group_id && re.group_csave[n - 2] == span_start {
		re.group_csave[n - 1] = span_end
		return
	}

	// otherwise count and append a new record
	re.group_csave[0]++
	re.group_csave << group_id // group id
	re.group_csave << span_start // start
	re.group_csave << span_end // end
}
/******************************************************************************
*
* Matching
*
******************************************************************************/
// Match_state lists the states of the matcher state machine
enum Match_state {
	start = 0
	stop
	end
	new_line
	// load and execute instruction
	ist_load
	// go to next instruction
	ist_next
	// go to next instruction without cleaning the state
	ist_next_ks
	// match positive, quantifier check
	ist_quant_p
	// match negative, quantifier check
	ist_quant_n
	// match positive, group quantifier check
	ist_quant_pg
	// match negative, group quantifier check
	ist_quant_ng
}
// state_str returns the printable name of the matcher state `s`
fn state_str(s Match_state) string {
	return match s {
		.start { 'start' }
		.stop { 'stop' }
		.end { 'end' }
		.new_line { 'new line' }
		.ist_load { 'ist_load' }
		.ist_next { 'ist_next' }
		.ist_next_ks { 'ist_next_ks' }
		.ist_quant_p { 'ist_quant_p' }
		.ist_quant_n { 'ist_quant_n' }
		.ist_quant_pg { 'ist_quant_pg' }
		.ist_quant_ng { 'ist_quant_ng' }
	}
}
// StateObj holds the fields that describe one state of the matcher
struct StateObj {
pub mut:
	// group id used to know how many groups are open
	group_index int = -1
	// indicate if we are in a match condition
	match_flag bool
	// index of the last match
	match_index int = -1
	// index of the first match
	first_match int = -1
	// program counter
	pc int = -1
	// source string index
	i int = -1
	// last char length
	char_len int
	// last dot char pc
	last_dot_pc int = -1
}
2020-12-22 19:42:32 +03:00
[direct_array_access]
2022-04-15 18:25:45 +03:00
pub fn (mut re RE) match_base(in_txt &u8, in_txt_len int) (int, int) {
2020-01-13 15:30:41 +03:00
// result status
mut result := regex.no_match_found // function return
2020-01-13 15:30:41 +03:00
mut ch := rune(0) // examinated char
mut char_len := 0 // utf8 examinated char len
mut m_state := Match_state.start // start point for the matcher FSM
mut src_end := false
2020-12-05 03:51:48 +03:00
mut last_fnd_pc := -1
mut state := StateObj{} // actual state
mut ist := rune(0) // actual instruction
mut l_ist := rune(0) // last matched instruction
2020-01-13 15:30:41 +03:00
mut step_count := 0 // stats for debug
mut dbg_line := 0 // count debug line printed
2020-04-23 02:16:16 +03:00
2020-01-13 15:30:41 +03:00
re.reset()
if re.debug > 0 {
2020-01-13 15:30:41 +03:00
// print header
mut h_buf := strings.new_builder(32)
h_buf.write_string('flags: ')
h_buf.write_string('${re.flag:8x}'.replace(' ', '0'))
h_buf.write_string('\n')
2020-04-25 23:42:48 +03:00
sss := h_buf.str()
re.log_func(sss)
2020-01-13 15:30:41 +03:00
}
for m_state != .end {
2020-12-18 07:57:31 +03:00
if state.pc >= 0 && state.pc < re.prog.len {
ist = re.prog[state.pc].ist
} else if state.pc >= re.prog.len {
// println("ERROR!! PC overflow!!")
return regex.err_internal_error, state.i
2020-01-13 15:30:41 +03:00
}
//******************************************
// DEBUG LOG
//******************************************
if re.debug > 0 {
mut buf2 := strings.new_builder(re.cc.len + 128)
2020-01-13 15:30:41 +03:00
2020-04-23 02:16:16 +03:00
// print all the instructions
2020-01-13 15:30:41 +03:00
// end of the input text
2020-12-18 07:57:31 +03:00
if state.i >= in_txt_len {
buf2.write_string('# ${step_count:3d} END OF INPUT TEXT\n')
2020-04-25 23:42:48 +03:00
sss := buf2.str()
re.log_func(sss)
} else {
2020-01-16 02:39:33 +03:00
// print only the exe instruction
if (re.debug == 1 && m_state == .ist_load) || re.debug == 2 {
if ist == regex.ist_prog_end {
buf2.write_string('# ${step_count:3d} PROG_END\n')
} else if ist == 0 || m_state in [.start, .ist_next, .stop] {
buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: NA\n')
} else {
2020-12-18 07:57:31 +03:00
ch, char_len = re.get_charb(in_txt, state.i)
2020-04-23 02:16:16 +03:00
buf2.write_string('# ${step_count:3d} s: ${state_str(m_state):12s} PC: ${state.pc:3d}=>')
buf2.write_string('${ist:8x}'.replace(' ', '0'))
buf2.write_string(" i,ch,len:[${state.i:3d},'${utf8_str(ch)}',${char_len}] f.m:[${state.first_match:3d},${state.match_index:3d}] ")
2020-01-13 15:30:41 +03:00
if ist == regex.ist_simple_char {
buf2.write_string('query_ch: [${re.prog[state.pc].ch:1c}]')
2020-01-13 15:30:41 +03:00
} else {
if ist == regex.ist_bsls_char {
buf2.write_string('BSLS [\\${re.prog[state.pc].ch:1c}]')
} else if ist == regex.ist_prog_end {
buf2.write_string('PROG_END')
} else if ist == regex.ist_or_branch {
buf2.write_string('OR')
} else if ist == regex.ist_char_class_pos {
buf2.write_string('CHAR_CLASS_POS[${re.get_char_class(state.pc)}]')
} else if ist == regex.ist_char_class_neg {
buf2.write_string('CHAR_CLASS_NEG[${re.get_char_class(state.pc)}]')
} else if ist == regex.ist_dot_char {
buf2.write_string('DOT_CHAR')
} else if ist == regex.ist_group_start {
tmp_gi := re.prog[state.pc].group_id
2020-12-18 07:57:31 +03:00
tmp_gr := re.prog[re.prog[state.pc].goto_pc].group_rep
buf2.write_string('GROUP_START #:${tmp_gi} rep:${tmp_gr} ')
} else if ist == regex.ist_group_end {
buf2.write_string('GROUP_END #:${re.prog[state.pc].group_id} deep:${state.group_index}')
2020-01-13 15:30:41 +03:00
}
}
if re.prog[state.pc].rep_max == regex.max_quantifier {
buf2.write_string('{${re.prog[state.pc].rep_min},MAX}:${re.prog[state.pc].rep}')
2020-01-13 15:30:41 +03:00
} else {
buf2.write_string('{${re.prog[state.pc].rep_min},${re.prog[state.pc].rep_max}}:${re.prog[state.pc].rep}')
2020-01-13 15:30:41 +03:00
}
2020-12-18 07:57:31 +03:00
if re.prog[state.pc].greedy == true {
buf2.write_string('?')
2020-01-16 02:39:33 +03:00
}
buf2.write_string(' (#${state.group_index})')
2020-12-18 07:57:31 +03:00
if ist == regex.ist_dot_char {
buf2.write_string(' last!')
2020-12-18 07:57:31 +03:00
}
buf2.write_string('\n')
2020-01-13 15:30:41 +03:00
}
2020-04-25 23:42:48 +03:00
sss2 := buf2.str()
re.log_func(sss2)
2020-01-13 15:30:41 +03:00
}
}
step_count++
dbg_line++
}
//******************************************
if ist == regex.ist_prog_end {
// println("HERE we end!")
2020-12-05 03:51:48 +03:00
break
}
2020-12-18 07:57:31 +03:00
2020-01-13 15:30:41 +03:00
// we're out of text, manage it
if state.i >= in_txt_len || m_state == .new_line {
// println("Finished text!!")
2020-12-05 03:51:48 +03:00
src_end = true
2020-04-23 02:16:16 +03:00
// we have finished the text, we must manage out of bound indexes
if state.i >= in_txt_len {
state.i = in_txt_len - 1
}
2020-01-13 15:30:41 +03:00
// manage groups
2020-12-18 07:57:31 +03:00
if state.group_index >= 0 && state.match_index >= 0 {
// println("End text with open groups!")
// println("state.group_index: ${state.group_index}")
2020-01-13 15:30:41 +03:00
// close the groups
2020-12-18 07:57:31 +03:00
for state.group_index >= 0 {
2020-12-22 23:34:46 +03:00
tmp_pc := re.group_data[state.group_index]
2020-01-13 15:30:41 +03:00
re.prog[tmp_pc].group_rep++
// println("Closing group $state.group_index {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:${re.prog[tmp_pc].group_rep}")
2020-05-17 14:51:18 +03:00
if re.prog[tmp_pc].group_rep >= re.prog[tmp_pc].rep_min
&& re.prog[tmp_pc].group_id >= 0 {
start_i := re.group_stack[state.group_index]
re.group_stack[state.group_index] = -1
2020-01-13 15:30:41 +03:00
// save group results
g_index := re.prog[tmp_pc].group_id * 2
// println("group_id: ${re.prog[tmp_pc].group_id} g_index: ${g_index}")
2020-01-13 15:30:41 +03:00
if start_i >= 0 {
re.groups[g_index] = start_i
} else {
re.groups[g_index] = 0
}
re.groups[g_index + 1] = state.i
if re.groups[g_index + 1] >= in_txt_len {
// println("clamp group on stop!")
re.groups[g_index + 1] = in_txt_len - 1
}
2020-01-25 21:12:23 +03:00
// continuous save, save until we have space
re.group_continuous_save(g_index)
}
2020-12-18 07:57:31 +03:00
state.group_index--
2020-01-13 15:30:41 +03:00
}
}
// println("re.groups: ${re.groups}")
2020-12-18 07:57:31 +03:00
// the text is finished and the groups closed and we are the last group, ok exit
if ist == regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
// println("Last group end")
2020-12-18 07:57:31 +03:00
return state.first_match, state.i
}
if state.pc == -1 {
state.pc = last_fnd_pc
2020-12-05 03:51:48 +03:00
}
// println("Finished text!!")
// println("Instruction: ${ist:08x} pc: $state.pc")
// println("min_rep: ${re.prog[state.pc].rep_min} max_rep: ${re.prog[state.pc].rep_max} rep: ${re.prog[state.pc].rep}")
2020-12-05 03:51:48 +03:00
// program end
if ist == regex.ist_prog_end {
// println("Program end on end of text!")
2020-12-18 07:57:31 +03:00
return state.first_match, state.i
}
if l_ist in [
rune(regex.ist_char_class_neg),
regex.ist_char_class_pos,
regex.ist_bsls_char,
regex.ist_dot_char,
] {
// println("***** We have a last special token")
// println("PC: ${state.pc} last_dot_flag:${re.prog[state.pc].last_dot_flag}")
// println("rep: ${re.prog[state.pc].group_rep} min: ${re.prog[state.pc].rep_min} max: ${re.prog[state.pc].rep_max}")
// println("first match: ${state.first_match}")
if re.prog[state.pc].last_dot_flag == true
&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min
&& re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
2020-12-18 07:57:31 +03:00
return state.first_match, state.i
}
// println("Not fitted!!")
}
2021-09-06 03:11:38 +03:00
// no groups open, check the last token quantifier
if ist != regex.ist_group_end && re.prog[state.pc + 1].ist == regex.ist_prog_end {
if re.prog[state.pc].rep >= re.prog[state.pc].rep_min
&& re.prog[state.pc].rep <= re.prog[state.pc].rep_max {
// println("We are in good repetition")
return state.first_match, state.i
}
}
// println("No good exit!!")
if re.prog[re.prog_len - 1].ist == regex.ist_group_end {
// println("last ist is a group end!")
if re.prog[re.prog_len - 1].group_rep >= re.prog[re.prog_len - 1].rep_min {
return state.first_match, state.i
}
}
return regex.no_match_found, state.i
2020-01-13 15:30:41 +03:00
}
// starting and init
if m_state == .start {
2020-12-18 07:57:31 +03:00
state.pc = -1
state.i = 0
2020-01-13 15:30:41 +03:00
m_state = .ist_next
continue
}
2020-01-16 02:39:33 +03:00
// ist_next, next instruction reseting its state
2020-12-22 19:42:32 +03:00
else if m_state == .ist_next {
2020-12-18 07:57:31 +03:00
state.pc = state.pc + 1
re.prog[state.pc].reset()
2020-01-13 15:30:41 +03:00
// check if we are in the program bounds
2020-12-18 07:57:31 +03:00
if state.pc < 0 || state.pc > re.prog.len {
// println("ERROR!! PC overflow!!")
return regex.err_internal_error, state.i
2020-04-23 02:16:16 +03:00
}
2020-01-13 15:30:41 +03:00
m_state = .ist_load
continue
}
2020-01-16 02:39:33 +03:00
// ist_next_ks, next instruction keeping its state
2020-12-22 19:42:32 +03:00
else if m_state == .ist_next_ks {
2020-12-18 07:57:31 +03:00
state.pc = state.pc + 1
2020-01-13 15:30:41 +03:00
// check if we are in the program bounds
2020-12-18 07:57:31 +03:00
if state.pc < 0 || state.pc > re.prog.len {
// println("ERROR!! PC overflow!!")
return regex.err_internal_error, state.i
2020-04-23 02:16:16 +03:00
}
2020-01-13 15:30:41 +03:00
m_state = .ist_load
continue
}
// load the char
2020-12-18 07:57:31 +03:00
ch, char_len = re.get_charb(in_txt, state.i)
2020-01-16 02:39:33 +03:00
// check new line if flag f_nl enabled
2022-04-15 14:58:56 +03:00
if (re.flag & regex.f_nl) != 0 && char_len == 1 && u8(ch) in regex.new_line_list {
2020-01-16 02:39:33 +03:00
m_state = .new_line
continue
}
2020-04-23 02:16:16 +03:00
// check if stop
2020-12-22 19:42:32 +03:00
else if m_state == .stop {
2020-01-31 04:29:54 +03:00
// we are in search mode, don't exit until the end
if (re.flag & regex.f_src) != 0 && ist != regex.ist_prog_end {
2020-12-18 07:57:31 +03:00
last_fnd_pc = state.pc
state.pc = -1
state.i += char_len
2020-01-31 04:29:54 +03:00
m_state = .ist_next
re.reset_src()
state.match_index = -1
2020-12-18 07:57:31 +03:00
state.first_match = -1
// reset state list
re.reset()
2020-01-31 04:29:54 +03:00
continue
}
if ist == regex.ist_prog_end {
2020-12-18 07:57:31 +03:00
return state.first_match, state.i
2020-01-13 15:30:41 +03:00
}
2020-12-18 07:57:31 +03:00
// manage here dot char
2020-12-22 23:34:46 +03:00
if re.state_list.len > 0 {
// println("Here we are, with stop: state buffer: [${re.state_list.len}]")
2020-12-22 23:34:46 +03:00
state = re.state_list.pop()
2020-12-18 07:57:31 +03:00
state.match_flag = true
l_ist = u32(regex.ist_dot_char)
2020-12-18 07:57:31 +03:00
if state.first_match < 0 {
state.first_match = state.i
}
state.match_index = state.i
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len
m_state = .ist_quant_p
continue
2020-01-13 15:30:41 +03:00
}
2020-04-23 02:16:16 +03:00
2020-01-13 15:30:41 +03:00
// exit on no match
return result, state.i
2020-01-13 15:30:41 +03:00
}
// ist_load
2020-12-22 19:42:32 +03:00
else if m_state == .ist_load {
2020-01-13 15:30:41 +03:00
// program end
if ist == regex.ist_prog_end {
2020-01-13 15:30:41 +03:00
// if we are in match exit well
2020-04-23 02:16:16 +03:00
2020-12-18 07:57:31 +03:00
if state.group_index >= 0 && state.match_index >= 0 {
state.group_index = -1
2020-01-13 15:30:41 +03:00
}
2020-01-18 09:38:00 +03:00
2020-01-13 15:30:41 +03:00
m_state = .stop
continue
}
// check GROUP start, no quantifier is checked for this token!!
else if ist == regex.ist_group_start {
2020-12-18 07:57:31 +03:00
state.group_index++
re.group_data[state.group_index] = re.prog[state.pc].goto_pc // save where is ist_group_end, we will use it for escape
re.group_stack[state.group_index] = state.i // index where we start to manage
// println("group_index $state.group_index rep ${re.prog[re.prog[state.pc].goto_pc].group_rep}")
2020-04-23 02:16:16 +03:00
2020-01-13 15:30:41 +03:00
m_state = .ist_next
continue
}
// check GROUP end
else if ist == regex.ist_group_end {
2020-01-13 15:30:41 +03:00
// we are in matching streak
// println("Group END!! last ist: ${l_ist:08x}")
2020-01-13 15:30:41 +03:00
if state.match_index >= 0 {
// restore txt index stack and save the group data
2020-04-23 02:16:16 +03:00
// println("g.id: ${re.prog[state.pc].group_id} group_index: ${state.group_index}")
2020-12-18 07:57:31 +03:00
if state.group_index >= 0 && re.prog[state.pc].group_id >= 0 {
start_i := re.group_stack[state.group_index]
2020-01-13 15:30:41 +03:00
// save group results
g_index := re.prog[state.pc].group_id * 2
2020-01-13 15:30:41 +03:00
if start_i >= 0 {
re.groups[g_index] = start_i
} else {
re.groups[g_index] = 0
}
2020-12-18 07:57:31 +03:00
re.groups[g_index + 1] = state.i
2020-12-18 07:57:31 +03:00
if g_index > 0 && re.groups[g_index] <= re.groups[g_index - 1] {
re.groups[g_index] = re.groups[g_index - 1]
2020-12-21 07:36:14 +03:00
}
if re.groups[g_index + 1] >= in_txt_len {
// println("clamp group!")
re.groups[g_index + 1] = in_txt_len - 1
}
// println("GROUP ${re.prog[state.pc].group_id} END [${re.groups[g_index]}, ${re.groups[g_index+1]}] i: $state.i in_txt_len: $in_txt_len")
2020-01-25 21:12:23 +03:00
// continuous save, save until we have space
re.group_continuous_save(g_index)
2020-01-13 15:30:41 +03:00
}
2020-04-23 02:16:16 +03:00
2020-12-18 07:57:31 +03:00
re.prog[state.pc].group_rep++ // increase repetitions
// println("GROUP $group_index END ${re.prog[state.pc].group_rep}")
if re.prog[state.pc].group_rep > in_txt_len - 1 {
m_state = .ist_quant_ng
continue
}
2020-01-13 15:30:41 +03:00
m_state = .ist_quant_pg
continue
}
m_state = .ist_quant_ng
2020-04-23 02:16:16 +03:00
continue
2020-01-13 15:30:41 +03:00
}
// check OR
else if ist == regex.ist_or_branch {
2020-01-13 15:30:41 +03:00
if state.match_index >= 0 {
2020-12-18 07:57:31 +03:00
state.pc = re.prog[state.pc].rep_max
// println("ist_or_branch True pc: $state.pc")
} else {
2020-12-18 07:57:31 +03:00
state.pc = re.prog[state.pc].rep_min
// println("ist_or_branch False pc: $state.pc")
2020-01-13 15:30:41 +03:00
}
2020-12-18 07:57:31 +03:00
re.prog[state.pc].reset()
m_state = .ist_load
2020-01-13 15:30:41 +03:00
continue
}
// check ist_dot_char
else if ist == regex.ist_dot_char {
// println("ist_dot_char rep: ${re.prog[state.pc].rep}")
2020-01-13 15:30:41 +03:00
2020-12-18 07:57:31 +03:00
// check next token to be false
mut next_check_flag := false
2020-12-18 07:57:31 +03:00
// if we are done with max go on dot char are dedicated case!!
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
2020-12-22 23:34:46 +03:00
re.state_list.pop()
2020-12-18 07:57:31 +03:00
m_state = .ist_next
continue
2020-01-13 15:30:41 +03:00
}
if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].dot_check_pc >= 0
&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
2020-12-18 07:57:31 +03:00
// load the char
// ch_t, _ := re.get_charb(in_txt, state.i+char_len)
2020-12-18 07:57:31 +03:00
ch_t := ch
chk_pc := re.prog[state.pc].dot_check_pc
2020-12-18 07:57:31 +03:00
// simple char
if re.prog[chk_pc].ist == regex.ist_simple_char {
2020-12-18 07:57:31 +03:00
if re.prog[chk_pc].ch == ch_t {
next_check_flag = true
}
// println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
2020-12-18 07:57:31 +03:00
}
// char char_class
else if re.prog[chk_pc].ist == regex.ist_char_class_pos
|| re.prog[chk_pc].ist == regex.ist_char_class_neg {
2020-12-18 07:57:31 +03:00
mut cc_neg := false
if re.prog[chk_pc].ist == regex.ist_char_class_neg {
2020-12-18 07:57:31 +03:00
cc_neg = true
}
mut cc_res := re.check_char_class(chk_pc, ch_t)
2020-04-23 02:16:16 +03:00
2020-12-18 07:57:31 +03:00
if cc_neg {
cc_res = !cc_res
}
next_check_flag = cc_res
// println("Check [ist_char_class] => $next_check_flag")
2020-01-18 09:38:00 +03:00
}
2020-12-18 07:57:31 +03:00
// check bsls
else if re.prog[chk_pc].ist == regex.ist_bsls_char {
2022-04-15 14:58:56 +03:00
next_check_flag = re.prog[chk_pc].validator(u8(ch_t))
// println("Check [ist_bsls_char] => $next_check_flag")
2020-12-18 07:57:31 +03:00
}
2020-04-23 02:16:16 +03:00
}
2020-12-18 07:57:31 +03:00
// check if we must continue or pass to the next IST
if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
// println("save the state!!")
mut dot_state := StateObj{
2020-12-18 07:57:31 +03:00
group_index: state.group_index
match_flag: state.match_flag
2020-12-18 07:57:31 +03:00
match_index: state.match_index
first_match: state.first_match
pc: state.pc
i: state.i + char_len
char_len: char_len
2020-12-18 07:57:31 +03:00
last_dot_pc: state.pc
}
2021-01-03 03:33:34 +03:00
// if we are managing a .* stay on the same char on return
if re.prog[state.pc].rep_min == 0 {
dot_state.i -= char_len
}
re.state_list << dot_state
2020-12-18 07:57:31 +03:00
m_state = .ist_quant_n
// println("dot_char stack len: ${re.state_list.len}")
2020-12-05 03:51:48 +03:00
continue
2020-01-13 15:30:41 +03:00
}
2020-12-18 07:57:31 +03:00
state.match_flag = true
l_ist = u32(regex.ist_dot_char)
2020-12-18 07:57:31 +03:00
if state.first_match < 0 {
state.first_match = state.i
}
state.match_index = state.i
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len
m_state = .ist_quant_p
2020-01-18 09:38:00 +03:00
continue
2020-01-13 15:30:41 +03:00
}
// char class IST
else if ist == regex.ist_char_class_pos || ist == regex.ist_char_class_neg {
2020-01-13 15:30:41 +03:00
state.match_flag = false
mut cc_neg := false
2020-04-23 02:16:16 +03:00
if ist == regex.ist_char_class_neg {
2020-01-13 15:30:41 +03:00
cc_neg = true
}
mut cc_res := re.check_char_class(state.pc, ch)
2020-04-23 02:16:16 +03:00
2020-01-13 15:30:41 +03:00
if cc_neg {
cc_res = !cc_res
}
if cc_res {
state.match_flag = true
l_ist = u32(regex.ist_char_class_pos)
2020-04-23 02:16:16 +03:00
2020-12-18 07:57:31 +03:00
if state.first_match < 0 {
state.first_match = state.i
2020-01-13 15:30:41 +03:00
}
2020-04-23 02:16:16 +03:00
2020-12-18 07:57:31 +03:00
state.match_index = state.i
2020-01-13 15:30:41 +03:00
2020-12-18 07:57:31 +03:00
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len // next char
2020-01-13 15:30:41 +03:00
m_state = .ist_quant_p
continue
}
m_state = .ist_quant_n
continue
}
// check bsls
else if ist == regex.ist_bsls_char {
// println("ist_bsls_char rep: ${re.prog[state.pc].rep}")
// check next token to be false
mut next_check_flag := false
// if the max number of repetitions is reached, go on to the next token (dedicated case, handled like the dot char)
if re.prog[state.pc].rep >= re.prog[state.pc].rep_max {
re.state_list.pop()
m_state = .ist_next
continue
}
if re.prog[state.pc].last_dot_flag == false && re.prog[state.pc].bsls_check_pc >= 0
&& re.prog[state.pc].rep >= re.prog[state.pc].rep_min {
// load the char
// ch_t, _ := re.get_charb(in_txt, state.i+char_len)
ch_t := ch
chk_pc := re.prog[state.pc].bsls_check_pc
// simple char
if re.prog[chk_pc].ist == regex.ist_simple_char {
if re.prog[chk_pc].ch == ch_t {
next_check_flag = true
}
// println("Check [ist_simple_char] [${re.prog[chk_pc].ch}]==[${ch_t:c}] => $next_check_flag")
}
// char char_class
else if re.prog[chk_pc].ist == regex.ist_char_class_pos
|| re.prog[chk_pc].ist == regex.ist_char_class_neg {
mut cc_neg := false
if re.prog[chk_pc].ist == regex.ist_char_class_neg {
cc_neg = true
}
mut cc_res := re.check_char_class(chk_pc, ch_t)
if cc_neg {
cc_res = !cc_res
}
next_check_flag = cc_res
// println("Check [ist_char_class] => $next_check_flag")
}
// check bsls
else if re.prog[chk_pc].ist == regex.ist_bsls_char {
2022-04-15 14:58:56 +03:00
next_check_flag = re.prog[chk_pc].validator(u8(ch_t))
// println("Check [ist_bsls_char] => $next_check_flag")
}
}
// check if we must continue or pass to the next IST
if next_check_flag == true && re.prog[state.pc + 1].ist != regex.ist_prog_end {
// println("save the state!!")
mut dot_state := StateObj{
group_index: state.group_index
match_flag: state.match_flag
match_index: state.match_index
first_match: state.first_match
pc: state.pc
i: state.i + char_len
char_len: char_len
last_dot_pc: state.pc
}
// if we are managing a \[something]* stay on the same char on return
if re.prog[state.pc].rep_min == 0 {
dot_state.i -= char_len
}
re.state_list << dot_state
m_state = .ist_quant_n
// println("dot_char stack len: ${re.state_list.len}")
continue
}
2022-04-15 14:58:56 +03:00
tmp_res := re.prog[state.pc].validator(u8(ch))
if tmp_res == false {
m_state = .ist_quant_n
continue
}
// println("${ch} => ${tmp_res}")
state.match_flag = true
l_ist = u32(regex.ist_dot_char)
if state.first_match < 0 {
state.first_match = state.i
}
state.match_index = state.i
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len
m_state = .ist_quant_p
continue
}
2020-01-13 15:30:41 +03:00
// simple char IST
else if ist == regex.ist_simple_char {
// println("ist_simple_char")
2020-01-13 15:30:41 +03:00
state.match_flag = false
if re.prog[state.pc].ch == ch {
2020-01-13 15:30:41 +03:00
state.match_flag = true
l_ist = regex.ist_simple_char
2020-04-23 02:16:16 +03:00
2020-12-18 07:57:31 +03:00
if state.first_match < 0 {
state.first_match = state.i
2020-01-13 15:30:41 +03:00
}
// println("state.match_index: ${state.match_index}")
2020-12-18 07:57:31 +03:00
state.match_index = state.i
2020-01-13 15:30:41 +03:00
2020-12-18 07:57:31 +03:00
re.prog[state.pc].rep++ // increase repetitions
state.i += char_len // next char
2020-01-13 15:30:41 +03:00
m_state = .ist_quant_p
continue
}
m_state = .ist_quant_n
continue
2020-04-23 02:16:16 +03:00
}
// UNREACHABLE
// println("PANIC2!! state: $m_state")
return regex.err_internal_error, state.i
2020-01-13 15:30:41 +03:00
}
/***********************************
2020-04-23 02:16:16 +03:00
* Quantifier management
2020-01-13 15:30:41 +03:00
***********************************/
// ist_quant_ng => quantifier negative test on group
2020-12-22 19:42:32 +03:00
else if m_state == .ist_quant_ng {
2020-01-13 15:30:41 +03:00
// we are finished here
2020-12-18 07:57:31 +03:00
if state.group_index < 0 {
// println("Early stop!")
result = regex.no_match_found
2020-01-13 15:30:41 +03:00
m_state = .stop
continue
}
tmp_pc := re.group_data[state.group_index] // PC to the end of the group token
rep := re.prog[tmp_pc].group_rep // use a temp variable
re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2020-01-13 15:30:41 +03:00
// println(".ist_quant_ng group_pc_end: $tmp_pc rep: $rep")
2020-01-13 15:30:41 +03:00
if rep >= re.prog[tmp_pc].rep_min {
// println("ist_quant_ng GROUP CLOSED OK group_index: $state.group_index")
2020-04-23 02:16:16 +03:00
2020-12-22 23:34:46 +03:00
state.i = re.group_stack[state.group_index]
2020-12-18 07:57:31 +03:00
state.pc = tmp_pc
state.group_index--
2020-01-13 15:30:41 +03:00
m_state = .ist_next
continue
} else if re.prog[tmp_pc].next_is_or {
// println("ist_quant_ng OR Negative branch")
2020-01-13 15:30:41 +03:00
2020-12-22 23:34:46 +03:00
state.i = re.group_stack[state.group_index]
state.pc = re.prog[tmp_pc + 1].rep_min - 1
2020-12-18 07:57:31 +03:00
state.group_index--
2020-01-13 15:30:41 +03:00
m_state = .ist_next
continue
} else if rep > 0 && rep < re.prog[tmp_pc].rep_min {
// println("ist_quant_ng UNDER THE MINIMUM g.i: $state.group_index")
2020-04-23 02:16:16 +03:00
2020-01-13 15:30:41 +03:00
// check if we are inside a group, if yes exit from the nested groups
if state.group_index > 0 {
2020-12-18 07:57:31 +03:00
state.group_index--
state.pc = tmp_pc
2020-01-13 15:30:41 +03:00
m_state = .ist_quant_ng //.ist_next
continue
}
2020-12-18 07:57:31 +03:00
if state.group_index == 0 {
state.group_index--
state.pc = tmp_pc // TEST
2020-01-13 15:30:41 +03:00
m_state = .ist_next
continue
}
result = regex.no_match_found
2020-01-13 15:30:41 +03:00
m_state = .stop
continue
} else if rep == 0 && rep < re.prog[tmp_pc].rep_min {
// println("ist_quant_ng c_zero UNDER THE MINIMUM g.i: $state.group_index")
2020-01-13 15:30:41 +03:00
if state.group_index > 0 {
2020-12-18 07:57:31 +03:00
state.group_index--
state.pc = tmp_pc
2020-01-13 15:30:41 +03:00
m_state = .ist_quant_ng //.ist_next
continue
}
result = regex.no_match_found
2020-01-13 15:30:41 +03:00
m_state = .stop
continue
}
// println("DO NOT STAY HERE!! {${re.prog[tmp_pc].rep_min},${re.prog[tmp_pc].rep_max}}:$rep")
// UNREACHABLE
return regex.err_internal_error, state.i
2020-01-13 15:30:41 +03:00
}
// ist_quant_pg => quantifier positive test on group
else if m_state == .ist_quant_pg {
// println(".ist_quant_pg")
2020-12-18 07:57:31 +03:00
mut tmp_pc := state.pc
if state.group_index >= 0 {
2020-12-22 23:34:46 +03:00
tmp_pc = re.group_data[state.group_index]
2020-01-13 15:30:41 +03:00
}
if re.prog[tmp_pc].group_neg == true {
// println("***** Negation of the group")
result = regex.no_match_found
m_state = .stop
continue
}
2020-01-13 15:30:41 +03:00
rep := re.prog[tmp_pc].group_rep
if rep < re.prog[tmp_pc].rep_min {
// println("ist_quant_pg UNDER RANGE")
2020-12-18 07:57:31 +03:00
state.pc = re.prog[tmp_pc].goto_pc
2020-01-13 15:30:41 +03:00
m_state = .ist_next
continue
} else if rep == re.prog[tmp_pc].rep_max {
// println("ist_quant_pg MAX RANGE")
2020-01-13 15:30:41 +03:00
re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2020-12-18 07:57:31 +03:00
state.group_index--
2020-01-13 15:30:41 +03:00
m_state = .ist_next
2020-01-13 15:30:41 +03:00
continue
} else if rep >= re.prog[tmp_pc].rep_min {
// println("ist_quant_pg IN RANGE group_index:$state.group_index")
2020-01-16 02:39:33 +03:00
// check greedy flag, if true exit on minimum
if re.prog[tmp_pc].greedy == true {
re.prog[tmp_pc].group_rep = 0 // clear the repetitions
2020-12-18 07:57:31 +03:00
state.group_index--
2020-01-16 02:39:33 +03:00
m_state = .ist_next
continue
}
2020-12-18 07:57:31 +03:00
state.pc = re.prog[tmp_pc].goto_pc - 1
state.group_index--
2020-01-13 15:30:41 +03:00
m_state = .ist_next
continue
}
2020-04-23 02:16:16 +03:00
// UNREACHABLE
// println("PANIC3!! state: $m_state")
return regex.err_internal_error, state.i
2020-01-13 15:30:41 +03:00
}
// ist_quant_n => quantifier negative test on token
2020-01-13 15:30:41 +03:00
else if m_state == .ist_quant_n {
2020-12-18 07:57:31 +03:00
rep := re.prog[state.pc].rep
// println("Here!! PC $state.pc is_next_or: ${re.prog[state.pc].next_is_or}")
2020-01-13 15:30:41 +03:00
// zero quantifier * or ?
2020-12-18 07:57:31 +03:00
if rep == 0 && re.prog[state.pc].rep_min == 0 {
// println("ist_quant_n c_zero RANGE MIN")
2020-01-13 15:30:41 +03:00
m_state = .ist_next // go to next ist
continue
}
// match + or *
2020-12-18 07:57:31 +03:00
else if rep >= re.prog[state.pc].rep_min {
// println("ist_quant_n MATCH RANGE")
2020-01-13 15:30:41 +03:00
m_state = .ist_next
continue
}
// check the OR if present
2020-12-18 07:57:31 +03:00
if re.prog[state.pc].next_is_or {
// println("OR present on failing")
2020-01-13 15:30:41 +03:00
state.match_index = -1
m_state = .ist_next
continue
}
// we are inside a group: let the group quantifier negative test handle the no match
2020-12-18 07:57:31 +03:00
if state.group_index >= 0 {
// println("ist_quant_n FAILED insied a GROUP group_index:$state.group_index")
2020-01-13 15:30:41 +03:00
m_state = .ist_quant_ng
continue
}
// no other options
// println("ist_quant_n no_match_found")
result = regex.no_match_found
2020-01-13 15:30:41 +03:00
m_state = .stop
2023-03-20 11:37:09 +03:00
// stop already started matching outside a capturing group
if re.state_list.len > 0 && re.state_list.last().group_index == -1
&& re.state_list.last().last_dot_pc > 0 {
if ist == regex.ist_dot_char || ist == regex.ist_bsls_char {
return regex.no_match_found, 0
}
2023-03-20 11:37:09 +03:00
}
2020-01-13 15:30:41 +03:00
continue
}
// ist_quant_p => quantifier positive test on token
2020-01-13 15:30:41 +03:00
else if m_state == .ist_quant_p {
2021-09-06 03:11:38 +03:00
// println("Here .ist_quant_p")
2020-01-13 15:30:41 +03:00
// exit on first match
if (re.flag & regex.f_efm) != 0 {
return state.i, state.i + 1
2020-01-13 15:30:41 +03:00
}
2020-12-18 07:57:31 +03:00
rep := re.prog[state.pc].rep
2021-09-06 03:11:38 +03:00
// println(rep)
2020-04-23 02:16:16 +03:00
2020-01-13 15:30:41 +03:00
// under range
2020-12-18 07:57:31 +03:00
if rep > 0 && rep < re.prog[state.pc].rep_min {
// println("ist_quant_p UNDER RANGE")
2020-01-13 15:30:41 +03:00
m_state = .ist_load // continue the loop
continue
}
// range ok, continue loop
2020-12-18 07:57:31 +03:00
else if rep >= re.prog[state.pc].rep_min && rep < re.prog[state.pc].rep_max {
// println("ist_quant_p IN RANGE")
2020-04-23 02:16:16 +03:00
2020-01-16 02:39:33 +03:00
// check greedy flag, if true exit on minimum
2020-12-18 07:57:31 +03:00
if re.prog[state.pc].greedy == true {
2020-01-16 02:39:33 +03:00
m_state = .ist_next
continue
}
2020-01-13 15:30:41 +03:00
m_state = .ist_load
continue
}
// max reached
2020-12-18 07:57:31 +03:00
else if rep == re.prog[state.pc].rep_max {
// println("ist_quant_p MAX RANGE")
2020-01-13 15:30:41 +03:00
m_state = .ist_next
continue
}
}
// UNREACHABLE
// println("PANIC4!! state: $m_state")
return regex.err_internal_error, state.i
2020-01-13 15:30:41 +03:00
}
// println("Check end of text!")
2020-01-13 15:30:41 +03:00
// Check the results
if state.match_index >= 0 {
2020-12-18 07:57:31 +03:00
if state.group_index < 0 {
if re.prog[state.pc].ist == regex.ist_prog_end {
// println("program ended!!")
if (re.flag & regex.f_src) != 0 {
// println("find return")
2020-12-18 07:57:31 +03:00
return state.first_match, state.i
2020-12-05 03:51:48 +03:00
} else {
// println("Here!!")
2020-12-18 07:57:31 +03:00
return 0, state.i
2020-12-05 03:51:48 +03:00
}
}
// println("No Group here, natural end [$state.first_match,$state.i] state: ${state_str(m_state)} ist: $ist pgr_end: $re.prog.len")
if re.prog[state.pc + 1].ist == regex.ist_prog_end
|| re.prog[state.pc].ist == regex.ist_prog_end {
2020-12-18 07:57:31 +03:00
rep := re.prog[state.pc].rep
// println("rep: $rep re.prog[state.pc].rep_min: ${re.prog[state.pc].rep_min} re.prog[state.pc].rep_max: ${re.prog[state.pc].rep_max}")
2020-12-18 07:57:31 +03:00
if rep >= re.prog[state.pc].rep_min && rep <= re.prog[state.pc].rep_max {
return state.first_match, state.i
2020-12-05 03:51:48 +03:00
}
// println("Program not finished! ")
return regex.no_match_found, state.i
2020-12-05 03:51:48 +03:00
}
if src_end {
// println("program end")
2020-12-18 07:57:31 +03:00
return state.first_match, state.i
2020-12-05 03:51:48 +03:00
}
// print("No match found!!")
return regex.no_match_found, state.i
2020-01-13 15:30:41 +03:00
} else {
// println("Group match! OK")
// println("first_match: $state.first_match, i: $state.i")
2020-12-05 03:51:48 +03:00
// println("Skip last group")
return state.first_match, state.i
// return state.first_match,re.group_stack[state.group_index--]
2020-01-13 15:30:41 +03:00
}
}
// println("no_match_found, natural end")
return regex.no_match_found, state.i
2020-01-13 15:30:41 +03:00
}