2020-12-20 06:52:02 +03:00
|
|
|
/*
|
|
|
|
|
|
|
|
regex 1.0 alpha
|
|
|
|
|
2021-01-18 15:20:06 +03:00
|
|
|
Copyright (c) 2019-2021 Dario Deledda. All rights reserved.
|
2020-12-20 06:52:02 +03:00
|
|
|
Use of this source code is governed by an MIT license
|
|
|
|
that can be found in the LICENSE file.
|
|
|
|
|
|
|
|
*/
|
|
|
|
module regex
|
2021-03-20 02:54:12 +03:00
|
|
|
import strings
|
2020-12-20 06:52:02 +03:00
|
|
|
|
|
|
|
/******************************************************************************
|
|
|
|
*
|
|
|
|
* Inits
|
|
|
|
*
|
|
|
|
******************************************************************************/
|
2021-02-20 22:39:08 +03:00
|
|
|
// regex_base builds an RE object for `pattern` and compiles the pattern into it.
// It returns the RE object followed by the two ints produced by impl_compile:
// the error code (re_err) and the position of the error (err_pos).
pub fn regex_base(pattern string) (RE, int, int) {
	pattern_len := pattern.len
	// there can not be more groups than half of the pattern length
	max_groups := pattern_len >> 1

	mut compiled := regex.RE{}
	// the program can not be longer than the pattern (plus one extra slot)
	compiled.prog = []Token{len: pattern_len + 1}
	// there can not be more char classes than characters in the pattern
	compiled.cc = []CharClass{len: pattern_len}
	// continuous group saving starts switched off
	compiled.group_csave_flag = false
	// at most 128 nested groups
	compiled.group_max_nested = 128
	compiled.group_max = max_groups
	compiled.group_stack = []int{len: max_groups, init: -1}
	compiled.group_data = []int{len: max_groups, init: -1}

	err_code, err_pos := compiled.impl_compile(pattern)
	return compiled, err_code, err_pos
}
|
|
|
|
|
|
|
|
/******************************************************************************
|
|
|
|
*
|
|
|
|
* Utilities
|
|
|
|
*
|
|
|
|
******************************************************************************/
|
|
|
|
// get_group_bounds_by_name returns the start and end values stored for the
// group registered under `group_name`, or -1, -1 when the name is not in
// re.group_map.
pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int) {
	if group_name !in re.group_map {
		return -1, -1
	}
	// the mapped value minus one is the group index; re.groups stores the
	// bounds as consecutive (start, end) pairs
	pair_index := (re.group_map[group_name] - 1) * 2
	group_start := re.groups[pair_index]
	group_end := re.groups[pair_index + 1]
	return group_start, group_end
}
|
|
|
|
|
|
|
|
// get_group_by_name returns the text of the group registered under
// `group_name`, sliced out of `in_txt`.
// It returns an empty string when the name is not in re.group_map or when the
// stored bounds do not describe a non-empty range.
pub fn (re RE) get_group_by_name(in_txt string, group_name string) string {
	if group_name in re.group_map {
		// the mapped value minus one is the group index; re.groups stores the
		// bounds as consecutive (start, end) pairs
		tmp_index := re.group_map[group_name] - 1
		start := re.groups[tmp_index * 2]
		end := re.groups[tmp_index * 2 + 1]
		// same validity test get_group_list applies: negative or inverted
		// bounds are not a capture, and slicing with them is an invalid range
		if start >= 0 && end > start {
			return in_txt[start..end]
		}
	}
	return ""
}
|
|
|
|
|
|
|
|
// get_group_by_id returns the text of the group with index `group_id`, sliced
// out of `in_txt`.
// It returns an empty string when `group_id` is negative, when it is not below
// the number of (start, end) pairs in re.groups, or when the stored bounds do
// not describe a non-empty range.
pub fn (re RE) get_group_by_id(in_txt string, group_id int) string {
	// a negative id would pass the upper-bound test and index re.groups out of range
	if group_id >= 0 && group_id < (re.groups.len >> 1) {
		index := group_id << 1
		start := re.groups[index]
		end := re.groups[index + 1]
		// same validity test get_group_list applies: negative or inverted
		// bounds are not a capture, and slicing with them is an invalid range
		if start >= 0 && end > start {
			return in_txt[start..end]
		}
	}
	return ""
}
|
|
|
|
|
|
|
|
// get_group_bounds_by_id returns the start and end values stored for the group
// with index `group_id`.
// It returns -1, -1 when `group_id` is negative or not below re.group_count.
pub fn (re RE) get_group_bounds_by_id(group_id int) (int, int) {
	// a negative id would pass the upper-bound test and index re.groups out of range
	if group_id >= 0 && group_id < re.group_count {
		index := group_id << 1
		return re.groups[index], re.groups[index + 1]
	}
	return -1, -1
}
|
|
|
|
|
|
|
|
// Re_group holds the bounds of one group as returned by get_group_list.
// Both fields default to -1.
pub struct Re_group {
pub:
	start int = -1
	end   int = -1
}
|
|
|
|
|
|
|
|
// get_group_list returns one Re_group per (start, end) pair held in re.groups.
// A pair whose start is negative leaves its slot untouched; a pair with a
// non-negative start but an end that is not greater than the start is stored
// as a default Re_group.
pub fn (re RE) get_group_list() []Re_group {
	mut list := []Re_group{len: re.groups.len >> 1}
	for pair_pos := 0; pair_pos < re.groups.len; pair_pos += 2 {
		group_start := re.groups[pair_pos]
		if group_start < 0 {
			continue
		}
		group_end := re.groups[pair_pos + 1]
		slot := pair_pos >> 1
		if group_end > group_start {
			list[slot] = Re_group{
				start: group_start
				end: group_end
			}
		} else {
			list[slot] = Re_group{}
		}
	}
	return list
}
|
|
|
|
|
2021-02-20 22:39:08 +03:00
|
|
|
/******************************************************************************
|
|
|
|
*
|
|
|
|
* Matchers
|
|
|
|
*
|
|
|
|
******************************************************************************/
|
|
|
|
// match_string runs the compiled pattern against `in_txt` through match_base
// and returns the start and end indexes it reports, with the end clamped to
// in_txt.len.
// When the f_ms flag is set, a match that does not begin at index 0 is
// rejected. When the f_me flag is set, a match that stops before the end of
// `in_txt` is kept only if the byte right after it is in new_line_list.
// Rejected matches are returned as no_match_found, 0.
[direct_array_access]
pub fn (mut re RE) match_string(in_txt string) (int, int) {
	// match_base is handed in_txt.len + 1, so the end it reports may be one
	// past the text; clamp it
	match_start, reported_end := re.match_base(in_txt.str, in_txt.len + 1)
	mut match_end := reported_end
	if match_end > in_txt.len {
		match_end = in_txt.len
	}

	// no usable range: hand back whatever match_base produced
	if match_start < 0 || match_end <= match_start {
		return match_start, match_end
	}

	if (re.flag & f_ms) != 0 && match_start > 0 {
		return no_match_found, 0
	}

	if (re.flag & f_me) != 0 && match_end < in_txt.len {
		if in_txt[match_end] !in new_line_list {
			return no_match_found, 0
		}
	}

	return match_start, match_end
}
|
|
|
|
|
|
|
|
|
2020-12-24 08:27:46 +03:00
|
|
|
/******************************************************************************
|
|
|
|
*
|
|
|
|
* Finders
|
|
|
|
*
|
|
|
|
******************************************************************************/
|
2021-01-03 18:59:00 +03:00
|
|
|
/*
|
2021-03-20 02:54:12 +03:00
|
|
|
// find internal implementation HERE for reference do not remove!!
|
2020-12-24 08:27:46 +03:00
|
|
|
[direct_array_access]
|
2021-01-03 18:59:00 +03:00
|
|
|
fn (mut re RE) find_imp(in_txt string) (int,int) {
|
2020-12-24 08:27:46 +03:00
|
|
|
old_flag := re.flag
|
|
|
|
re.flag |= f_src // enable search mode
|
|
|
|
|
|
|
|
start, mut end := re.match_base(in_txt.str, in_txt.len + 1)
|
|
|
|
//print("Find [$start,$end] '${in_txt[start..end]}'")
|
|
|
|
if end > in_txt.len {
|
|
|
|
end = in_txt.len
|
|
|
|
}
|
|
|
|
re.flag = old_flag
|
|
|
|
|
|
|
|
if start >= 0 && end > start {
|
|
|
|
return start, end
|
|
|
|
}
|
|
|
|
return no_match_found, 0
|
|
|
|
}
|
2021-01-03 18:59:00 +03:00
|
|
|
*/
|
|
|
|
|
|
|
|
// find looks for the first match in `in_txt` by calling match_string on the
// tail of the text at each offset in turn.
// It returns the start and end indexes of the match relative to `in_txt`, or
// -1, -1 when no offset produces a match.
[direct_array_access]
pub fn (mut re RE) find(in_txt string) (int, int) {
	for offset in 0 .. in_txt.len {
		// view of in_txt from `offset` on, built on the same buffer (no copy)
		tail := unsafe { tos(in_txt.str + offset, in_txt.len - offset) }
		rel_start, rel_end := re.match_string(tail)
		if rel_start >= 0 && rel_end > rel_start {
			// match_string reports indexes relative to the tail
			return offset + rel_start, offset + rel_end
		}
	}
	return -1, -1
}
|
|
|
|
|
|
|
|
// find_from looks for the first match in `in_txt`, starting the scan at index
// `start`, with the f_src (search mode) flag enabled for the duration of the
// call.
// It returns the start and end indexes of the match relative to `in_txt`, or
// -1, -1 when `start` is negative or no match is found.
// re.flag is restored to its previous value on every return path.
[direct_array_access]
pub fn (mut re RE) find_from(in_txt string, start int) (int, int) {
	// reject an invalid start before touching re.flag, so the early return
	// can not leave search mode switched on
	if start < 0 {
		return -1, -1
	}

	old_flag := re.flag
	re.flag |= f_src // enable search mode

	mut i := start
	for i < in_txt.len {
		mut s := -1
		mut e := -1
		unsafe {
			// view of in_txt from `i` on, built on the same buffer (no copy)
			tmp_str := tos(in_txt.str + i, in_txt.len - i)
			s, e = re.match_string(tmp_str)
		}
		if s >= 0 && e > s {
			re.flag = old_flag
			// match_string reports indexes relative to the tail
			return i + s, i + e
		}
		i++
	}
	re.flag = old_flag
	return -1, -1
}
|
2020-12-24 08:27:46 +03:00
|
|
|
|
|
|
|
// find_all collects all the non overlapping matches of the pattern in `in_txt`.
// The result is a flat list of indexes relative to `in_txt`: the start and end
// of the first match, then the start and end of the second one, and so on.
[direct_array_access]
pub fn (mut re RE) find_all(in_txt string) []int {
	mut bounds := []int{}
	mut cursor := 0
	mut prev_start := -1

	for cursor < in_txt.len {
		// view of in_txt from `cursor` on, built on the same buffer (no copy)
		rest := unsafe { tos(in_txt.str + cursor, in_txt.len - cursor) }
		rel_start, rel_end := re.match_string(rest)
		abs_start := cursor + rel_start

		// no usable match at this position: advance by one and retry
		if rel_start < 0 || rel_end <= rel_start || abs_start <= prev_start {
			cursor++
			continue
		}

		bounds << abs_start
		bounds << cursor + rel_end
		prev_start = abs_start
		// resume right after the match so the results never overlap
		cursor += rel_end
	}
	return bounds
}
|
|
|
|
|
|
|
|
// find_all_str collects all the non overlapping matches of the pattern in
// `in_txt` and returns the matched substrings in order of appearance.
[direct_array_access]
pub fn (mut re RE) find_all_str(in_txt string) []string {
	mut matches := []string{}
	mut cursor := 0
	mut prev_start := -1

	for cursor < in_txt.len {
		// view of in_txt from `cursor` on, built on the same buffer (no copy)
		rest := unsafe { tos(in_txt.str + cursor, in_txt.len - cursor) }
		rel_start, rel_end := re.find(rest)
		abs_start := cursor + rel_start

		// no usable match from this position: advance by one and retry
		if rel_start < 0 || rel_end <= rel_start || abs_start <= prev_start {
			cursor++
			continue
		}

		abs_end := cursor + rel_end
		matches << in_txt[abs_start..abs_end]
		prev_start = abs_start
		// resume right after the match so the results never overlap
		cursor = abs_end
	}
	return matches
}
|
|
|
|
/******************************************************************************
|
|
|
|
*
|
|
|
|
* Replacers
|
|
|
|
*
|
|
|
|
******************************************************************************/
|
|
|
|
// replace returns a copy of `in_txt` in which every match reported by find_all
// is replaced with `repl`.
// When there are no matches `in_txt` is returned unchanged.
pub fn (mut re RE) replace(in_txt string, repl string) string {
	// flat list of (start, end) index pairs, one pair per match
	pos := re.find_all(in_txt)
	if pos.len == 0 {
		return in_txt
	}

	// a builder avoids re-copying the whole partial result on every match,
	// which repeated string concatenation would do
	mut res := strings.new_builder(in_txt.len)
	mut last_end := 0
	for i := 0; i < pos.len; i += 2 {
		// text between the previous match and this one, then the replacement
		res.write_string(in_txt[last_end..pos[i]])
		res.write_string(repl)
		last_end = pos[i + 1]
	}
	// text after the last match
	res.write_string(in_txt[last_end..])
	return res.str()
}
|
|
|
|
|
|
|
|
// type of function used for custom replace
|
|
|
|
// in_txt source text
|
|
|
|
// start index of the start of the match in in_txt
|
|
|
|
// end index of the end of the match in in_txt
|
|
|
|
// the match is in in_txt[start..end]
|
2021-01-18 15:20:06 +03:00
|
|
|
// FnReplace is the callback type taken by replace_by_fn: it is called with the
// RE object, the source text and the match bounds, and the string it returns
// is written to the output in place of in_txt[start..end].
pub type FnReplace = fn (re RE, in_txt string, start int, end int) string
|
2020-12-24 08:27:46 +03:00
|
|
|
|
|
|
|
// replace_by_fn returns a copy of `in_txt` in which every match is replaced
// with the string returned by the `repl_fn` callback for that match.
// Matches are searched with find_from, each search resuming at the end of the
// previous match; the text between matches is copied through unchanged.
pub fn (mut re RE) replace_by_fn(in_txt string, repl_fn FnReplace) string {
	mut search_from := 0
	mut out := strings.new_builder(in_txt.len)
	mut copied_upto := 0

	for search_from < in_txt.len {
		match_start, match_end := re.find_from(in_txt, search_from)
		// no further match: stop scanning
		if match_start < 0 || match_end <= match_start {
			break
		}

		// copy the untouched text that precedes the match
		if copied_upto < match_start {
			out.write_string(in_txt[copied_upto..match_start])
		}

		// shift the stored group bounds by the search offset before handing
		// the RE object to the callback
		for group_idx in 0 .. re.group_count {
			re.groups[group_idx << 1] += search_from
			re.groups[(group_idx << 1) + 1] += search_from
		}

		replacement := repl_fn(re, in_txt, match_start, match_end)
		out.write_string(replacement)

		copied_upto = match_end
		search_from = match_end
	}

	// copy whatever follows the last match
	if copied_upto >= 0 && copied_upto < in_txt.len {
		out.write_string(in_txt[copied_upto..])
	}
	return out.str()
}
|