1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00

regex: lots of fixes (#7380)

This commit is contained in:
penguindark 2020-12-18 05:57:31 +01:00 committed by GitHub
parent 05e15bdd59
commit a6baffcb8c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 635 additions and 485 deletions

View File

@ -54,13 +54,13 @@ fn convert_html_rgb_n(in_col string) u32 {
println("start: $start, end: $end") println("start: $start, end: $end")
mut res := u32(0) mut res := u32(0)
if start >= 0 { if start >= 0 {
red_s, red_e := re.get_group("red") red_s, red_e := re.get_group_bounds_by_name("red")
r := ("0x" + in_col[red_s..red_e]).int() << col_mul r := ("0x" + in_col[red_s..red_e]).int() << col_mul
green_s, green_e := re.get_group("green") green_s, green_e := re.get_group_bounds_by_name("green")
g := ("0x" + in_col[green_s..green_e]).int() << col_mul g := ("0x" + in_col[green_s..green_e]).int() << col_mul
blue_s, blue_e := re.get_group("blue") blue_s, blue_e := re.get_group_bounds_by_name("blue")
b := ("0x" + in_col[blue_s..blue_e]).int() << col_mul b := ("0x" + in_col[blue_s..blue_e]).int() << col_mul
println("r: $r g: $g b: $b") println("r: $r g: $g b: $b")

View File

@ -1,4 +1,4 @@
# V RegEx (Regular expression) 0.9h # V RegEx (Regular expression) 1.0 alpha
[TOC] [TOC]
@ -226,7 +226,18 @@ fn convert_html_rgb(in_col string) u32 {
} }
``` ```
Other utility functions are `get_group_by_id` and `get_group_bounds_by_id`,
which return the string and the bounds of a group, respectively, given its `id`:
```v ignore
txt := "my used string...."
for g_index := 0; g_index < re.group_count ; g_index++ {
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}")
}
```
More helper functions are listed in the **Groups query functions** section.
### Groups Continuous saving ### Groups Continuous saving
@ -251,59 +262,54 @@ The regex save until finish or found that the array have no space.
If the space ends no error is raised, further records will not be saved. If the space ends no error is raised, further records will not be saved.
```v ignore ```v ignore
fn example2() { import regex
test_regex() fn main(){
text := 'tst: 01,23,45 ,56, 78' txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query := r'.*:(\s*\d+[\s,]*)+' query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
mut re := new() or { panic(err) }
// re.debug = 2 mut re := regex.regex_opt(query) or { panic(err) }
re.group_csave_flag = true // enable continuous capture //println(re.get_code()) // uncomment to see the print of the regex execution code
re.compile_opt(query) or { re.debug=2 // enable maximum log
println(err) println("String: ${txt}")
return println("Query : ${re.get_query()}")
} re.debug=0 // disable log
q_str := re.get_query() re.group_csave_flag = true
println('Query: $q_str') start, end := re.match_string(txt)
start, end := re.match_string(text) if start >= 0 {
if start < 0 { println("Match ($start, $end) => [${txt[start..end]}]")
println('ERROR : ${re.get_parse_error_string(start)}, $start') } else {
} else { println("No Match")
println('found in [$start, $end] => [${text[start..end]}]') }
}
// groups capture if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{
mut gi := 0 println("cg: $re.group_csave")
for gi < re.groups.len { mut cs_i := 1
if re.groups[gi] >= 0 { for cs_i < re.group_csave[0]*3 {
println('${gi / 2} ${re.groups[gi]},${re.groups[gi + 1]} :[${text[re.groups[gi]..re.groups[gi + g_id := re.group_csave[cs_i]
1]]}]') st := re.group_csave[cs_i+1]
} en := re.group_csave[cs_i+2]
gi += 2 println("cg[$g_id] $st $en:[${txt[st..en]}]")
} cs_i += 3
// continuous saving }
gi = 0 }
println('num: ${re.group_csave[0]}')
for gi < re.group_csave[0] {
id := re.group_csave[1 + gi * 3]
st := re.group_csave[1 + gi * 3 + 1]
en := re.group_csave[1 + gi * 3 + 2]
println('cg id: $id [$st, $en] => [${text[st..en]}]')
gi++
}
} }
``` ```
The output will be: The output will be:
``` ```
Query: .*:(\s*\d+[\s,]*)+ String: http://www.ciao.mondo/hello/pippo12_/pera.html
found in [0, 21] => [tst: 01,23,45 ,56, 78] Query : #0(?P<format>https?)|{8,14}#0(?P<format>ftps?)://#1(?P<token>[\w_]+.)+
0 19,21 :[78] Match (0, 46) => [http://www.ciao.mondo/hello/pippo12_/pera.html]
num: 5 cg: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
cg id: 0 [4, 8] => [ 01,] cg[0] 0 4:[http]
cg id: 0 [8, 11] => [23,] cg[1] 7 11:[www.]
cg id: 0 [11, 15] => [45 ,] cg[1] 11 16:[ciao.]
cg id: 0 [15, 19] => [56, ] cg[1] 16 22:[mondo/]
cg id: 0 [19, 21] => [78] cg[1] 22 28:[hello/]
cg[1] 28 37:[pippo12_/]
cg[1] 37 42:[pera.]
cg[1] 42 46:[html]
``` ```
### Named capturing groups ### Named capturing groups
@ -323,89 +329,42 @@ example:
```v ignore ```v ignore
import regex import regex
fn main(){
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
fn main() { mut re := regex.regex_opt(query) or { panic(err) }
test_regex() //println(re.get_code()) // uncomment to see the print of the regex execution code
text := 'http://www.ciao.mondo/hello/pippo12_/pera.html' re.debug=2 // enable maximum log
query := r'(?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+' println("String: ${txt}")
mut re := new() println("Query : ${re.get_query()}")
re.debug = 2 re.debug=0 // disable log
// must provide an array of the right size if want the continuous saving of the groups start, end := re.match_string(txt)
re.group_csave = [-1].repeat(3 * 20 + 1) if start >= 0 {
re.compile_opt(query) or { println("Match ($start, $end) => [${txt[start..end]}]")
println(err) } else {
return println("No Match")
} }
q_str := re.get_query()
println('O.Query: $query') for name in re.group_map.keys() {
println('Query : $q_str') println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
re.debug = 0 bounds: ${re.get_group_bounds_by_name(name)}")
start, end := re.match_string(text) }
if start < 0 {
err_str := re.get_parse_error_string(start)
println('ERROR : $err_str, $start')
} else {
text1 := text[start..end]
println('found in [$start, $end] => [$text1]')
}
// groups
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println('${gi / 2} ${re.groups[gi]},${re.groups[gi + 1]} :[${text[re.groups[gi]..re.groups[gi +
1]]}]')
}
gi += 2
}
// continuous saving
gi = 0
println('num of group item saved: ${re.group_csave[0]}')
for gi < re.group_csave[0] {
id := re.group_csave[1 + gi * 3]
st := re.group_csave[1 + gi * 3 + 1]
en := re.group_csave[1 + gi * 3 + 2]
println('cg id: $id [$st, $en] => [${text[st..en]}]')
gi++
}
println('raw array: ${re.group_csave[0..gi * 3 + 2 - 1]}')
// named capturing groups
println('named capturing groups:')
for g_name in re.group_map.keys() {
s, e := re.get_group(g_name)
if s >= 0 && e > s {
println("'$g_name':[$s, $e] => '${text[s..e]}'")
} else {
println("Group [$g_name] doesn't exist.")
}
}
} }
``` ```
Output: Output:
``` ```
O.Query: (?P<format>https?)|(?:ftps?)://(?P<token>[\w_]+.)+ String: http://www.ciao.mondo/hello/pippo12_/pera.html
Query : #0(?P<format>https?)|{8,14}(?:ftps?)://#1(?P<token>[\w_]+.)+ Query : #0(?P<format>https?)|{8,14}#0(?P<format>ftps?)://#1(?P<token>[\w_]+.)+
found in [0, 46] => [http://www.ciao.mondo/hello/pippo12_/pera.html] Match (0, 46) => [http://www.ciao.mondo/hello/pippo12_/pera.html]
0 0,4 :[http] group:'format' => [http] bounds: (0, 4)
1 42,46 :[html] group:'token' => [html] bounds: (42, 46)
num of group item saved: 8
cg id: 0 [0, 4] => [http]
cg id: 1 [7, 11] => [www.]
cg id: 1 [11, 16] => [ciao.]
cg id: 1 [16, 22] => [mondo/]
cg id: 1 [22, 28] => [hello/]
cg id: 1 [28, 37] => [pippo12_/]
cg id: 1 [37, 42] => [pera.]
cg id: 1 [42, 46] => [html]
raw array: [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
named capturing groups:
'format':[0, 4] => 'http'
'token':[42, 46] => 'html'
``` ```
In order to simplify the use of the named groups it possible to use names map in the `re` In order to simplify the use of the named groups it possible to use names map in the `re`
struct using the function `re.get_group`. struct using the function `re.get_group_by_name`.
Here a more complex example of use: Here a more complex example of use:
@ -420,11 +379,11 @@ fn convert_html_rgb_n(in_col string) u32 {
println('start: $start, end: $end') println('start: $start, end: $end')
mut res := u32(0) mut res := u32(0)
if start >= 0 { if start >= 0 {
red_s, red_e := re.get_group('red') red_s, red_e := re.get_group_by_name('red')
r := ('0x' + in_col[red_s..red_e]).int() << col_mul r := ('0x' + in_col[red_s..red_e]).int() << col_mul
green_s, green_e := re.get_group('green') green_s, green_e := re.get_group_by_name('green')
g := ('0x' + in_col[green_s..green_e]).int() << col_mul g := ('0x' + in_col[green_s..green_e]).int() << col_mul
blue_s, blue_e := re.get_group('blue') blue_s, blue_e := re.get_group_by_name('blue')
b := ('0x' + in_col[blue_s..blue_e]).int() << col_mul b := ('0x' + in_col[blue_s..blue_e]).int() << col_mul
println('r: $r g: $g b: $b') println('r: $r g: $g b: $b')
res = u32(r) << 16 | u32(g) << 8 | u32(b) res = u32(r) << 16 | u32(g) << 8 | u32(b)
@ -433,7 +392,45 @@ fn convert_html_rgb_n(in_col string) u32 {
} }
``` ```
Other utility functions are `get_group_by_name` and `get_group_bounds_by_name`,
which return the string and the bounds of a group, respectively, given its `name`:
```v ignore
txt := "my used string...."
for name in re.group_map.keys() {
println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}")
}
```
### Groups query functions
These functions are helpers to query the captured groups:
```v ignore
// get_group_bounds_by_name get a group boundaries by its name
pub fn (re RE) get_group_bounds_by_name(group_name string) (int, int)
// get_group_by_name get a group string by its name
pub fn (re RE) get_group_by_name(in_txt string, group_name string) string
// get_group_bounds_by_id get a group boundaries by its id
pub fn (re RE) get_group_bounds_by_id(group_id int) (int,int)
// get_group_by_id get a group string by its id
pub fn (re RE) get_group_by_id(in_txt string, group_id int) string
struct Re_group {
pub:
start int = -1
end int = -1
}
// get_group_list return a list of Re_group for the found groups
pub fn (re RE) get_group_list() []Re_group
```
## Flags ## Flags
@ -501,6 +498,48 @@ pub fn (re mut RE) find_all(in_txt string) []int
pub fn (re mut RE) replace(in_txt string, repl string) string pub fn (re mut RE) replace(in_txt string, repl string) string
``` ```
## Find and Replace
For complex find and replace operations, the function `replace_by_fn` is available.
`replace_by_fn` uses a custom replace function, which makes customized replacements possible.
**The custom function is called for every non-overlapping match.**
The custom function must be of the type:
```v ignore
fn (re RE, in_txt string, start int, end int) string
```
The following example will clarify the use:
```v ignore
import regex
// customized replace functions
// it will be called on each non overlapped find
fn my_repl(re regex.RE, in_txt string, start int, end int) string {
g0 := re.get_group_by_id(in_txt, 0)
g1 := re.get_group_by_id(in_txt, 1)
g2 := re.get_group_by_id(in_txt, 2)
return "*$g0*$g1*$g2*"
}
fn main(){
txt := "today [John] is gone to his house with (Jack) and [Marie]."
query := r"(.)(\A\w+)(.)"
mut re := regex.regex_opt(query) or { panic(err) }
result := re.replace_by_fn(txt, my_repl)
println(result)
}
```
Output:
```
today *[*John*]* is gone to his house with *(*Jack*)* and *[*Marie*]*.
```
## Debugging ## Debugging
This module has few small utilities to help the writing of regex expressions. This module has few small utilities to help the writing of regex expressions.
@ -527,11 +566,20 @@ The result will be something like this:
``` ```
======================================== ========================================
v RegEx compiler v 0.9c output: v RegEx compiler v 1.0 alpha output:
PC: 0 ist: 7fffffff [a] query_ch { 1, 1} PC: 0 ist: 92000000 ( GROUP_START #:0 { 1, 1}
PC: 1 ist: 7fffffff [b] query_ch { 1,MAX} PC: 1 ist: 98000000 . DOT_CHAR nx chk: 4 { 1, 1}
PC: 2 ist: 88000000 PROG_END { 0, 0} PC: 2 ist: 94000000 ) GROUP_END #:0 { 1, 1}
PC: 3 ist: 92000000 ( GROUP_START #:1 { 1, 1}
PC: 4 ist: 90000000 [\A] BSLS { 1, 1}
PC: 5 ist: 90000000 [\w] BSLS { 1,MAX}
PC: 6 ist: 94000000 ) GROUP_END #:1 { 1, 1}
PC: 7 ist: 92000000 ( GROUP_START #:2 { 1, 1}
PC: 8 ist: 98000000 . DOT_CHAR nx chk: -1 last! { 1, 1}
PC: 9 ist: 94000000 ) GROUP_END #:2 { 1, 1}
PC: 10 ist: 88000000 PROG_END { 0, 0}
======================================== ========================================
``` ```
`PC`:`int` is the program counter or step of execution, each single step is a token. `PC`:`int` is the program counter or step of execution, each single step is a token.
@ -625,54 +673,29 @@ re.log_func = custom_print
Here there is a simple code to perform some basically match of strings Here there is a simple code to perform some basically match of strings
```v oksyntax ```v ignore
struct TestObj { import regex
source string // source string to parse
query string // regex query string
s int // expected match start index
e int // expected match end index
}
const ( fn main(){
tests = [ txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
TestObj{'this is a good.', r'this (\w+) a', 0, 9}, query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
TestObj{'this,these,those. over', r'(th[eio]se?[,. ])+', 0, 17},
TestObj{'test1@post.pip.com, pera', r'[\w]+@([\w]+\.)+\w+', 0, 18},
TestObj{'cpapaz ole. pippo,', r'.*c.+ole.*pi', 0, 14},
TestObj{'adce aabe', r'(a(ab)+)|(a(dc)+)e', 0, 4},
]
)
fn example() { mut re := regex.regex_opt(query) or { panic(err) }
for c, tst in tests {
mut re := regex.new()
re.compile_opt(tst.query) or {
println(err)
continue
}
// print the query parsed with the groups ids
re.debug = 1 // set debug on at minimum level
println('#${c:2d} query parsed: $re.get_query()')
re.debug = 0
// do the match
start, end := re.match_string(tst.source)
if start >= 0 && end > start {
println('#${c:2d} found in: [$start, $end] => [${tst.source[start..end]}]')
}
// print the groups
mut gi := 0
for gi < re.groups.len {
if re.groups[gi] >= 0 {
println('group ${gi / 2:2d} :[${tst.source[re.groups[gi]..re.groups[gi + 1]]}]')
}
gi += 2
}
println('')
}
}
fn main() { start, end := re.match_string(txt)
example() if start >= 0 {
println("Match ($start, $end) => [${txt[start..end]}]")
for g_index := 0; g_index < re.group_count ; g_index++ {
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}")
}
for name in re.group_map.keys() {
println("group:'$name' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}")
}
} else {
println("No Match")
}
} }
``` ```

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,7 @@ pub fn (mut re RE) compile_opt(pattern string) ? {
if re_err != compile_ok { if re_err != compile_ok {
mut err_msg := strings.new_builder(300) mut err_msg := strings.new_builder(300)
err_msg.write("query: $pattern\n") err_msg.write("\nquery: $pattern\n")
line := "-".repeat(err_pos) line := "-".repeat(err_pos)
err_msg.write("err : ${line}^\n") err_msg.write("err : ${line}^\n")
err_str := re.get_parse_error_string(re_err) err_str := re.get_parse_error_string(re_err)

View File

@ -21,6 +21,10 @@ match_test_suite = [
TestItem{"b",r"b|a",0,1}, TestItem{"b",r"b|a",0,1},
TestItem{"c",r"b|a",-1,0}, TestItem{"c",r"b|a",-1,0},
// test base
TestItem{"[ciao]",r"(.)ciao(.)",0,6},
TestItem{"[ciao] da me",r"(.)ciao(.)",0,6},
// positive // positive
TestItem{"this is a good.",r"this",0,4}, TestItem{"this is a good.",r"this",0,4},
TestItem{"this is a good.",r"good",10,14}, TestItem{"this is a good.",r"good",10,14},
@ -193,7 +197,8 @@ cgroups_test_suite = [
TestItemCGroup{ TestItemCGroup{
"http://www.ciao.mondo/hello/pippo12_/pera.html", "http://www.ciao.mondo/hello/pippo12_/pera.html",
r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46, r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+",0,46,
[8, 0, 0, 4, 1, 7, 12, 1, 11, 17, 1, 16, 23, 1, 22, 29, 1, 28, 38, 1, 37, 43, 1, 42, 46], [8, 0, 0, 4, 1, 7, 11, 1, 11, 16, 1, 16, 22, 1, 22, 28, 1, 28, 37, 1, 37, 42, 1, 42, 46]
//[8, 0, 0, 4, 1, 7, 10, 1, 11, 15, 1, 16, 21, 1, 22, 27, 1, 28, 36, 1, 37, 41, 1, 42, 46],
{'format':int(0),'token':1} {'format':int(0),'token':1}
}, },
TestItemCGroup{ TestItemCGroup{