1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00

regex: fix formatting inconsistencies in README.md (#17940)

This commit is contained in:
Turiiya 2023-04-13 13:44:45 +02:00 committed by GitHub
parent 524f7c3ead
commit 489ac892b9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,4 +1,5 @@
# Description # Description
`regex` is a small but powerful regular expression library, `regex` is a small but powerful regular expression library,
written in pure V. written in pure V.
@ -17,7 +18,6 @@ are valid for all the `regex` module features:
2. The basic atomic elements of this regex engine are the tokens. 2. The basic atomic elements of this regex engine are the tokens.
In a query string a simple character is a token. In a query string a simple character is a token.
## Differences with PCRE: ## Differences with PCRE:
> **Note** > **Note**
@ -39,25 +39,24 @@ Note: **Two char classes with an `OR` in the middle is a syntax error.**
- The **match operation stops at the end of the string**. It does *NOT* stop - The **match operation stops at the end of the string**. It does *NOT* stop
at new line characters. at new line characters.
- The **match operation stops at the end of the string**. It does *NOT* stop
at new line characters.
## Tokens ## Tokens
The tokens are the atomic units, used by this regex engine. The tokens are the atomic units, used by this regex engine.
They can be one of the following: They can be one of the following:
### Simple char ### Simple char
This token is a simple single character like `a` or `b` etc. This token is a simple single character like `a` or `b` etc.
### Match positional delimiters ### Match positional delimiters
`^` Matches the start of the string. `^` Matches the start of the string.
`$` Matches the end of the string. `$` Matches the end of the string.
### Char class (cc) ### Char class (cc)
The character classes match all the chars specified inside. Use square The character classes match all the chars specified inside. Use square
@ -98,14 +97,14 @@ For example `\w` is the meta-char `w`.
A meta-char can match different types of characters. A meta-char can match different types of characters.
* `\w` matches a word char `[a-zA-Z0-9_]` - `\w` matches a word char `[a-zA-Z0-9_]`
* `\W` matches a non word char - `\W` matches a non word char
* `\d` matches a digit `[0-9]` - `\d` matches a digit `[0-9]`
* `\D` matches a non digit - `\D` matches a non digit
* `\s` matches a space char, one of `[' ','\t','\n','\r','\v','\f']` - `\s` matches a space char, one of `[' ','\t','\n','\r','\v','\f']`
* `\S` matches a non space char - `\S` matches a non space char
* `\a` matches only a lowercase char `[a-z]` - `\a` matches only a lowercase char `[a-z]`
* `\A` matches only an uppercase char `[A-Z]` - `\A` matches only an uppercase char `[A-Z]`
### Quantifier ### Quantifier
@ -142,11 +141,12 @@ with a regex. The following table show the query strings and the result of
parsing source string. parsing source string.
| query string | result | | query string | result |
|--------------|-------------| | ------------ | ----------- |
| `.*c` | `abc` | | `.*c` | `abc` |
| `.*dd` | `abcc dd` | | `.*dd` | `abcc dd` |
| `ab.*e` | `abccc dde` | | `ab.*e` | `abccc dde` |
| `ab.{3} .*e` | `abccc dde` | | `ab.{3} .*e` | `abccc dde` |
The dot matches any character, until the next token match is satisfied. The dot matches any character, until the next token match is satisfied.
> Important Note: Consecutive dots, for example `...`, are not allowed. > Important Note: Consecutive dots, for example `...`, are not allowed.
@ -278,10 +278,10 @@ Others utility functions are `get_group_by_id` and `get_group_bounds_by_id`
that get directly the string of a group using its `id`: that get directly the string of a group using its `id`:
```v ignore ```v ignore
txt := "my used string...." txt := 'my used string....'
for g_index := 0; g_index < re.group_count; g_index++ { for g_index := 0; g_index < re.group_count; g_index++ {
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \ println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}") bounds: ${re.get_group_bounds_by_id(g_index)}')
} }
``` ```
@ -311,32 +311,33 @@ not be saved.
```v ignore ```v ignore
import regex import regex
fn main() { fn main() {
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html" txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+" query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'
mut re := regex.regex_opt(query) or { panic(err) } mut re := regex.regex_opt(query) or { panic(err) }
// println(re.get_code()) // uncomment to see the print of the regex execution code // println(re.get_code()) // uncomment to see the print of the regex execution code
re.debug = 2 // enable maximum log re.debug = 2 // enable maximum log
println("String: ${txt}") println('String: ${txt}')
println("Query : ${re.get_query()}") println('Query : ${re.get_query()}')
re.debug = 0 // disable log re.debug = 0 // disable log
re.group_csave_flag = true re.group_csave_flag = true
start, end := re.match_string(txt) start, end := re.match_string(txt)
if start >= 0 { if start >= 0 {
println("Match (${start}, ${end}) => [${txt[start..end]}]") println('Match (${start}, ${end}) => [${txt[start..end]}]')
} else { } else {
println("No Match") println('No Match')
} }
if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0 { if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0 {
println("cg: ${re.group_csave}") println('cg: ${re.group_csave}')
mut cs_i := 1 mut cs_i := 1
for cs_i < re.group_csave[0] * 3 { for cs_i < re.group_csave[0] * 3 {
g_id := re.group_csave[cs_i] g_id := re.group_csave[cs_i]
st := re.group_csave[cs_i + 1] st := re.group_csave[cs_i + 1]
en := re.group_csave[cs_i + 2] en := re.group_csave[cs_i + 2]
println("cg[${g_id}] ${st} ${en}:[${txt[st..en]}]") println('cg[${g_id}] ${st} ${en}:[${txt[st..en]}]')
cs_i += 3 cs_i += 3
} }
} }
@ -374,23 +375,25 @@ that is a map from `string` to `int`, where the value is the index in
`group_csave` list of indexes. `group_csave` list of indexes.
Here is an example for how to use them: Here is an example for how to use them:
```v ignore ```v ignore
import regex import regex
fn main() { fn main() {
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html" txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+" query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'
mut re := regex.regex_opt(query) or { panic(err) } mut re := regex.regex_opt(query) or { panic(err) }
// println(re.get_code()) // uncomment to see the print of the regex execution code // println(re.get_code()) // uncomment to see the print of the regex execution code
re.debug = 2 // enable maximum log re.debug = 2 // enable maximum log
println("String: ${txt}") println('String: ${txt}')
println("Query : ${re.get_query()}") println('Query : ${re.get_query()}')
re.debug = 0 // disable log re.debug = 0 // disable log
start, end := re.match_string(txt) start, end := re.match_string(txt)
if start >= 0 { if start >= 0 {
println("Match (${start}, ${end}) => [${txt[start..end]}]") println('Match (${start}, ${end}) => [${txt[start..end]}]')
} else { } else {
println("No Match") println('No Match')
} }
for name in re.group_map.keys() { for name in re.group_map.keys() {
@ -414,6 +417,7 @@ In order to simplify the use of the named groups, it is possible to
use a name map in the `re` struct, using the function `re.get_group_by_name`. use a name map in the `re` struct, using the function `re.get_group_by_name`.
Here is a more complex example of using them: Here is a more complex example of using them:
```v oksyntax ```v oksyntax
// This function demonstrates the use of the named groups // This function demonstrates the use of the named groups
fn convert_html_rgb_n(in_col string) u32 { fn convert_html_rgb_n(in_col string) u32 {
@ -443,15 +447,13 @@ Other utilities are `get_group_by_name` and `get_group_bounds_by_name`,
that return the string of a group using its `name`: that return the string of a group using its `name`:
```v ignore ```v ignore
txt := "my used string...." txt := 'my used string....'
for name in re.group_map.keys() { for name in re.group_map.keys() {
println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \ println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}") bounds: ${re.get_group_bounds_by_name(name)}")
} }
``` ```
### Groups query functions ### Groups query functions
These functions are helpers to query the captured groups These functions are helpers to query the captured groups
@ -522,12 +524,14 @@ pub fn regex_opt(in_query string) ?RE
```v ignore ```v ignore
// new_regex create a REgex of small size, usually sufficient for ordinary use // new_regex create a REgex of small size, usually sufficient for ordinary use
pub fn new() RE pub fn new() RE
``` ```
#### **Custom initialization** #### **Custom initialization**
For some particular needs, it is possible to initialize a fully customized regex: For some particular needs, it is possible to initialize a fully customized regex:
```v ignore ```v ignore
pattern = r"ab(.*)(ac)" pattern = r'ab(.*)(ac)'
// init custom regex // init custom regex
mut re := regex.RE{} mut re := regex.RE{}
// max program length, cannot be longer than the pattern // max program length, cannot be longer than the pattern
@ -541,13 +545,14 @@ re.group_max = pattern.len>>1 // we can't have more groups than the half
re.group_stack = []int{len: re.group_max, init: -1} re.group_stack = []int{len: re.group_max, init: -1}
re.group_data = []int{len: re.group_max, init: -1} re.group_data = []int{len: re.group_max, init: -1}
``` ```
### Compiling ### Compiling
After an initializer is used, the regex expression must be compiled with: After an initializer is used, the regex expression must be compiled with:
```v ignore ```v ignore
// compile compiles the REgex returning an error if the compilation fails // compile compiles the REgex returning an error if the compilation fails
pub fn (re mut RE) compile_opt(in_txt string)? pub fn (mut re RE) compile_opt(in_txt string) ?
``` ```
### Matching Functions ### Matching Functions
@ -556,8 +561,7 @@ These are the matching functions
```v ignore ```v ignore
// match_string try to match the input string, return start and end index if found else start is -1 // match_string try to match the input string, return start and end index if found else start is -1
pub fn (re mut RE) match_string(in_txt string) (int,int) pub fn (mut re RE) match_string(in_txt string) (int, int)
``` ```
## Find and Replace ## Find and Replace
@ -569,16 +573,16 @@ There are the following find and replace functions:
```v ignore ```v ignore
// find try to find the first match in the input string // find try to find the first match in the input string
// return start and end index if found else start is -1 // return start and end index if found else start is -1
pub fn (re mut RE) find(in_txt string) (int,int) pub fn (mut re RE) find(in_txt string) (int, int)
// find_all find all the "non overlapping" occurrences of the matching pattern // find_all find all the "non overlapping" occurrences of the matching pattern
// return a list of start end indexes like: [3,4,6,8] // return a list of start end indexes like: [3,4,6,8]
// the matches are [3,4] and [6,8] // the matches are [3,4] and [6,8]
pub fn (re mut RE) find_all(in_txt string) []int pub fn (mut re RE) find_all(in_txt string) []int
// find_all_str find all the "non overlapping" occurrences of the matching pattern // find_all_str find all the "non overlapping" occurrences of the matching pattern
// return a list of strings // return a list of strings
// the result is like ["first match","second match"] // the result is like ['first match','second match']
pub fn (mut re RE) find_all_str(in_txt string) []string pub fn (mut re RE) find_all_str(in_txt string) []string
``` ```
@ -587,16 +591,16 @@ pub fn (mut re RE) find_all_str(in_txt string) []string
```v ignore ```v ignore
// replace return a string where the matches are replaced with the repl_str string, // replace return a string where the matches are replaced with the repl_str string,
// this function support groups in the replace string // this function support groups in the replace string
pub fn (re mut RE) replace(in_txt string, repl string) string pub fn (mut re RE) replace(in_txt string, repl string) string
``` ```
replace string can include groups references: replace string can include groups references:
```v ignore ```v ignore
txt := "Today it is a good day." txt := 'Today it is a good day.'
query := r'(a\w)[ ,.]' query := r'(a\w)[ ,.]'
mut re := regex.regex_opt(query)? mut re := regex.regex_opt(query)?
res := re.replace(txt, r"__[\0]__") res := re.replace(txt, r'__[\0]__')
``` ```
in this example we used the group `0` in the replace string: `\0`, the result will be: in this example we used the group `0` in the replace string: `\0`, the result will be:
@ -617,6 +621,7 @@ pub fn (mut re RE) replace_simple(in_txt string, repl string) string
``` ```
If it is needed to replace N instances of the found strings it is possible to use: If it is needed to replace N instances of the found strings it is possible to use:
```v ignore ```v ignore
// replace_n return a string where the first `count` matches are replaced with the repl_str string // replace_n return a string where the first `count` matches are replaced with the repl_str string
// `count` indicate the number of max replacements that will be done. // `count` indicate the number of max replacements that will be done.
@ -650,16 +655,17 @@ The following example will clarify its usage:
import regex import regex
// customized replace functions // customized replace functions
// it will be called on each non overlapped find // it will be called on each non overlapped find
fn my_repl(re regex.RE, in_txt string, start int, end int) string { fn my_repl(re regex.RE, in_txt string, start int, end int) string {
g0 := re.get_group_by_id(in_txt, 0) g0 := re.get_group_by_id(in_txt, 0)
g1 := re.get_group_by_id(in_txt, 1) g1 := re.get_group_by_id(in_txt, 1)
g2 := re.get_group_by_id(in_txt, 2) g2 := re.get_group_by_id(in_txt, 2)
return "*${g0}*${g1}*${g2}*" return '*${g0}*${g1}*${g2}*'
} }
fn main() { fn main() {
txt := "today [John] is gone to his house with (Jack) and [Marie]." txt := 'today [John] is gone to his house with (Jack) and [Marie].'
query := r"(.)(\A\w+)(.)" query := r'(.)(\A\w+)(.)'
mut re := regex.regex_opt(query) or { panic(err) } mut re := regex.regex_opt(query) or { panic(err) }
@ -674,8 +680,6 @@ Output:
today *[*John*]* is gone to his house with *(*Jack*)* and *[*Marie*]*. today *[*John*]* is gone to his house with *(*Jack*)* and *[*Marie*]*.
``` ```
## Debugging ## Debugging
This module has a few small utilities to help you write regex patterns. This module has a few small utilities to help you write regex patterns.
@ -811,34 +815,36 @@ Here an example that perform some basically match of strings
import regex import regex
fn main() { fn main() {
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html" txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+" query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'
mut re := regex.regex_opt(query) or { panic(err) } mut re := regex.regex_opt(query) or { panic(err) }
start, end := re.match_string(txt) start, end := re.match_string(txt)
if start >= 0 { if start >= 0 {
println("Match (${start}, ${end}) => [${txt[start..end]}]") println('Match (${start}, ${end}) => [${txt[start..end]}]')
for g_index := 0; g_index < re.group_count; g_index++ { for g_index := 0; g_index < re.group_count; g_index++ {
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \ println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}") bounds: ${re.get_group_bounds_by_id(g_index)}')
} }
for name in re.group_map.keys() { for name in re.group_map.keys() {
println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \ println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}") bounds: ${re.get_group_bounds_by_name(name)}")
} }
} else { } else {
println("No Match") println('No Match')
} }
} }
``` ```
Here is an example of total customization of the regex environment creation: Here is an example of total customization of the regex environment creation:
```v ignore ```v ignore
import regex import regex
fn main() { fn main() {
txt := "today John is gone to his house with Jack and Marie." txt := 'today John is gone to his house with Jack and Marie.'
query := r"(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+" query := r'(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+'
// init regex // init regex
mut re := regex.RE{} mut re := regex.RE{}
@ -859,37 +865,37 @@ fn main(){
start, end := re.match_string(txt) start, end := re.match_string(txt)
if start >= 0 { if start >= 0 {
println("Match (${start}, ${end}) => [${txt[start..end]}]") println('Match (${start}, ${end}) => [${txt[start..end]}]')
} else { } else {
println("No Match") println('No Match')
} }
// show results for continuous group saving // show results for continuous group saving
if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0 { if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0 {
println("cg: ${re.group_csave}") println('cg: ${re.group_csave}')
mut cs_i := 1 mut cs_i := 1
for cs_i < re.group_csave[0] * 3 { for cs_i < re.group_csave[0] * 3 {
g_id := re.group_csave[cs_i] g_id := re.group_csave[cs_i]
st := re.group_csave[cs_i + 1] st := re.group_csave[cs_i + 1]
en := re.group_csave[cs_i + 2] en := re.group_csave[cs_i + 2]
println("cg[${g_id}] ${st} ${en}:[${txt[st..en]}]") println('cg[${g_id}] ${st} ${en}:[${txt[st..en]}]')
cs_i += 3 cs_i += 3
} }
} }
// show results for captured groups // show results for captured groups
if start >= 0 { if start >= 0 {
println("Match (${start}, ${end}) => [${txt[start..end]}]") println('Match (${start}, ${end}) => [${txt[start..end]}]')
for g_index := 0; g_index < re.group_count; g_index++ { for g_index := 0; g_index < re.group_count; g_index++ {
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \ println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}") bounds: ${re.get_group_bounds_by_id(g_index)}')
} }
for name in re.group_map.keys() { for name in re.group_map.keys() {
println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \ println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}") bounds: ${re.get_group_bounds_by_name(name)}")
} }
} else { } else {
println("No Match") println('No Match')
} }
} }
``` ```