1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00

regex: fix formatting inconsistencies in README.md (#17940)

This commit is contained in:
Turiiya 2023-04-13 13:44:45 +02:00 committed by GitHub
parent 524f7c3ead
commit 489ac892b9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,4 +1,5 @@
# Description
`regex` is a small but powerful regular expression library,
written in pure V.
@ -17,7 +18,6 @@ are valid for all the `regex` module features:
2. The basic atomic elements of this regex engine are the tokens.
In a query string a simple character is a token.
## Differences with PCRE:
> **Note**
@ -39,25 +39,24 @@ Note: **Two char classes with an `OR` in the middle is a syntax error.**
- The **match operation stops at the end of the string**. It does *NOT* stop
at new line characters.
- The **match operation stops at the end of the string**. It does *NOT* stop
at new line characters.
## Tokens
The tokens are the atomic units, used by this regex engine.
They can be one of the following:
### Simple char
This token is a simple single character like `a` or `b` etc.
### Match positional delimiters
`^` Matches the start of the string.
`$` Matches the end of the string.
### Char class (cc)
The character classes match all the chars specified inside. Use square
@ -98,14 +97,14 @@ For example `\w` is the meta-char `w`.
A meta-char can match different types of characters.
* `\w` matches a word char `[a-zA-Z0-9_]`
* `\W` matches a non word char
* `\d` matches a digit `[0-9]`
* `\D` matches a non digit
* `\s` matches a space char, one of `[' ','\t','\n','\r','\v','\f']`
* `\S` matches a non space char
* `\a` matches only a lowercase char `[a-z]`
* `\A` matches only an uppercase char `[A-Z]`
- `\w` matches a word char `[a-zA-Z0-9_]`
- `\W` matches a non word char
- `\d` matches a digit `[0-9]`
- `\D` matches a non digit
- `\s` matches a space char, one of `[' ','\t','\n','\r','\v','\f']`
- `\S` matches a non space char
- `\a` matches only a lowercase char `[a-z]`
- `\A` matches only an uppercase char `[A-Z]`
### Quantifier
@ -142,11 +141,12 @@ with a regex. The following table shows the query strings and the result of
parsing source string.
| query string | result |
|--------------|-------------|
| ------------ | ----------- |
| `.*c` | `abc` |
| `.*dd` | `abcc dd` |
| `ab.*e` | `abccc dde` |
| `ab.{3} .*e` | `abccc dde` |
The dot matches any character, until the next token match is satisfied.
> Important Note: Consecutive dots, for example `...`, are not allowed.
@ -278,10 +278,10 @@ Other utility functions are `get_group_by_id` and `get_group_bounds_by_id`
that get directly the string of a group using its `id`:
```v ignore
txt := "my used string...."
txt := 'my used string....'
for g_index := 0; g_index < re.group_count; g_index++ {
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}")
println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}')
}
```
@ -311,32 +311,33 @@ not be saved.
```v ignore
import regex
fn main() {
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'
mut re := regex.regex_opt(query) or { panic(err) }
// println(re.get_code()) // uncomment to see the print of the regex execution code
re.debug = 2 // enable maximum log
println("String: ${txt}")
println("Query : ${re.get_query()}")
println('String: ${txt}')
println('Query : ${re.get_query()}')
re.debug = 0 // disable log
re.group_csave_flag = true
start, end := re.match_string(txt)
if start >= 0 {
println("Match (${start}, ${end}) => [${txt[start..end]}]")
println('Match (${start}, ${end}) => [${txt[start..end]}]')
} else {
println("No Match")
println('No Match')
}
if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0 {
println("cg: ${re.group_csave}")
println('cg: ${re.group_csave}')
mut cs_i := 1
for cs_i < re.group_csave[0] * 3 {
g_id := re.group_csave[cs_i]
st := re.group_csave[cs_i + 1]
en := re.group_csave[cs_i + 2]
println("cg[${g_id}] ${st} ${en}:[${txt[st..en]}]")
println('cg[${g_id}] ${st} ${en}:[${txt[st..en]}]')
cs_i += 3
}
}
@ -374,23 +375,25 @@ that is a map from `string` to `int`, where the value is the index in
`group_csave` list of indexes.
Here is an example for how to use them:
```v ignore
import regex
fn main() {
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'
mut re := regex.regex_opt(query) or { panic(err) }
// println(re.get_code()) // uncomment to see the print of the regex execution code
re.debug = 2 // enable maximum log
println("String: ${txt}")
println("Query : ${re.get_query()}")
println('String: ${txt}')
println('Query : ${re.get_query()}')
re.debug = 0 // disable log
start, end := re.match_string(txt)
if start >= 0 {
println("Match (${start}, ${end}) => [${txt[start..end]}]")
println('Match (${start}, ${end}) => [${txt[start..end]}]')
} else {
println("No Match")
println('No Match')
}
for name in re.group_map.keys() {
@ -414,6 +417,7 @@ In order to simplify the use of the named groups, it is possible to
use a name map in the `re` struct, using the function `re.get_group_by_name`.
Here is a more complex example of using them:
```v oksyntax
// This function demonstrates the use of the named groups
fn convert_html_rgb_n(in_col string) u32 {
@ -443,15 +447,13 @@ Other utilities are `get_group_by_name` and `get_group_bounds_by_name`,
that return the string of a group using its `name`:
```v ignore
txt := "my used string...."
txt := 'my used string....'
for name in re.group_map.keys() {
println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}")
}
```
### Groups query functions
These functions are helpers to query the captured groups
@ -522,12 +524,14 @@ pub fn regex_opt(in_query string) ?RE
```v ignore
// new_regex create a REgex of small size, usually sufficient for ordinary use
pub fn new() RE
```
#### **Custom initialization**
For some particular needs, it is possible to initialize a fully customized regex:
```v ignore
pattern = r"ab(.*)(ac)"
pattern = r'ab(.*)(ac)'
// init custom regex
mut re := regex.RE{}
// max program length, can not be longer than the pattern
@ -541,13 +545,14 @@ re.group_max = pattern.len>>1 // we can't have more groups than the half
re.group_stack = []int{len: re.group_max, init: -1}
re.group_data = []int{len: re.group_max, init: -1}
```
### Compiling
After an initializer is used, the regex expression must be compiled with:
```v ignore
// compile compiles the REgex returning an error if the compilation fails
pub fn (re mut RE) compile_opt(in_txt string)?
pub fn (mut re RE) compile_opt(in_txt string) ?
```
### Matching Functions
@ -556,8 +561,7 @@ These are the matching functions
```v ignore
// match_string try to match the input string, return start and end index if found else start is -1
pub fn (re mut RE) match_string(in_txt string) (int,int)
pub fn (mut re RE) match_string(in_txt string) (int, int)
```
## Find and Replace
@ -569,16 +573,16 @@ There are the following find and replace functions:
```v ignore
// find try to find the first match in the input string
// return start and end index if found else start is -1
pub fn (re mut RE) find(in_txt string) (int,int)
pub fn (mut re RE) find(in_txt string) (int, int)
// find_all find all the "non overlapping" occurrences of the matching pattern
// return a list of start end indexes like: [3,4,6,8]
// the matches are [3,4] and [6,8]
pub fn (re mut RE) find_all(in_txt string) []int
pub fn (mut re RE) find_all(in_txt string) []int
// find_all_str finds all the "non overlapping" occurrences of the matching pattern
// and returns a list of strings
// the result is like ["first match","second match"]
// the result is like ['first match','second match']
pub fn (mut re RE) find_all_str(in_txt string) []string
```
@ -587,16 +591,16 @@ pub fn (mut re RE) find_all_str(in_txt string) []string
```v ignore
// replace return a string where the matches are replaced with the repl_str string,
// this function support groups in the replace string
pub fn (re mut RE) replace(in_txt string, repl string) string
pub fn (mut re RE) replace(in_txt string, repl string) string
```
The replace string can include group references:
```v ignore
txt := "Today it is a good day."
txt := 'Today it is a good day.'
query := r'(a\w)[ ,.]'
mut re := regex.regex_opt(query)?
res := re.replace(txt, r"__[\0]__")
res := re.replace(txt, r'__[\0]__')
```
In this example we used the group `0` in the replace string: `\0`. The result will be:
@ -617,6 +621,7 @@ pub fn (mut re RE) replace_simple(in_txt string, repl string) string
```
If it is needed to replace N instances of the found strings it is possible to use:
```v ignore
// replace_n returns a string where the first `count` matches are replaced with the repl_str string
// `count` indicates the maximum number of replacements that will be done.
@ -650,16 +655,17 @@ The following example will clarify its usage:
import regex
// customized replace functions
// it will be called on each non overlapped find
fn my_repl(re regex.RE, in_txt string, start int, end int) string {
g0 := re.get_group_by_id(in_txt, 0)
g1 := re.get_group_by_id(in_txt, 1)
g2 := re.get_group_by_id(in_txt, 2)
return "*${g0}*${g1}*${g2}*"
return '*${g0}*${g1}*${g2}*'
}
fn main() {
txt := "today [John] is gone to his house with (Jack) and [Marie]."
query := r"(.)(\A\w+)(.)"
txt := 'today [John] is gone to his house with (Jack) and [Marie].'
query := r'(.)(\A\w+)(.)'
mut re := regex.regex_opt(query) or { panic(err) }
@ -674,8 +680,6 @@ Output:
today *[*John*]* is gone to his house with *(*Jack*)* and *[*Marie*]*.
```
## Debugging
This module has a few small utilities to help you write regex patterns.
@ -811,34 +815,36 @@ Here is an example that performs some basic matching of strings
import regex
fn main() {
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'
mut re := regex.regex_opt(query) or { panic(err) }
start, end := re.match_string(txt)
if start >= 0 {
println("Match (${start}, ${end}) => [${txt[start..end]}]")
println('Match (${start}, ${end}) => [${txt[start..end]}]')
for g_index := 0; g_index < re.group_count; g_index++ {
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}")
println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}')
}
for name in re.group_map.keys() {
println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}")
}
} else {
println("No Match")
println('No Match')
}
}
```
Here is an example of total customization of the regex environment creation:
```v ignore
import regex
fn main() {
txt := "today John is gone to his house with Jack and Marie."
query := r"(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+"
txt := 'today John is gone to his house with Jack and Marie.'
query := r'(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+'
// init regex
mut re := regex.RE{}
@ -859,37 +865,37 @@ fn main(){
start, end := re.match_string(txt)
if start >= 0 {
println("Match (${start}, ${end}) => [${txt[start..end]}]")
println('Match (${start}, ${end}) => [${txt[start..end]}]')
} else {
println("No Match")
println('No Match')
}
// show results for continuous group saving
if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0 {
println("cg: ${re.group_csave}")
println('cg: ${re.group_csave}')
mut cs_i := 1
for cs_i < re.group_csave[0] * 3 {
g_id := re.group_csave[cs_i]
st := re.group_csave[cs_i + 1]
en := re.group_csave[cs_i + 2]
println("cg[${g_id}] ${st} ${en}:[${txt[st..en]}]")
println('cg[${g_id}] ${st} ${en}:[${txt[st..en]}]')
cs_i += 3
}
}
// show results for captured groups
if start >= 0 {
println("Match (${start}, ${end}) => [${txt[start..end]}]")
println('Match (${start}, ${end}) => [${txt[start..end]}]')
for g_index := 0; g_index < re.group_count; g_index++ {
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}")
println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
bounds: ${re.get_group_bounds_by_id(g_index)}')
}
for name in re.group_map.keys() {
println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
bounds: ${re.get_group_bounds_by_name(name)}")
}
} else {
println("No Match")
println('No Match')
}
}
```