mirror of
https://github.com/vlang/v.git
synced 2023-08-10 21:13:21 +03:00
regex: fix formatting inconsistencies in README.md (#17940)
This commit is contained in:
parent
524f7c3ead
commit
489ac892b9
@ -1,4 +1,5 @@
|
||||
# Description
|
||||
|
||||
`regex` is a small but powerful regular expression library,
|
||||
written in pure V.
|
||||
|
||||
@ -17,7 +18,6 @@ are valid for all the `regex` module features:
|
||||
2. The basic atomic elements of this regex engine are the tokens.
|
||||
In a query string a simple character is a token.
|
||||
|
||||
|
||||
## Differences with PCRE:
|
||||
|
||||
> **Note**
|
||||
@ -39,25 +39,24 @@ Note: **Two char classes with an `OR` in the middle is a syntax error.**
|
||||
- The **match operation stops at the end of the string**. It does *NOT* stop
|
||||
at new line characters.
|
||||
|
||||
- The **match operation stops at the end of the string**. It does *NOT* stop
|
||||
at new line characters.
|
||||
|
||||
## Tokens
|
||||
|
||||
The tokens are the atomic units, used by this regex engine.
|
||||
They can be one of the following:
|
||||
|
||||
|
||||
### Simple char
|
||||
|
||||
This token is a simple single character like `a` or `b` etc.
|
||||
|
||||
|
||||
### Match positional delimiters
|
||||
|
||||
`^` Matches the start of the string.
|
||||
|
||||
`$` Matches the end of the string.
|
||||
|
||||
|
||||
### Char class (cc)
|
||||
|
||||
The character classes match all the chars specified inside. Use square
|
||||
@ -98,14 +97,14 @@ For example `\w` is the meta-char `w`.
|
||||
|
||||
A meta-char can match different types of characters.
|
||||
|
||||
* `\w` matches a word char `[a-zA-Z0-9_]`
|
||||
* `\W` matches a non word char
|
||||
* `\d` matches a digit `[0-9]`
|
||||
* `\D` matches a non digit
|
||||
* `\s` matches a space char, one of `[' ','\t','\n','\r','\v','\f']`
|
||||
* `\S` matches a non space char
|
||||
* `\a` matches only a lowercase char `[a-z]`
|
||||
* `\A` matches only an uppercase char `[A-Z]`
|
||||
- `\w` matches a word char `[a-zA-Z0-9_]`
|
||||
- `\W` matches a non word char
|
||||
- `\d` matches a digit `[0-9]`
|
||||
- `\D` matches a non digit
|
||||
- `\s` matches a space char, one of `[' ','\t','\n','\r','\v','\f']`
|
||||
- `\S` matches a non space char
|
||||
- `\a` matches only a lowercase char `[a-z]`
|
||||
- `\A` matches only an uppercase char `[A-Z]`
|
||||
|
||||
### Quantifier
|
||||
|
||||
@ -142,11 +141,12 @@ with a regex. The following table show the query strings and the result of
|
||||
parsing source string.
|
||||
|
||||
| query string | result |
|
||||
|--------------|-------------|
|
||||
| ------------ | ----------- |
|
||||
| `.*c` | `abc` |
|
||||
| `.*dd` | `abcc dd` |
|
||||
| `ab.*e` | `abccc dde` |
|
||||
| `ab.{3} .*e` | `abccc dde` |
|
||||
|
||||
The dot matches any character, until the next token match is satisfied.
|
||||
|
||||
> Important Note: Consecutive dots, for example `...`, are not allowed.
|
||||
@ -278,10 +278,10 @@ Others utility functions are `get_group_by_id` and `get_group_bounds_by_id`
|
||||
that get directly the string of a group using its `id`:
|
||||
|
||||
```v ignore
|
||||
txt := "my used string...."
|
||||
txt := 'my used string....'
|
||||
for g_index := 0; g_index < re.group_count; g_index++ {
|
||||
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
|
||||
bounds: ${re.get_group_bounds_by_id(g_index)}")
|
||||
println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
|
||||
bounds: ${re.get_group_bounds_by_id(g_index)}')
|
||||
}
|
||||
```
|
||||
|
||||
@ -311,32 +311,33 @@ not be saved.
|
||||
|
||||
```v ignore
|
||||
import regex
|
||||
|
||||
fn main() {
|
||||
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
|
||||
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
|
||||
txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
|
||||
query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'
|
||||
|
||||
mut re := regex.regex_opt(query) or { panic(err) }
|
||||
// println(re.get_code()) // uncomment to see the print of the regex execution code
|
||||
re.debug = 2 // enable maximum log
|
||||
println("String: ${txt}")
|
||||
println("Query : ${re.get_query()}")
|
||||
println('String: ${txt}')
|
||||
println('Query : ${re.get_query()}')
|
||||
re.debug = 0 // disable log
|
||||
re.group_csave_flag = true
|
||||
start, end := re.match_string(txt)
|
||||
if start >= 0 {
|
||||
println("Match (${start}, ${end}) => [${txt[start..end]}]")
|
||||
println('Match (${start}, ${end}) => [${txt[start..end]}]')
|
||||
} else {
|
||||
println("No Match")
|
||||
println('No Match')
|
||||
}
|
||||
|
||||
if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0 {
|
||||
println("cg: ${re.group_csave}")
|
||||
println('cg: ${re.group_csave}')
|
||||
mut cs_i := 1
|
||||
for cs_i < re.group_csave[0] * 3 {
|
||||
g_id := re.group_csave[cs_i]
|
||||
st := re.group_csave[cs_i + 1]
|
||||
en := re.group_csave[cs_i + 2]
|
||||
println("cg[${g_id}] ${st} ${en}:[${txt[st..en]}]")
|
||||
println('cg[${g_id}] ${st} ${en}:[${txt[st..en]}]')
|
||||
cs_i += 3
|
||||
}
|
||||
}
|
||||
@ -374,23 +375,25 @@ that is a map from `string` to `int`, where the value is the index in
|
||||
`group_csave` list of indexes.
|
||||
|
||||
Here is an example for how to use them:
|
||||
|
||||
```v ignore
|
||||
import regex
|
||||
|
||||
fn main() {
|
||||
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
|
||||
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
|
||||
txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
|
||||
query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'
|
||||
|
||||
mut re := regex.regex_opt(query) or { panic(err) }
|
||||
// println(re.get_code()) // uncomment to see the print of the regex execution code
|
||||
re.debug = 2 // enable maximum log
|
||||
println("String: ${txt}")
|
||||
println("Query : ${re.get_query()}")
|
||||
println('String: ${txt}')
|
||||
println('Query : ${re.get_query()}')
|
||||
re.debug = 0 // disable log
|
||||
start, end := re.match_string(txt)
|
||||
if start >= 0 {
|
||||
println("Match (${start}, ${end}) => [${txt[start..end]}]")
|
||||
println('Match (${start}, ${end}) => [${txt[start..end]}]')
|
||||
} else {
|
||||
println("No Match")
|
||||
println('No Match')
|
||||
}
|
||||
|
||||
for name in re.group_map.keys() {
|
||||
@ -414,6 +417,7 @@ In order to simplify the use of the named groups, it is possible to
|
||||
use a name map in the `re` struct, using the function `re.get_group_by_name`.
|
||||
|
||||
Here is a more complex example of using them:
|
||||
|
||||
```v oksyntax
|
||||
// This function demonstrates the use of the named groups
|
||||
fn convert_html_rgb_n(in_col string) u32 {
|
||||
@ -443,15 +447,13 @@ Other utilities are `get_group_by_name` and `get_group_bounds_by_name`,
|
||||
that return the string of a group using its `name`:
|
||||
|
||||
```v ignore
|
||||
txt := "my used string...."
|
||||
txt := 'my used string....'
|
||||
for name in re.group_map.keys() {
|
||||
println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
|
||||
bounds: ${re.get_group_bounds_by_name(name)}")
|
||||
}
|
||||
```
|
||||
|
||||
|
||||
|
||||
### Groups query functions
|
||||
|
||||
These functions are helpers to query the captured groups
|
||||
@ -522,12 +524,14 @@ pub fn regex_opt(in_query string) ?RE
|
||||
```v ignore
|
||||
// new creates a REgex of small size, usually sufficient for ordinary use
|
||||
pub fn new() RE
|
||||
|
||||
```
|
||||
|
||||
#### **Custom initialization**
|
||||
|
||||
For some particular needs, it is possible to initialize a fully customized regex:
|
||||
|
||||
```v ignore
|
||||
pattern = r"ab(.*)(ac)"
|
||||
pattern = r'ab(.*)(ac)'
|
||||
// init custom regex
|
||||
mut re := regex.RE{}
|
||||
// max program length, cannot be longer than the pattern
|
||||
@ -541,13 +545,14 @@ re.group_max = pattern.len>>1 // we can't have more groups than the half
|
||||
re.group_stack = []int{len: re.group_max, init: -1}
|
||||
re.group_data = []int{len: re.group_max, init: -1}
|
||||
```
|
||||
|
||||
### Compiling
|
||||
|
||||
After an initializer is used, the regex expression must be compiled with:
|
||||
|
||||
```v ignore
|
||||
// compile compiles the REgex returning an error if the compilation fails
|
||||
pub fn (re mut RE) compile_opt(in_txt string)?
|
||||
pub fn (mut re RE) compile_opt(in_txt string) ?
|
||||
```
|
||||
|
||||
### Matching Functions
|
||||
@ -556,8 +561,7 @@ These are the matching functions
|
||||
|
||||
```v ignore
|
||||
// match_string try to match the input string, return start and end index if found else start is -1
|
||||
pub fn (re mut RE) match_string(in_txt string) (int,int)
|
||||
|
||||
pub fn (mut re RE) match_string(in_txt string) (int, int)
|
||||
```
|
||||
|
||||
## Find and Replace
|
||||
@ -569,16 +573,16 @@ There are the following find and replace functions:
|
||||
```v ignore
|
||||
// find try to find the first match in the input string
|
||||
// return start and end index if found else start is -1
|
||||
pub fn (re mut RE) find(in_txt string) (int,int)
|
||||
pub fn (mut re RE) find(in_txt string) (int, int)
|
||||
|
||||
// find_all find all the "non overlapping" occurrences of the matching pattern
|
||||
// return a list of start end indexes like: [3,4,6,8]
|
||||
// the matches are [3,4] and [6,8]
|
||||
pub fn (re mut RE) find_all(in_txt string) []int
|
||||
pub fn (mut re RE) find_all(in_txt string) []int
|
||||
|
||||
// find_all_str find all the "non overlapping" occurrences of the matching pattern
|
||||
// return a list of strings
|
||||
// the result is like ["first match","second match"]
|
||||
// the result is like ['first match','second match']
|
||||
pub fn (mut re RE) find_all_str(in_txt string) []string
|
||||
```
|
||||
|
||||
@ -587,16 +591,16 @@ pub fn (mut re RE) find_all_str(in_txt string) []string
|
||||
```v ignore
|
||||
// replace return a string where the matches are replaced with the repl_str string,
|
||||
// this function support groups in the replace string
|
||||
pub fn (re mut RE) replace(in_txt string, repl string) string
|
||||
pub fn (mut re RE) replace(in_txt string, repl string) string
|
||||
```
|
||||
|
||||
The replace string can include group references:
|
||||
|
||||
```v ignore
|
||||
txt := "Today it is a good day."
|
||||
txt := 'Today it is a good day.'
|
||||
query := r'(a\w)[ ,.]'
|
||||
mut re := regex.regex_opt(query)?
|
||||
res := re.replace(txt, r"__[\0]__")
|
||||
res := re.replace(txt, r'__[\0]__')
|
||||
```
|
||||
|
||||
In this example we used the group `0` in the replace string: `\0`. The result will be:
|
||||
@ -617,6 +621,7 @@ pub fn (mut re RE) replace_simple(in_txt string, repl string) string
|
||||
```
|
||||
|
||||
If it is needed to replace N instances of the found strings it is possible to use:
|
||||
|
||||
```v ignore
|
||||
// replace_n return a string where the first `count` matches are replaced with the repl_str string
|
||||
// `count` indicate the number of max replacements that will be done.
|
||||
@ -650,16 +655,17 @@ The following example will clarify its usage:
|
||||
import regex
|
||||
// customized replace functions
|
||||
// it will be called on each non overlapped find
|
||||
|
||||
fn my_repl(re regex.RE, in_txt string, start int, end int) string {
|
||||
g0 := re.get_group_by_id(in_txt, 0)
|
||||
g1 := re.get_group_by_id(in_txt, 1)
|
||||
g2 := re.get_group_by_id(in_txt, 2)
|
||||
return "*${g0}*${g1}*${g2}*"
|
||||
return '*${g0}*${g1}*${g2}*'
|
||||
}
|
||||
|
||||
fn main() {
|
||||
txt := "today [John] is gone to his house with (Jack) and [Marie]."
|
||||
query := r"(.)(\A\w+)(.)"
|
||||
txt := 'today [John] is gone to his house with (Jack) and [Marie].'
|
||||
query := r'(.)(\A\w+)(.)'
|
||||
|
||||
mut re := regex.regex_opt(query) or { panic(err) }
|
||||
|
||||
@ -674,8 +680,6 @@ Output:
|
||||
today *[*John*]* is gone to his house with *(*Jack*)* and *[*Marie*]*.
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Debugging
|
||||
|
||||
This module has a few small utilities to help you write regex patterns.
|
||||
@ -811,34 +815,36 @@ Here an example that perform some basically match of strings
|
||||
import regex
|
||||
|
||||
fn main() {
|
||||
txt := "http://www.ciao.mondo/hello/pippo12_/pera.html"
|
||||
query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
|
||||
txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
|
||||
query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'
|
||||
|
||||
mut re := regex.regex_opt(query) or { panic(err) }
|
||||
|
||||
start, end := re.match_string(txt)
|
||||
if start >= 0 {
|
||||
println("Match (${start}, ${end}) => [${txt[start..end]}]")
|
||||
println('Match (${start}, ${end}) => [${txt[start..end]}]')
|
||||
for g_index := 0; g_index < re.group_count; g_index++ {
|
||||
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
|
||||
bounds: ${re.get_group_bounds_by_id(g_index)}")
|
||||
println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
|
||||
bounds: ${re.get_group_bounds_by_id(g_index)}')
|
||||
}
|
||||
for name in re.group_map.keys() {
|
||||
println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
|
||||
bounds: ${re.get_group_bounds_by_name(name)}")
|
||||
}
|
||||
} else {
|
||||
println("No Match")
|
||||
println('No Match')
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
Here is an example of total customization of the regex environment creation:
|
||||
|
||||
```v ignore
|
||||
import regex
|
||||
|
||||
fn main() {
|
||||
txt := "today John is gone to his house with Jack and Marie."
|
||||
query := r"(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+"
|
||||
txt := 'today John is gone to his house with Jack and Marie.'
|
||||
query := r'(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+'
|
||||
|
||||
// init regex
|
||||
mut re := regex.RE{}
|
||||
@ -859,37 +865,37 @@ fn main(){
|
||||
|
||||
start, end := re.match_string(txt)
|
||||
if start >= 0 {
|
||||
println("Match (${start}, ${end}) => [${txt[start..end]}]")
|
||||
println('Match (${start}, ${end}) => [${txt[start..end]}]')
|
||||
} else {
|
||||
println("No Match")
|
||||
println('No Match')
|
||||
}
|
||||
|
||||
// show results for continuous group saving
|
||||
if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0 {
|
||||
println("cg: ${re.group_csave}")
|
||||
println('cg: ${re.group_csave}')
|
||||
mut cs_i := 1
|
||||
for cs_i < re.group_csave[0] * 3 {
|
||||
g_id := re.group_csave[cs_i]
|
||||
st := re.group_csave[cs_i + 1]
|
||||
en := re.group_csave[cs_i + 2]
|
||||
println("cg[${g_id}] ${st} ${en}:[${txt[st..en]}]")
|
||||
println('cg[${g_id}] ${st} ${en}:[${txt[st..en]}]')
|
||||
cs_i += 3
|
||||
}
|
||||
}
|
||||
|
||||
// show results for captured groups
|
||||
if start >= 0 {
|
||||
println("Match (${start}, ${end}) => [${txt[start..end]}]")
|
||||
println('Match (${start}, ${end}) => [${txt[start..end]}]')
|
||||
for g_index := 0; g_index < re.group_count; g_index++ {
|
||||
println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
|
||||
bounds: ${re.get_group_bounds_by_id(g_index)}")
|
||||
println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
|
||||
bounds: ${re.get_group_bounds_by_id(g_index)}')
|
||||
}
|
||||
for name in re.group_map.keys() {
|
||||
println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
|
||||
bounds: ${re.get_group_bounds_by_name(name)}")
|
||||
}
|
||||
} else {
|
||||
println("No Match")
|
||||
println('No Match')
|
||||
}
|
||||
}
|
||||
```
|
||||
|
Loading…
Reference in New Issue
Block a user