regex: fix formatting inconsistencies in README.md (#17940)

2023-08-10 21:13:21 +03:00 · 2023-04-13 13:44:45 +02:00
parent 524f7c3ead
commit 489ac892b9
1 changed files with 187 additions and 181 deletions
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@ -1,4 +1,5 @@
 # Description
+
 `regex` is a small but powerful regular expression library,
 written in pure V.

@ -15,8 +16,7 @@ are valid for all the `regex` module features:
 1. The matching stops at the end of the string, *not* at newline characters.

 2. The basic atomic elements of this regex engine are the tokens.
-In a query string a simple character is a token.
-
+   In a query string a simple character is a token.

 ## Differences with PCRE:

@ -28,36 +28,35 @@ In a query string a simple character is a token.
 The main differences can be summarized in the following points:

 - The basic element **is the token not the sequence of symbols**, and the most
-simple token, is a single character.
+  simple token, is a single character.

 - `|` **the OR operator acts on tokens,** for example `abc|ebc` is not
-`abc` OR `ebc`. Instead it is evaluated like `ab`, followed by `c OR e`,
-followed by `bc`, because the **token is the base element**,
-not the sequence of symbols.
-Note: **Two char classes with an `OR` in the middle is a syntax error.**
+  `abc` OR `ebc`. Instead it is evaluated like `ab`, followed by `c OR e`,
+  followed by `bc`, because the **token is the base element**,
+  not the sequence of symbols.
+  Note: **Two char classes with an `OR` in the middle is a syntax error.**

 - The **match operation stops at the end of the string**. It does *NOT* stop
-at new line characters.
+  at new line characters.

+- The **match operation stops at the end of the string**. It does *NOT* stop
+  at new line characters.

 ## Tokens

 The tokens are the atomic units, used by this regex engine.
 They can be one of the following:

-
 ### Simple char

 This token is a simple single character like `a` or `b` etc.

-
 ### Match positional delimiters

 `^` Matches the start of the string.

 `$` Matches the end of the string.

-
 ### Char class (cc)

 The character classes match all the chars specified inside. Use square
@ -98,14 +97,14 @@ For example `\w` is the meta-char `w`.

 A meta-char can match different types of characters.

-* `\w` matches a word char char `[a-zA-Z0-9_]`
-* `\W` matches a non word char
-* `\d` matches a digit `[0-9]`
-* `\D` matches a non digit
-* `\s` matches a space char, one of `[' ','\t','\n','\r','\v','\f']`
-* `\S` matches a non space char
-* `\a` matches only a lowercase char `[a-z]`
-* `\A` matches only an uppercase char `[A-Z]`
+- `\w` matches a word char `[a-zA-Z0-9_]`
+- `\W` matches a non word char
+- `\d` matches a digit `[0-9]`
+- `\D` matches a non digit
+- `\s` matches a space char, one of `[' ','\t','\n','\r','\v','\f']`
+- `\S` matches a non space char
+- `\a` matches only a lowercase char `[a-z]`
+- `\A` matches only an uppercase char `[A-Z]`

 ### Quantifier

@ -123,9 +122,9 @@ must be matched.
 - `{x}` matches exactly x times, `a{2}` matches `aa`, but not `aaa` or `a`
 - `{min,}` matches at least min times, `a{2,}` matches `aaa` or `aa`, not `a`
 - `{,max}` matches at least 0 times and at maximum max times,
-   for example, `a{,2}` matches `a` and `aa`, but doesn't match `aaa`
+  for example, `a{,2}` matches `a` and `aa`, but doesn't match `aaa`
 - `{min,max}` matches from min times, to max times, for example
-    `a{2,3}` matches `aa` and `aaa`, but doesn't match `a` or `aaaa`
+  `a{2,3}` matches `aa` and `aaa`, but doesn't match `a` or `aaaa`

 A long quantifier, may have a `greedy off` flag, that is the `?`
 character after the brackets. `{2,4}?` means to match the minimum
@ -141,12 +140,13 @@ Suppose you have `abccc ddeef` as a source string, that you want to parse
 with a regex. The following table shows the query strings and the result of
 parsing the source string.

-| query string |   result    |
-|--------------|-------------|
+| query string | result      |
+| ------------ | ----------- |
 | `.*c`        | `abc`       |
-| `.*dd`	   | `abcc dd`   |
+| `.*dd`       | `abcc dd`   |
 | `ab.*e`      | `abccc dde` |
 | `ab.{3} .*e` | `abccc dde` |
+
 The dot matches any character, until the next token match is satisfied.

 > Important Note: Consecutive dots, for example `...`, are not allowed.
@ -195,7 +195,7 @@ i.e. the space char (ascii code 32) followed by the `?` quantifier,
 which means that the preceding space should be matched 0 or 1 time.

 This explains why the `(c(pa)+z ?)+` query string,
-can match `cpaz cpapaz cpapapaz` .
+can match `cpaz cpapaz cpapapaz`.

 In this implementation the groups are "capture groups". This means that the
 last temporal result for each group, can be retrieved from the `RE` struct.
@ -275,13 +275,13 @@ fn convert_html_rgb(in_col string) u32 {
 ```

 Other utility functions are `get_group_by_id` and `get_group_bounds_by_id`
-that get  directly the string of a group using its `id`:
+that get directly the string of a group using its `id`:

 ```v ignore
-txt := "my used string...."
-for g_index := 0; g_index < re.group_count ; g_index++ {
-	println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
-    	bounds: ${re.get_group_bounds_by_id(g_index)}")
+txt := 'my used string....'
+for g_index := 0; g_index < re.group_count; g_index++ {
+	println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
+		bounds: ${re.get_group_bounds_by_id(g_index)}')
 }
 ```

@ -311,35 +311,36 @@ not be saved.

 ```v ignore
 import regex
-fn main(){
-    txt   := "http://www.ciao.mondo/hello/pippo12_/pera.html"
-    query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"

-    mut re := regex.regex_opt(query) or { panic(err) }
-    //println(re.get_code())   // uncomment to see the print of the regex execution code
-    re.debug=2  // enable maximum log
-    println("String: ${txt}")
-    println("Query : ${re.get_query()}")
-    re.debug=0  // disable log
-    re.group_csave_flag = true
-    start, end := re.match_string(txt)
-    if start >= 0 {
-        println("Match (${start}, ${end}) => [${txt[start..end]}]")
-    } else {
-        println("No Match")
-    }
+fn main() {
+	txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
+	query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'

-    if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{
-        println("cg: ${re.group_csave}")
-        mut cs_i := 1
-        for cs_i < re.group_csave[0]*3 {
-            g_id := re.group_csave[cs_i]
-            st   := re.group_csave[cs_i+1]
-            en   := re.group_csave[cs_i+2]
-            println("cg[${g_id}] ${st} ${en}:[${txt[st..en]}]")
-            cs_i += 3
-        }
-    }
+	mut re := regex.regex_opt(query) or { panic(err) }
+	// println(re.get_code())   // uncomment to see the print of the regex execution code
+	re.debug = 2 // enable maximum log
+	println('String: ${txt}')
+	println('Query : ${re.get_query()}')
+	re.debug = 0 // disable log
+	re.group_csave_flag = true
+	start, end := re.match_string(txt)
+	if start >= 0 {
+		println('Match (${start}, ${end}) => [${txt[start..end]}]')
+	} else {
+		println('No Match')
+	}
+
+	if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0 {
+		println('cg: ${re.group_csave}')
+		mut cs_i := 1
+		for cs_i < re.group_csave[0] * 3 {
+			g_id := re.group_csave[cs_i]
+			st := re.group_csave[cs_i + 1]
+			en := re.group_csave[cs_i + 2]
+			println('cg[${g_id}] ${st} ${en}:[${txt[st..en]}]')
+			cs_i += 3
+		}
+	}
 }
 ```

@ -364,7 +365,7 @@ cg[1] 42 46:[html]

 This regex module partially supports the question mark `?` PCRE syntax for groups.

-`(?:abcd)` **non capturing group**:  the content of the group will not be saved.
+`(?:abcd)` **non capturing group**: the content of the group will not be saved.

 `(?P<mygroup>abcdef)` **named group:** the group content is saved and labeled
 as `mygroup`.
@ -374,29 +375,31 @@ that is a map from `string` to `int`, where the value is the index in
 `group_csave` list of indexes.

 Here is an example for how to use them:
+
 ```v ignore
 import regex
-fn main(){
-    txt   := "http://www.ciao.mondo/hello/pippo12_/pera.html"
-    query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"

-    mut re := regex.regex_opt(query) or { panic(err) }
-    //println(re.get_code())   // uncomment to see the print of the regex execution code
-    re.debug=2  // enable maximum log
-    println("String: ${txt}")
-    println("Query : ${re.get_query()}")
-    re.debug=0  // disable log
-    start, end := re.match_string(txt)
-    if start >= 0 {
-        println("Match (${start}, ${end}) => [${txt[start..end]}]")
-    } else {
-        println("No Match")
-    }
+fn main() {
+	txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
+	query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'

-    for name in re.group_map.keys() {
-        println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
-        bounds: ${re.get_group_bounds_by_name(name)}")
-    }
+	mut re := regex.regex_opt(query) or { panic(err) }
+	// println(re.get_code())   // uncomment to see the print of the regex execution code
+	re.debug = 2 // enable maximum log
+	println('String: ${txt}')
+	println('Query : ${re.get_query()}')
+	re.debug = 0 // disable log
+	start, end := re.match_string(txt)
+	if start >= 0 {
+		println('Match (${start}, ${end}) => [${txt[start..end]}]')
+	} else {
+		println('No Match')
+	}
+
+	for name in re.group_map.keys() {
+		println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
+			bounds: ${re.get_group_bounds_by_name(name)}")
+	}
 }
 ```

@ -414,6 +417,7 @@ In order to simplify the use of the named groups, it is possible to
 use a name map in the `re` struct, using the function `re.get_group_by_name`.

 Here is a more complex example of using them:
+
 ```v oksyntax
 // This function demonstrates the use of the named groups
 fn convert_html_rgb_n(in_col string) u32 {
@ -443,15 +447,13 @@ Other utilities are `get_group_by_name` and `get_group_bounds_by_name`,
 that return the string of a group using its `name`:

 ```v ignore
-txt := "my used string...."
+txt := 'my used string....'
 for name in re.group_map.keys() {
 	println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
-    bounds: ${re.get_group_bounds_by_name(name)}")
+		bounds: ${re.get_group_bounds_by_name(name)}")
 }
 ```

-
-
 ### Groups query functions

 These functions are helpers to query the captured groups
@ -493,15 +495,15 @@ re.flag = regex.f_bin
 - `f_bin`: parse a string as bytes, utf-8 management disabled.

 - `f_efm`: exit on the first char matches in the query, used by the
-           find function.
+  find function.

- `f_ms`:  matches only if the index of the start match is 0,
-           same as `^` at the start of the query string.
+- `f_ms`: matches only if the index of the start match is 0,
+  same as `^` at the start of the query string.

- `f_me`:  matches only if the end index of the match is the last char
-           of the input string, same as `$` end of query string.
+- `f_me`: matches only if the end index of the match is the last char
+  of the input string, same as `$` end of query string.

- `f_nl`:  stop the matching if found a new line char `\n` or `\r`
+- `f_nl`: stop the matching if found a new line char `\n` or `\r`

 ## Functions

@ -522,32 +524,35 @@ pub fn regex_opt(in_query string) ?RE
 ```v ignore
 // new_regex create a REgex of small size, usually sufficient for ordinary use
 pub fn new() RE
-
 ```
+
 #### **Custom initialization**
+
 For some particular needs, it is possible to initialize a fully customized regex:
+
 ```v ignore
-pattern = r"ab(.*)(ac)"
+pattern = r'ab(.*)(ac)'
 // init custom regex
 mut re := regex.RE{}
 // max program length, can not be longer than the pattern
-re.prog = []Token    {len: pattern.len + 1}
+re.prog = []Token{len: pattern.len + 1}
 // there can not be more char classes than the length of the pattern
-re.cc   = []CharClass{len: pattern.len}
+re.cc = []CharClass{len: pattern.len}

-re.group_csave_flag = false          // true enable continuous group saving if needed
-re.group_max_nested = 128            // set max 128 group nested possible
-re.group_max        = pattern.len>>1 // we can't have more groups than the half of the pattern length
+re.group_csave_flag = false // true enable continuous group saving if needed
+re.group_max_nested = 128 // set max 128 group nested possible
+re.group_max = pattern.len >> 1 // we can't have more groups than the half of the pattern length
 re.group_stack = []int{len: re.group_max, init: -1}
-re.group_data  = []int{len: re.group_max, init: -1}
+re.group_data = []int{len: re.group_max, init: -1}
 ```
+
 ### Compiling

 After an initializer is used, the regex expression must be compiled with:

 ```v ignore
 // compile compiles the REgex returning an error if the compilation fails
-pub fn (re mut RE) compile_opt(in_txt string)?
+pub fn (mut re RE) compile_opt(in_txt string) ?
 ```

 ### Matching Functions
@ -556,29 +561,28 @@ These are the matching functions

 ```v ignore
 // match_string try to match the input string, return start and end index if found else start is -1
-pub fn (re mut RE) match_string(in_txt string) (int,int)
-
+pub fn (mut re RE) match_string(in_txt string) (int, int)
 ```

 ## Find and Replace

-There are the following find  and replace functions:
+There are the following find and replace functions:

 #### Find functions

 ```v ignore
 // find try to find the first match in the input string
 // return start and end index if found else start is -1
-pub fn (re mut RE) find(in_txt string) (int,int)
+pub fn (mut re RE) find(in_txt string) (int, int)

 // find_all find all the "non overlapping" occurrences of the matching pattern
 // return a list of start end indexes like: [3,4,6,8]
 // the matches are [3,4] and [6,8]
-pub fn (re mut RE) find_all(in_txt string) []int
+pub fn (mut re RE) find_all(in_txt string) []int

 // find_all find all the "non overlapping" occurrences of the matching pattern
 // return a list of strings
-// the result is like ["first match","secon match"]
+// the result is like ['first match','second match']
 pub fn (mut re RE) find_all_str(in_txt string) []string
 ```

@ -587,16 +591,16 @@ pub fn (mut re RE) find_all_str(in_txt string) []string
 ```v ignore
 // replace return a string where the matches are replaced with the repl_str string,
 // this function support groups in the replace string
-pub fn (re mut RE) replace(in_txt string, repl string) string
+pub fn (mut re RE) replace(in_txt string, repl string) string
 ```

 The replace string can include group references:

 ```v ignore
-txt   := "Today it is a good day."
+txt := 'Today it is a good day.'
 query := r'(a\w)[ ,.]'
 mut re := regex.regex_opt(query)?
-res := re.replace(txt, r"__[\0]__")
+res := re.replace(txt, r'__[\0]__')
 ```

 In this example we used the group `0` in the replace string: `\0`, the result will be:
@ -617,6 +621,7 @@ pub fn (mut re RE) replace_simple(in_txt string, repl string) string
 ```

 If it is needed to replace N instances of the found strings it is possible to use:
+
 ```v ignore
 // replace_n return a string where the first `count` matches are replaced with the repl_str string
 // `count` indicate the number of max replacements that will be done.
@ -650,21 +655,22 @@ The following example will clarify its usage:
 import regex
 // customized replace functions
 // it will be called on each non overlapped find
+
 fn my_repl(re regex.RE, in_txt string, start int, end int) string {
-    g0 := re.get_group_by_id(in_txt, 0)
-    g1 := re.get_group_by_id(in_txt, 1)
-    g2 := re.get_group_by_id(in_txt, 2)
-    return "*${g0}*${g1}*${g2}*"
+	g0 := re.get_group_by_id(in_txt, 0)
+	g1 := re.get_group_by_id(in_txt, 1)
+	g2 := re.get_group_by_id(in_txt, 2)
+	return '*${g0}*${g1}*${g2}*'
 }

-fn main(){
-    txt   := "today [John] is gone to his house with (Jack) and [Marie]."
-    query := r"(.)(\A\w+)(.)"
+fn main() {
+	txt := 'today [John] is gone to his house with (Jack) and [Marie].'
+	query := r'(.)(\A\w+)(.)'

-    mut re := regex.regex_opt(query) or { panic(err) }
+	mut re := regex.regex_opt(query) or { panic(err) }

-    result := re.replace_by_fn(txt, my_repl)
-    println(result)
+	result := re.replace_by_fn(txt, my_repl)
+	println(result)
 }
 ```

@ -674,8 +680,6 @@ Output:
 today *[*John*]* is gone to his house with *(*Jack*)* and *[*Marie*]*.
 ```

-
-
 ## Debugging

 This module has a few small utilities to help you write regex patterns.
@ -727,7 +731,7 @@ PC: 10 ist: 88000000 PROG_END {  0,  0}

 `query_ch` is the type of token.

-`{m,n}` is the quantifier, the greedy off flag  `?`  will be showed if present in the token
+`{m,n}` is the quantifier, the greedy off flag `?` will be shown if present in the token

 ### **Log debug**

@ -810,87 +814,89 @@ Here an example that perform some basically match of strings
 ```v ignore
 import regex

-fn main(){
-    txt   := "http://www.ciao.mondo/hello/pippo12_/pera.html"
-    query := r"(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+"
+fn main() {
+	txt := 'http://www.ciao.mondo/hello/pippo12_/pera.html'
+	query := r'(?P<format>https?)|(?P<format>ftps?)://(?P<token>[\w_]+.)+'

-    mut re := regex.regex_opt(query) or { panic(err) }
+	mut re := regex.regex_opt(query) or { panic(err) }

-    start, end := re.match_string(txt)
-    if start >= 0 {
-        println("Match (${start}, ${end}) => [${txt[start..end]}]")
-        for g_index := 0; g_index < re.group_count ; g_index++ {
-            println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
-            bounds: ${re.get_group_bounds_by_id(g_index)}")
-        }
-        for name in re.group_map.keys() {
-            println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
-            bounds: ${re.get_group_bounds_by_name(name)}")
-        }
-    } else {
-        println("No Match")
-    }
+	start, end := re.match_string(txt)
+	if start >= 0 {
+		println('Match (${start}, ${end}) => [${txt[start..end]}]')
+		for g_index := 0; g_index < re.group_count; g_index++ {
+			println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
+				bounds: ${re.get_group_bounds_by_id(g_index)}')
+		}
+		for name in re.group_map.keys() {
+			println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
+				bounds: ${re.get_group_bounds_by_name(name)}")
+		}
+	} else {
+		println('No Match')
+	}
 }
 ```
+
 Here is an example of total customization of the regex environment creation:
+
 ```v ignore
 import regex

-fn main(){
-    txt   := "today John is gone to his house with Jack and Marie."
-    query := r"(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+"
+fn main() {
+	txt := 'today John is gone to his house with Jack and Marie.'
+	query := r'(?:(?P<word>\A\w+)|(?:\a\w+)[\s.]?)+'

-    // init regex
-    mut re := regex.RE{}
+	// init regex
+	mut re := regex.RE{}
 	// max program length, can not be longer than the query
-    re.prog = []regex.Token    {len: query.len + 1}
+	re.prog = []regex.Token{len: query.len + 1}
 	// there can not be more char classes than the length of the query
-    re.cc   = []regex.CharClass{len: query.len}
-    re.prog = []regex.Token    {len: query.len+1}
+	re.cc = []regex.CharClass{len: query.len}
+	re.prog = []regex.Token{len: query.len + 1}
 	// enable continuous group saving
-    re.group_csave_flag = true
+	re.group_csave_flag = true
 	// set max 128 group nested
-    re.group_max_nested = 128
+	re.group_max_nested = 128
 	// we can't have more groups than the half of the query length
-    re.group_max        = query.len>>1
+	re.group_max = query.len >> 1

-    // compile the query
-    re.compile_opt(query) or { panic(err) }
+	// compile the query
+	re.compile_opt(query) or { panic(err) }

-    start, end := re.match_string(txt)
-    if start >= 0 {
-        println("Match (${start}, ${end}) => [${txt[start..end]}]")
-    } else {
-        println("No Match")
-    }
+	start, end := re.match_string(txt)
+	if start >= 0 {
+		println('Match (${start}, ${end}) => [${txt[start..end]}]')
+	} else {
+		println('No Match')
+	}

-    // show results for continuous group saving
-    if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0{
-        println("cg: ${re.group_csave}")
-        mut cs_i := 1
-        for cs_i < re.group_csave[0]*3 {
-            g_id := re.group_csave[cs_i]
-            st   := re.group_csave[cs_i+1]
-            en   := re.group_csave[cs_i+2]
-            println("cg[${g_id}] ${st} ${en}:[${txt[st..en]}]")
-            cs_i += 3
-        }
-    }
+	// show results for continuous group saving
+	if re.group_csave_flag == true && start >= 0 && re.group_csave.len > 0 {
+		println('cg: ${re.group_csave}')
+		mut cs_i := 1
+		for cs_i < re.group_csave[0] * 3 {
+			g_id := re.group_csave[cs_i]
+			st := re.group_csave[cs_i + 1]
+			en := re.group_csave[cs_i + 2]
+			println('cg[${g_id}] ${st} ${en}:[${txt[st..en]}]')
+			cs_i += 3
+		}
+	}

-    // show results for captured groups
-    if start >= 0 {
-        println("Match (${start}, ${end}) => [${txt[start..end]}]")
-        for g_index := 0; g_index < re.group_count ; g_index++ {
-            println("#${g_index} [${re.get_group_by_id(txt, g_index)}] \
-            bounds: ${re.get_group_bounds_by_id(g_index)}")
-        }
-        for name in re.group_map.keys() {
-            println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
-            bounds: ${re.get_group_bounds_by_name(name)}")
-        }
-    } else {
-        println("No Match")
-    }
+	// show results for captured groups
+	if start >= 0 {
+		println('Match (${start}, ${end}) => [${txt[start..end]}]')
+		for g_index := 0; g_index < re.group_count; g_index++ {
+			println('#${g_index} [${re.get_group_by_id(txt, g_index)}] \
+				bounds: ${re.get_group_bounds_by_id(g_index)}')
+		}
+		for name in re.group_map.keys() {
+			println("group:'${name}' \t=> [${re.get_group_by_name(txt, name)}] \
+				bounds: ${re.get_group_bounds_by_name(name)}")
+		}
+	} else {
+		println('No Match')
+	}
 }
 ```