From 47a2301139dd780dbbf2ddcca5aad6b3b3a0aab1 Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Sun, 24 Oct 2021 10:54:51 +0200 Subject: [PATCH] regex: add OR error, if sourounded by char classes, and a test (#12278) --- vlib/regex/README.md | 14 ++++++++------ vlib/regex/regex.v | 24 ++++++++++++++++++------ vlib/regex/regex_test.v | 26 ++++++++++++++++++++------ 3 files changed, 46 insertions(+), 18 deletions(-) diff --git a/vlib/regex/README.md b/vlib/regex/README.md index 0faa833bb3..33a6de5525 100644 --- a/vlib/regex/README.md +++ b/vlib/regex/README.md @@ -28,6 +28,7 @@ simple token, is a single character. `abc` OR `ebc`. Instead it is evaluated like `ab`, followed by `c OR e`, followed by `bc`, because the **token is the base element**, not the sequence of symbols. +Note: **Two char classes with an `OR` in the middle is a syntax error.** - The **match operation stops at the end of the string**. It does *NOT* stop at new line characters. @@ -155,6 +156,7 @@ match too, finally test the token `c`. NB: ** unlike in PCRE, the OR operation works at token level!** It doesn't work at concatenation level! +NB2: **Two char classes with an `OR` in the middle is a syntax error.** That also means, that a query string like `abc|bde` is not equal to `(abc)|(bde)`, but instead to `ab(c|b)de. @@ -474,21 +476,21 @@ the behavior of the parser itself. ```v ignore // example of flag settings mut re := regex.new() -re.flag = regex.F_BIN +re.flag = regex.f_bin ``` -- `F_BIN`: parse a string as bytes, utf-8 management disabled. +- `f_bin`: parse a string as bytes, utf-8 management disabled. -- `F_EFM`: exit on the first char matches in the query, used by the +- `f_efm`: exit on the first char matches in the query, used by the find function. -- `F_MS`: matches only if the index of the start match is 0, +- `f_ms`: matches only if the index of the start match is 0, same as `^` at the start of the query string. -- `F_ME`: matches only if the end index of the match is the last char +- `f_me`: matches only if the end index of the match is the last char of the input string, same as `$` end of query string. -- `F_NL`: stop the matching if found a new line char `\n` or `\r` +- `f_nl`: stop the matching if found a new line char `\n` or `\r` ## Functions diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index 575c277493..c87c67582b 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -40,6 +40,7 @@ pub const ( err_groups_max_nested = -8 // max number of nested group reached err_group_not_balanced = -9 // group not balanced err_group_qm_notation = -10 // group invalid notation + err_invalid_or_with_cc = -11 // invalid or on two consecutive char class ) const ( @@ -196,6 +197,7 @@ pub fn (re RE) get_parse_error_string(err int) string { regex.err_groups_max_nested { return 'err_groups_max_nested' } regex.err_group_not_balanced { return 'err_group_not_balanced' } regex.err_group_qm_notation { return 'err_group_qm_notation' } + regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' } else { return 'err_unknown' } } } @@ -252,6 +254,8 @@ mut: // dot_char token variables dot_check_pc int = -1 // pc of the next token to check last_dot_flag bool // if true indicate that is the last dot_char in the regex + // debug fields + source_index int } [inline] @@ -1028,11 +1032,11 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) { // OR branch if char_len == 1 && pc > 0 && byte(char_tmp) == `|` { - // two consecutive ist_dot_char are an error if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch { return regex.err_syntax_error, i } re.prog[pc].ist = u32(0) | regex.ist_or_branch + re.prog[pc].source_index = i pc = pc + 1 i = i + char_len continue @@ -1252,10 +1256,18 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) { pc1 = 0 for pc1 < pc - 2 { // println("Here $pc1 ${pc-2}") - // two consecutive OR are a syntax error - if re.prog[pc1 + 1].ist == regex.ist_or_branch - && re.prog[pc1 + 2].ist == regex.ist_or_branch { - return regex.err_syntax_error, i + // println("source index: ${pc1 + 1} => ${re.prog[pc1+1].source_index}") + if re.prog[pc1 + 1].ist == regex.ist_or_branch { + // two consecutive OR are a syntax error + if re.prog[pc1 + 2].ist == regex.ist_or_branch { + return regex.err_syntax_error, i + } + + // check for []|[] errors + if re.prog[pc1].ist == regex.ist_char_class_pos + && re.prog[pc1 + 2].ist == regex.ist_char_class_pos { + return regex.err_invalid_or_with_cc, re.prog[pc1 + 1].source_index + } } // manange a|b chains like a|(b)|c|d... @@ -1280,7 +1292,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) { pc2++ } - // special case query of few chars, teh true can't go on the first instruction + // special case query of few chars, the true can't go on the first instruction if re.prog[pc1 + 1].rep_max == pc1 { re.prog[pc1 + 1].rep_max = 3 } diff --git a/vlib/regex/regex_test.v b/vlib/regex/regex_test.v index bc1be2dfbb..c8121875f2 100644 --- a/vlib/regex/regex_test.v +++ b/vlib/regex/regex_test.v @@ -389,9 +389,8 @@ fn test_regex(){ } if start != to.s || end != to.e { - //println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") + println("#$c [$to.src] q[$to.q] res[$tmp_str] base:[${to.s},${to.e}] $start, $end") eprintln("ERROR!") - //C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) assert false continue } @@ -430,8 +429,8 @@ fn test_regex(){ for ln:=0; ln < re.groups.len; ln++ { if re.groups[ln] != to.cg[ln] { eprintln("Capture group doesn't match:") - eprintln("true ground: [${to.cg}]") - eprintln("elaborated : [${re.groups}]") + eprintln("true ground: ${to.cg}") + eprintln("elaborated : ${re.groups}") assert false } } @@ -551,7 +550,6 @@ fn test_regex(){ if start != to.s || end != to.e { eprintln("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end") eprintln("ERROR!") - //C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e) assert false continue } @@ -705,4 +703,20 @@ fn test_groups_in_find(){ assert end == test_obj.e assert re.groups == test_obj.res } -} \ No newline at end of file +} + +const( + err_query_list = [ + r'([a]|[b])*' + ] +) +fn test_errors(){ + mut count := 0 + for query in err_query_list { + _, err, _ := regex.regex_base(query) + if err != regex.compile_ok { + count++ + } + } + assert count == err_query_list.len +}