From 47a2301139dd780dbbf2ddcca5aad6b3b3a0aab1 Mon Sep 17 00:00:00 2001
From: penguindark <57967770+penguindark@users.noreply.github.com>
Date: Sun, 24 Oct 2021 10:54:51 +0200
Subject: [PATCH] regex: add OR error, if surrounded by char classes, and a
 test (#12278)

---
 vlib/regex/README.md    | 14 ++++++++------
 vlib/regex/regex.v      | 24 ++++++++++++++++++------
 vlib/regex/regex_test.v | 26 ++++++++++++++++++++------
 3 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/vlib/regex/README.md b/vlib/regex/README.md
index 0faa833bb3..33a6de5525 100644
--- a/vlib/regex/README.md
+++ b/vlib/regex/README.md
@@ -28,6 +28,7 @@ simple token, is a single character.
 `abc` OR `ebc`. Instead it is evaluated like `ab`, followed by `c OR e`,
 followed by `bc`, because the **token is the base element**,
 not the sequence of symbols.
+Note: **Two char classes with an `OR` between them, like `[a]|[b]`, are a syntax error.**
 
 - The **match operation stops at the end of the string**. It does *NOT* stop 
 at new line characters.
@@ -155,6 +156,7 @@ match too, finally test the token `c`.
 
 NB: ** unlike in PCRE, the OR operation works at token level!** 
 It doesn't work at concatenation level!
+NB2: **Two char classes with an `OR` between them, like `[a]|[b]`, are a syntax error.**
 
 That also means, that a query string like `abc|bde` is not equal to 
 `(abc)|(bde)`, but instead to `ab(c|b)de.
@@ -474,21 +476,21 @@ the behavior of the parser itself.
 ```v ignore
 // example of flag settings
 mut re := regex.new()
-re.flag = regex.F_BIN
+re.flag = regex.f_bin
 ```
 
-- `F_BIN`: parse a string as bytes, utf-8 management disabled.
+- `f_bin`: parse a string as bytes, utf-8 management disabled.
 
-- `F_EFM`: exit on the first char matches in the query, used by the 
+- `f_efm`: exit on the first char matches in the query, used by the 
            find function.
 	
-- `F_MS`:  matches only if the index of the start match is 0,
+- `f_ms`:  matches only if the index of the start match is 0,
            same as `^` at the start of the query string.
 	
-- `F_ME`:  matches only if the end index of the match is the last char
+- `f_me`:  matches only if the end index of the match is the last char
            of the input string, same as `$` end of query string.
 	
-- `F_NL`:  stop the matching if found a new line char `\n` or `\r`
+- `f_nl`:  stop the matching if found a new line char `\n` or `\r`
 
 ## Functions
 
diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v
index 575c277493..c87c67582b 100644
--- a/vlib/regex/regex.v
+++ b/vlib/regex/regex.v
@@ -40,6 +40,7 @@ pub const (
 	err_groups_max_nested  = -8 // max number of nested group reached
 	err_group_not_balanced = -9 // group not balanced
 	err_group_qm_notation  = -10 // group invalid notation
+	err_invalid_or_with_cc = -11 // invalid OR between two consecutive char classes
 )
 
 const (
@@ -196,6 +197,7 @@ pub fn (re RE) get_parse_error_string(err int) string {
 		regex.err_groups_max_nested { return 'err_groups_max_nested' }
 		regex.err_group_not_balanced { return 'err_group_not_balanced' }
 		regex.err_group_qm_notation { return 'err_group_qm_notation' }
+		regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' }
 		else { return 'err_unknown' }
 	}
 }
@@ -252,6 +254,8 @@ mut:
 	// dot_char token variables
 	dot_check_pc  int = -1 // pc of the next token to check
 	last_dot_flag bool // if true indicate that is the last dot_char in the regex
+	// index of the token in the query string (set for OR tokens, reported with err_invalid_or_with_cc)
+	source_index int
 }
 
 [inline]
@@ -1028,11 +1032,11 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 
 		// OR branch
 		if char_len == 1 && pc > 0 && byte(char_tmp) == `|` {
-			// two consecutive ist_dot_char are an error
 			if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch {
 				return regex.err_syntax_error, i
 			}
 			re.prog[pc].ist = u32(0) | regex.ist_or_branch
+			re.prog[pc].source_index = i
 			pc = pc + 1
 			i = i + char_len
 			continue
@@ -1252,10 +1256,18 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 	pc1 = 0
 	for pc1 < pc - 2 {
 		// println("Here $pc1 ${pc-2}")
-		// two consecutive OR are a syntax error
-		if re.prog[pc1 + 1].ist == regex.ist_or_branch
-			&& re.prog[pc1 + 2].ist == regex.ist_or_branch {
-			return regex.err_syntax_error, i
+		// println("source index: ${pc1 + 1} => ${re.prog[pc1+1].source_index}")
+		if re.prog[pc1 + 1].ist == regex.ist_or_branch {
+			// two consecutive OR are a syntax error
+			if re.prog[pc1 + 2].ist == regex.ist_or_branch {
+				return regex.err_syntax_error, i
+			}
+
+			// check for []|[] errors
+			if re.prog[pc1].ist == regex.ist_char_class_pos
+				&& re.prog[pc1 + 2].ist == regex.ist_char_class_pos {
+				return regex.err_invalid_or_with_cc, re.prog[pc1 + 1].source_index
+			}
 		}
 
 		// manange a|b chains like a|(b)|c|d...
@@ -1280,7 +1292,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) {
 
 				pc2++
 			}
-			// special case query of few chars, teh true can't go on the first instruction
+			// special case query of few chars, the true can't go on the first instruction
 			if re.prog[pc1 + 1].rep_max == pc1 {
 				re.prog[pc1 + 1].rep_max = 3
 			}
diff --git a/vlib/regex/regex_test.v b/vlib/regex/regex_test.v
index bc1be2dfbb..c8121875f2 100644
--- a/vlib/regex/regex_test.v
+++ b/vlib/regex/regex_test.v
@@ -389,9 +389,8 @@ fn test_regex(){
 		}
 
 		if start != to.s || end != to.e {
-			//println("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
+			println("#$c [$to.src] q[$to.q] res[$tmp_str] base:[${to.s},${to.e}] $start, $end")
 			eprintln("ERROR!")
-			//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
 			assert false
 			continue
 		}	
@@ -430,8 +429,8 @@ fn test_regex(){
 			for ln:=0; ln < re.groups.len; ln++ {
 				if re.groups[ln] != to.cg[ln] {
 					eprintln("Capture group doesn't match:")
-					eprintln("true ground: [${to.cg}]")
-					eprintln("elaborated : [${re.groups}]")
+					eprintln("true ground: ${to.cg}")
+					eprintln("elaborated : ${re.groups}")
 					assert false
 				}
 			} 
@@ -551,7 +550,6 @@ fn test_regex(){
 		if start != to.s || end != to.e {
 			eprintln("#$c [$to.src] q[$to.q] res[$tmp_str] $start, $end")
 			eprintln("ERROR!")
-			//C.printf("ERROR!! res:(%d, %d) refh:(%d, %d)\n",start, end, to.s, to.e)
 			assert false
 			continue
 		}
@@ -705,4 +703,20 @@ fn test_groups_in_find(){
 		assert end == test_obj.e
 		assert re.groups == test_obj.res
 	}
-}
\ No newline at end of file
+}
+
+const(
+	err_query_list = [
+		r'([a]|[b])*'
+	]
+)
+fn test_errors(){ // every query in err_query_list must be rejected by the regex compiler
+	mut count := 0 // number of queries that returned a compile error
+	for query in err_query_list {
+		_, err, _ := regex.regex_base(query) // only the error code is inspected
+		if err != regex.compile_ok {
+			count++
+		}
+	}
+	assert count == err_query_list.len // fails if any query compiled without error
+}