From 908296cdfb7cb4bf407c1a3d7283119c57e456ab Mon Sep 17 00:00:00 2001 From: penguindark <57967770+penguindark@users.noreply.github.com> Date: Sat, 1 Jan 2022 08:21:27 +0100 Subject: [PATCH] regex: improve errors for edge cases (#13008) * code cleaning, added more clear errors for dots and ORs * added failed match index for better find functions, updated tests * added index in match failed, updated tests * test cleaning * test check --- vlib/regex/regex.v | 53 ++++++++++++----------------------------- vlib/regex/regex_test.v | 26 ++++++++++---------- vlib/regex/regex_util.v | 14 ++++++++++- 3 files changed, 41 insertions(+), 52 deletions(-) diff --git a/vlib/regex/regex.v b/vlib/regex/regex.v index 36b8b8f2a3..a0c8f51514 100644 --- a/vlib/regex/regex.v +++ b/vlib/regex/regex.v @@ -42,6 +42,7 @@ pub const ( err_group_qm_notation = -10 // group invalid notation err_invalid_or_with_cc = -11 // invalid or on two consecutive char class err_neg_group_quantifier = -12 // negation groups can not have quantifier + err_consecutive_dots = -13 // two consecutive dots is an error ) const ( @@ -200,6 +201,7 @@ pub fn (re RE) get_parse_error_string(err int) string { regex.err_group_qm_notation { return 'err_group_qm_notation' } regex.err_invalid_or_with_cc { return 'err_invalid_or_with_cc' } regex.err_neg_group_quantifier { return 'err_neg_group_quantifier' } + regex.err_consecutive_dots { return 'err_consecutive_dots' } else { return 'err_unknown' } } } @@ -283,14 +285,7 @@ pub const ( f_src = 0x00020000 // search mode enabled ) -struct StateDotObj { -mut: - i int = -1 // char index in the input buffer - pc int = -1 // program counter saved - mi int = -1 // match_index saved - group_stack_index int = -1 // continuous save on capturing groups -} - +// Log function prototype pub type FnLog = fn (string) pub struct RE { @@ -1042,6 +1037,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) { re.prog[goto_pc].goto_pc = pc // start goto point to the end group pc // re.prog[goto_pc].group_id = group_count // id of this group, used for storing data + // duplicate the negation group info and settings if re.prog[goto_pc].group_neg == true { re.prog[pc].group_neg = re.prog[goto_pc].group_neg re.prog[pc].rep_min = re.prog[goto_pc].rep_min @@ -1054,6 +1050,11 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) { // ist_dot_char match any char except the following token if char_len == 1 && pc >= 0 && byte(char_tmp) == `.` { + // consecutive ist_dot_char is a syntax error + if pc > 0 && re.prog[pc - 1].ist == regex.ist_dot_char { + return regex.err_consecutive_dots, i + } + re.prog[pc].ist = u32(0) | regex.ist_dot_char re.prog[pc].rep_min = 1 re.prog[pc].rep_max = 1 @@ -1228,7 +1229,7 @@ fn (mut re RE) impl_compile(in_txt string) (int, int) { // check for OR at the end of the program if pc > 0 && re.prog[pc - 1].ist == regex.ist_or_branch { - return regex.err_syntax_error, in_txt.len + return regex.err_syntax_error, in_txt.len - 1 } // store the number of groups in the query @@ -1873,7 +1874,7 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) { } // print("No good exit!!") - return regex.no_match_found, 0 + return regex.no_match_found, state.i } // starting and init @@ -1959,7 +1960,7 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) { } // exit on no match - return result, 0 + return result, state.i } // ist_load else if m_state == .ist_load { @@ -2164,30 +2165,6 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) { continue } // check bsls - /* - else if ist == regex.ist_bsls_char { - state.match_flag = false - tmp_res := re.prog[state.pc].validator(byte(ch)) - // println("BSLS in_ch: ${ch:c} res: $tmp_res") - if tmp_res { - state.match_flag = true - l_ist = u32(regex.ist_bsls_char) - - if state.first_match < 0 { - state.first_match = state.i - } - - state.match_index = state.i - - re.prog[state.pc].rep++ // increase repetitions - state.i += char_len // next char - m_state = .ist_quant_p - continue - } - m_state = .ist_quant_n - continue - } - */ else if ist == regex.ist_bsls_char { // println("ist_bsls_char rep: ${re.prog[state.pc].rep}") @@ -2541,14 +2518,14 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) { return state.first_match, state.i } // println("Program not finished! ") - return regex.no_match_found, 0 + return regex.no_match_found, state.i } if src_end { // println("program end") return state.first_match, state.i } // print("No match found!!") - return regex.no_match_found, 0 + return regex.no_match_found, state.i } else { // println("Group match! OK") // println("first_match: $state.first_match, i: $state.i") @@ -2559,5 +2536,5 @@ pub fn (mut re RE) match_base(in_txt &byte, in_txt_len int) (int, int) { } } // println("no_match_found, natural end") - return regex.no_match_found, 0 + return regex.no_match_found, state.i } diff --git a/vlib/regex/regex_test.v b/vlib/regex/regex_test.v index a37e9afb31..8f479b757b 100644 --- a/vlib/regex/regex_test.v +++ b/vlib/regex/regex_test.v @@ -18,7 +18,7 @@ const( match_test_suite = [ // minus in CC TestItem{"d.def",r"abc.\.[\w\-]{,100}",-1,0}, - TestItem{"abc12345.asd",r"abc.\.[\w\-]{,100}",-1,0}, + TestItem{"abc12345.asd",r"abc.\.[\w\-]{,100}",-1,4}, TestItem{"abca.exe",r"abc.\.[\w\-]{,100}",0,8}, TestItem{"abc2.exe-test_12",r"abc.\.[\w\-]{,100}",0,16}, TestItem{"abcdefGHK",r"[a-f]+\A+",0,9}, @@ -96,30 +96,30 @@ match_test_suite = [ // negative TestItem{"zthis ciao",r"((t[hieo]+se?)\s*)+",-1,0}, - TestItem{"this is a good.",r"thes",-1,0}, - TestItem{"test1post.pip.com, pera",r"[\w]+@([\w]+\.)+\w+",-1,0}, + TestItem{"this is a good.",r"thes",-1,2}, + TestItem{"test1post.pip.com, pera",r"[\w]+@([\w]+\.)+\w+",-1,9}, TestItem{"this cpapaz adce",r"(c(pa)+z)(\s[\a]+){2}",-1,0}, TestItem{"this cpapaz adce aabe third",r"(c(pa)+z)(\s[\a]+){2}$",-1,0}, TestItem{"1234this cpapaz adce aabe ter",r"(c(pa)+z)(\s[\a]+){2}$",-1,0}, TestItem{"cpapaz ole. pipipo,",r"^.*c.+ol?e.*p([ip])+o$",-1,0}, - TestItem{"/home/us_er/pippo/info-01.jpeg", r"(/?[-\w_]+)*\.txt$",-1,0} + TestItem{"/home/us_er/pippo/info-01.jpeg", r"(/?[-\w_]+)*\.txt$",-1,26} // check unicode TestItem{"this is a Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r".*a [Ⅰ-Ⅵ ]+",0,34}, TestItem{"123Ⅰ Ⅱ Ⅲ Ⅳ Ⅴ Ⅵ test",r"[Ⅰ-Ⅴ\s]+",3,23}, // new edge cases - TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",-1,0}, + TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",-1,8}, TestItem{"12345678", r"[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]",0,8}, TestItem{"123456789", r"^[0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9][0-9]$",0,9} TestItem{"12345678", r"^\d{8}$",0,8}, TestItem{"12345678", r"^\d{7}$",-1,0}, - TestItem{"12345678", r"^\d{9}$",-1,0}, + TestItem{"12345678", r"^\d{9}$",-1,8}, TestItem{"eth", r"(oth)|(eth)",0,3}, - TestItem{"et", r"(oth)|(eth)",-1,0}, - TestItem{"et", r".*(oth)|(eth)",-1,0}, - TestItem{"peoth", r".*(ith)|(eth)",-1,0}, + TestItem{"et", r"(oth)|(eth)",-1,2}, + TestItem{"et", r".*(oth)|(eth)",-1,2}, + TestItem{"peoth", r".*(ith)|(eth)",-1,5}, TestItem{"poth", r"(eth)|(oth)",1,4}, TestItem{"poth", r"(oth)|(eth)",1,4}, @@ -132,7 +132,7 @@ match_test_suite = [ TestItem{"accccb deer", r"^a(.*)b d(.+)r",0,11}, TestItem{"accccb deer", r"^a(.*)b d(.+)",0,11}, TestItem{"accccb deer", r"^(.*)$",0,11}, - TestItem{"accccb deer", r"^a(.*)b d(.+)p",-1,0}, + TestItem{"accccb deer", r"^a(.*)b d(.+)p",-1,11}, TestItem{"##.#....#.##.####...#.##", r".{18}[.#]",0,19}, TestItem{"#.#......##.#..#..##........##....###...##...######.......#.....#..#......#...#........###.#..#.", r'.*#[.#]{4}##[.#]{4}##[.#]{4}###',0,49}, @@ -328,19 +328,19 @@ find_all_test_suite = [ [29, 49], ['#....###...##...####'] }, - Test_find_all{ + Test_find_all{ "#.#......##.#..#..##........##....###...##...######.......#.....#..#......#...#........###.#..#.", r".*#[.#]{4}##[.#]{4}##[.#]{4}###", [0, 49], ['#.#......##.#..#..##........##....###...##...####'] }, - Test_find_all{ + Test_find_all{ "1234 Aa dddd Aaf 12334 Aa opopo Aaf", r"Aa.+Aaf", [5, 16, 23, 35], ['Aa dddd Aaf', 'Aa opopo Aaf'] }, - Test_find_all{ + Test_find_all{ "@for something @endfor @for something else @endfor altro testo @for body @endfor uno due @for senza dire più @endfor pippo", r"@for.+@endfor", [0, 22, 23, 50, 63, 80, 89, 117], diff --git a/vlib/regex/regex_util.v b/vlib/regex/regex_util.v index ed29b8acfd..02853ced95 100644 --- a/vlib/regex/regex_util.v +++ b/vlib/regex/regex_util.v @@ -272,8 +272,14 @@ pub fn (mut re RE) find_all(in_txt string) []int { i += e continue } + /* + if e > 0 { + i += e + continue + } + */ + i++ } - i++ } // re.flag = old_flag return res @@ -306,6 +312,12 @@ pub fn (mut re RE) find_all_str(in_txt string) []string { continue } } + /* + if e > 0 { + i += e + continue + } + */ i++ } // re.flag = old_flag