From 88758082d24bc811b252118c5402cd97f7cd6919 Mon Sep 17 00:00:00 2001 From: d2verb Date: Wed, 24 Jul 2019 07:06:48 +0900 Subject: [PATCH] scanner.v: refactoring --- Makefile | 1 + compiler/parser.v | 27 +++- compiler/scanner.v | 289 +++++++++++++++++++++--------------------- compiler/token.v | 4 +- vlib/builtin/string.v | 8 ++ 5 files changed, 176 insertions(+), 153 deletions(-) diff --git a/Makefile b/Makefile index e5193f66fb..199fd9eb1f 100644 --- a/Makefile +++ b/Makefile @@ -25,6 +25,7 @@ test: v clean: -rm -f v.c .v.c v vprod thirdparty/**/*.o + find . -name '.*.c' -print0 | xargs -0 -n1 rm -f SOURCES = $(wildcard thirdparty/**/*.c) OBJECTS := ${SOURCES:.c=.o} diff --git a/compiler/parser.v b/compiler/parser.v index a39e7fd9cd..25e718621f 100644 --- a/compiler/parser.v +++ b/compiler/parser.v @@ -823,7 +823,7 @@ fn (p mut Parser) get_type() string { if p.tok == .lsbr { p.check(.lsbr) // [10]int - if p.tok == .integer { + if p.tok == .number { typ = '[$p.lit]' p.next() } @@ -834,9 +834,9 @@ fn (p mut Parser) get_type() string { // [10][3]int if p.tok == .lsbr { p.next() - if p.tok == .integer { + if p.tok == .number { typ += '[$p.lit]' - p.check(.integer) + p.check(.number) } else { is_arr2 = true @@ -2095,7 +2095,7 @@ fn (p mut Parser) term() string { p.next() p.gen(tok.str())// + ' /*op2*/ ') p.fgen(' ' + tok.str() + ' ') - if is_div && p.tok == .integer && p.lit == '0' { + if is_div && p.tok == .number && p.lit == '0' { p.error('division by zero') } if is_mod && (is_float_type(typ) || !is_number_type(typ)) { @@ -2129,7 +2129,7 @@ fn (p mut Parser) factor() string { mut typ := '' tok := p.tok switch tok { - case .integer: + case .number: typ = 'int' // Check if float (`1.0`, `1e+3`) but not if is hexa if (p.lit.contains('.') || (p.lit.contains('e') || p.lit.contains('E'))) && @@ -2452,7 +2452,7 @@ fn (p mut Parser) map_init() string { fn (p mut Parser) array_init() string { p.is_alloc = true p.check(.lsbr) - is_integer := p.tok == .integer + is_integer := p.tok == .number lit := p.lit mut typ := '' new_arr_ph := p.cgen.add_placeholder() @@ -3334,7 +3334,7 @@ fn (p mut Parser) return_st() { } else { // Don't allow `return val` in functions that don't return anything - if false && p.tok == .name || p.tok == .integer { + if false && p.tok == .name || p.tok == .number { p.error('function `$p.cur_fn.name` does not return a value') } @@ -3529,3 +3529,16 @@ fn (p mut Parser) fspace() { fn (p mut Parser) fgenln(s string) { p.scanner.fgenln(s) } + +fn (p mut Parser) peek() Token { + for { + tok := p.scanner.peek() + if tok != .nl { + return tok + } + } +} + +fn (p mut Parser) create_type_string(T Type, name string) { + p.scanner.create_type_string(T, name) +} diff --git a/compiler/scanner.v b/compiler/scanner.v index b156bfd2c9..dcaa432acd 100644 --- a/compiler/scanner.v +++ b/compiler/scanner.v @@ -26,16 +26,11 @@ mut: prev_tok Token } -const ( - SingleQuote = `\'` - //QUOTE = `"` -) - fn new_scanner(file_path string) *Scanner { if !os.file_exists(file_path) { panic('"$file_path" doesn\'t exist') } - //text := os.read_file(file_path) + mut raw_text := os.read_file(file_path) or { panic('scanner: failed to open "$file_path"') return &Scanner{} @@ -60,7 +55,6 @@ fn new_scanner(file_path string) *Scanner { fmt_out: strings.new_builder(1000) } - // println('new scanner "$file_path" txt.len=$scanner.text.len') return scanner } @@ -74,14 +68,6 @@ fn scan_res(tok Token, lit string) ScanRes { return ScanRes{tok, lit} } -fn is_white(c byte) bool { - return c.is_white() -} - -fn is_nl(c byte) bool { - return c == `\r` || c == `\n` -} - fn (s mut Scanner) ident_name() string { start := s.pos for { @@ -99,51 +85,119 @@ fn (s mut Scanner) ident_name() string { return name } -fn (s mut Scanner) ident_number() string { - start := s.pos - is_hex := s.pos + 1 < s.text.len && s.text[s.pos] == `0` && s.text[s.pos + 1] == `x` - is_oct := !is_hex && s.text[s.pos] == `0` - mut is_float := false +fn (s mut Scanner) ident_hex_number() string { + start_pos := s.pos + s.pos += 2 // skip '0x' for { - s.pos++ if s.pos >= s.text.len { break } c := s.text[s.pos] - if c == `.` { - is_float = true - } - is_good_hex := is_hex && (c == `x` || (c >= `a` && c <= `f`) || (c >= `A` && c <= `F`)) - // 1e+3, 1e-3, 1e3 - if !is_hex && c == `e` && s.pos + 1 < s.text.len { - next := s.text[s.pos + 1] - if next == `+` || next == `-` || next.is_digit() { - s.pos++ - continue - } - } - if !c.is_digit() && c != `.` && !is_good_hex { + if !c.is_hex_digit() { break } - // 1..9 - if c == `.` && s.pos + 1 < s.text.len && s.text[s.pos + 1] == `.` { - break - } - if is_oct && c >= `8` && !is_float { - s.error('malformed octal constant') - } + s.pos++ } - number := s.text.substr(start, s.pos) + number := s.text.substr(start_pos, s.pos) s.pos-- return number } +fn (s mut Scanner) ident_oct_number() string { + start_pos := s.pos + for { + if s.pos >= s.text.len { + break + } + c := s.text[s.pos] + if c.is_digit() { + if !c.is_oct_digit() { + s.error('malformed octal constant') + } + } else { + break + } + s.pos++ + } + number := s.text.substr(start_pos, s.pos) + s.pos-- + return number +} + +fn (s mut Scanner) ident_dec_number() string { + start_pos := s.pos + + // scan integer part + for s.text[s.pos].is_digit() { + s.pos++ + } + + // e.g. 1..9 + // we just return '1' and don't scan '..9' + if s.expect('..', s.pos) { + number := s.text.substr(start_pos, s.pos) + s.pos-- + return number + } + + // scan fractional part + if s.text[s.pos] == `.` { + s.pos++ + for s.text[s.pos].is_digit() { + s.pos++ + } + } + + // scan exponential part + mut has_exponential_part := false + if s.expect('e+', s.pos) || s.expect('e-', s.pos) { + exp_start_pos := s.pos += 2 + for s.text[s.pos].is_digit() { + s.pos++ + } + if exp_start_pos == s.pos { + s.error('exponent has no digits') + } + has_exponential_part = true + } + + // error check: 1.23.4, 123.e+3.4 + if s.text[s.pos] == `.` { + if has_exponential_part { + s.error('exponential part should be integer') + } + else { + s.error('too many decimal points in number') + } + } + + number := s.text.substr(start_pos, s.pos) + s.pos-- + return number +} + +fn (s mut Scanner) ident_number() string { + if s.expect('0x', s.pos) { + return s.ident_hex_number() + } + + if s.expect('0.', s.pos) || s.expect('0e', s.pos) { + return s.ident_dec_number() + } + + if s.text[s.pos] == `0` { + return s.ident_oct_number() + } + + return s.ident_dec_number() +} + fn (s Scanner) has_gone_over_line_end() bool { mut i := s.pos-1 - for i >= 0 && !is_white(s.text[i]) { + for i >= 0 && !s.text[i].is_white() { i-- } - for i >= 0 && is_white(s.text[i]) { + for i >= 0 && s.text[i].is_white() { if is_nl(s.text[i]) { return true } @@ -153,45 +207,21 @@ fn (s Scanner) has_gone_over_line_end() bool { } fn (s mut Scanner) skip_whitespace() { - for s.pos < s.text.len && is_white(s.text[s.pos]) { - if is_nl(s.text[s.pos]) { - // Count \r\n as one line - if !(s.text[s.pos] == `\n` && s.pos > 0 && s.text[s.pos-1] == `\r`) { + for s.pos < s.text.len && s.text[s.pos].is_white() { + // Count \r\n as one line + if is_nl(s.text[s.pos]) && !s.expect('\r\n', s.pos-1) { s.line_nr++ - } } s.pos++ } - // if s.pos == s.text.len { - // return scan_res(.eof, '') - // } } -fn (s mut Scanner) get_var_name(pos int) string { - mut pos_start := pos - - for ; pos_start >= 0 && s.text[pos_start] != `\n` && s.text[pos_start] != `;`; pos_start-- {} - pos_start++ - return s.text.substr(pos_start, pos) -} - -// CAO stands for Compound Assignment Operators (e.g '+=' ) -/* -fn (s mut Scanner) cao_change(operator string) { - s.text = s.text.substr(0, s.pos - operator.len) + ' = ' + s.get_var_name(s.pos - operator.len) + ' ' + operator + ' ' + s.text.substr(s.pos + 1, s.text.len) -} -*/ - fn (s mut Scanner) scan() ScanRes { -if s.line_comment != '' { - //s.fgenln('// LOL "$s.line_comment"') - //s.line_comment = '' -} - // if s.file_path == 'd.v' { - // println('\nscan()') - // } + if s.line_comment != '' { + //s.fgenln('// LOL "$s.line_comment"') + //s.line_comment = '' + } if s.started { - // || (s.pos == 0 && s.text.len > 0 && s.text[s.pos] == `\n`) { s.pos++ } s.started = true @@ -204,8 +234,7 @@ if s.line_comment != '' { } // End of $var, start next string if s.dollar_end { - // fmt.Println("end of $var, get string", s.pos, string(s.text[s.pos])) - if s.text[s.pos] == SingleQuote { + if s.text[s.pos] == `\'` { s.dollar_end = false return scan_res(.str, '') } @@ -215,7 +244,6 @@ if s.line_comment != '' { s.skip_whitespace() // end of file if s.pos >= s.text.len { - // println('scan(): returning .eof (pos >= len)') return scan_res(.eof, '') } // handle each char @@ -230,26 +258,19 @@ if s.line_comment != '' { // tmp hack to detect . in ${} // Check if not .eof to prevent panic next_char := if s.pos + 1 < s.text.len { s.text[s.pos + 1] } else { `\0` } - // println('!!! got name=$name next_char=$next_char') if is_key(name) { - // println('IS KEY') - // tok := (key_to_token(name)) - // println(tok.str()) return scan_res(key_to_token(name), '') } // 'asdf $b' => "b" is the last name in the string, dont start parsing string // at the next ', skip it if s.inside_string { - // println('is_letter inside string! nextc=${nextc.str()}') - if next_char == SingleQuote { - // println('var is last before QUOTE') + if next_char == `\'` { s.pos++ s.dollar_start = false s.inside_string = false } } - if s.dollar_start && next_char != `.` {//&& next_char != `(` { - // println('INSIDE .str .dollar var=$name') + if s.dollar_start && next_char != `.` { s.dollar_end = true s.dollar_start = false } @@ -263,7 +284,7 @@ if s.line_comment != '' { // `123`, `.123` else if c.is_digit() || c == `.` && nextc.is_digit() { num := s.ident_number() - return scan_res(.integer, num) + return scan_res(.number, num) } // all other tokens switch c { @@ -307,7 +328,7 @@ if s.line_comment != '' { return scan_res(.mod, '') case `?`: return scan_res(.question, '') - case SingleQuote: + case `\'`: return scan_res(.str, s.ident_string()) // TODO allow double quotes // case QUOTE: @@ -336,7 +357,7 @@ if s.line_comment != '' { if s.inside_string { s.pos++ // TODO UN.neEDED? - if s.text[s.pos] == SingleQuote { + if s.text[s.pos] == `\'` { s.inside_string = false return scan_res(.str, '') } @@ -458,7 +479,6 @@ if s.line_comment != '' { return scan_res(.div_assign, '') } if nextc == `/` { - // debug("!!!!!!.key_goT LI.ne COM") start := s.pos + 1 for s.pos < s.text.len && s.text[s.pos] != `\n`{ s.pos++ @@ -485,11 +505,11 @@ if s.line_comment != '' { s.line_nr++ continue } - if s.text[s.pos] == `/` && s.text[s.pos + 1] == `*` { + if s.expect('/*', s.pos) { nest_count++ continue } - if s.text[s.pos] == `*` && s.text[s.pos + 1] == `/` { + if s.expect('*/', s.pos) { nest_count-- } } @@ -507,7 +527,6 @@ if s.line_comment != '' { return scan_res(.eof, '') } } - println('(char code=$c) pos=$s.pos len=$s.text.len') mut msg := 'invalid character `${c.str()}`' if c == `"` { msg += ', use \' to denote strings' @@ -520,9 +539,6 @@ fn (s &Scanner) error(msg string) { file := s.file_path.all_after('/') println('panic: $file:${s.line_nr + 1}') println(msg) - // os.print_backtrace() - // println(file) - // println(s.file_path) exit(1) } @@ -531,10 +547,6 @@ fn (s &Scanner) error(msg string) { fn (s mut Scanner) ident_string() string { // println("\nidentString() at char=", string(s.text[s.pos]), // "chard=", s.text[s.pos], " pos=", s.pos, "txt=", s.text[s.pos:s.pos+7]) - debug := s.file_path.contains('test_test') - if debug { - println('identStr() $s.file_path line=$s.line_nr pos=$s.pos') - } mut start := s.pos s.inside_string = false slash := `\\` @@ -544,12 +556,9 @@ fn (s mut Scanner) ident_string() string { break } c := s.text[s.pos] - if debug { - println(c.str()) - } prevc := s.text[s.pos - 1] // end of string - if c == SingleQuote && (prevc != slash || (prevc == slash && s.text[s.pos - 2] == slash)) { + if c == `\'` && (prevc != slash || (prevc == slash && s.text[s.pos - 2] == slash)) { // handle '123\\' slash at the end break } @@ -561,8 +570,7 @@ fn (s mut Scanner) ident_string() string { s.error('0 character in a string literal') } // Don't allow \x00 - if c == `0` && s.pos > 5 && s.text[s.pos - 1] == `0` && s.text[s.pos - 2] == `x` && - s.text[s.pos - 3] == `\\` { + if c == `0` && s.pos > 5 && s.expect('\\x0', s.pos - 3) { s.error('0 character in a string literal') } // ${var} @@ -573,17 +581,15 @@ fn (s mut Scanner) ident_string() string { break } // $var - // if !s.is_fmt && c != `{` && c != ` ` && ! (c >= `0` && c <= `9`) && prevc == `$` { if (c.is_letter() || c == `_`) && prevc == `$` { s.inside_string = true s.dollar_start = true - // println('setting s.dollar=true pos=$s.pos') s.pos -= 2 break } } mut lit := '' - if s.text[start] == SingleQuote { + if s.text[start] == `\'` { start++ } mut end := s.pos @@ -594,17 +600,6 @@ fn (s mut Scanner) ident_string() string { else { lit = s.text.substr(start, end) } - // if lit.contains('\n') { - // println('\nstring lit="$lit" pos=$s.pos line=$s.line_nr') - // } - /* - for c in lit { - if s.file_path.contains('range_test') { - println('!') - println(c) - } - } -*/ return lit } @@ -620,7 +615,7 @@ fn (s mut Scanner) ident_char() string { if s.text[s.pos] != slash { len++ } - double_slash := s.text[s.pos - 1] == slash && s.text[s.pos - 2] == slash + double_slash := s.expect('\\\\', s.pos - 2) if s.text[s.pos] == `\`` && (s.text[s.pos - 1] != slash || double_slash) { if double_slash { len++ @@ -633,30 +628,24 @@ fn (s mut Scanner) ident_char() string { if len != 1 { u := c.ustring() if u.len != 1 { - s.error('invalid character literal (more than one character: $len)') + s.error('invalid character literal (more than one character: $len)') } } return c } -fn (p mut Parser) peek() Token { - for { - tok := p.scanner.peek() - if tok != .nl { - return tok - } - } -} - fn (s mut Scanner) peek() Token { + // save scanner state pos := s.pos line := s.line_nr inside_string := s.inside_string dollar_start := s.dollar_start dollar_end := s.dollar_end - // ///// + res := s.scan() tok := res.tok + + // restore scanner state s.pos = pos s.line_nr = line s.inside_string = inside_string @@ -665,30 +654,42 @@ fn (s mut Scanner) peek() Token { return tok } +fn (s mut Scanner) expect(want string, start_pos int) bool { + end_pos := start_pos + want.len + if start_pos < 0 || start_pos >= s.text.len { + return false + } + if end_pos < 0 || end_pos > s.text.len { + return false + } + for pos in start_pos..end_pos { + if s.text[pos] != want[pos-start_pos] { + return false + } + } + return true +} + fn (s mut Scanner) debug_tokens() { s.pos = 0 + s.debug = true + fname := s.file_path.all_after('/') println('\n===DEBUG TOKENS $fname===') - // allToks := '' - s.debug = true + for { res := s.scan() tok := res.tok lit := res.lit - // printiln(tok) print(tok.str()) - // allToks += tok.String() if lit != '' { println(' `$lit`') - // allToks += " `" + lit + "`" } else { println('') } - // allToks += "\n" if tok == .eof { println('============ END OF DEBUG TOKENS ==================') - // fmt.Println("========"+s.file+"========\n", allToks) break } } @@ -698,6 +699,10 @@ fn is_name_char(c byte) bool { return c.is_letter() || c == `_` } +fn is_nl(c byte) bool { + return c == `\r` || c == `\n` +} + fn (s mut Scanner) get_opening_bracket() int { mut pos := s.pos mut parentheses := 0 @@ -740,7 +745,3 @@ fn (s mut Scanner) create_type_string(T Type, name string) { s.line_nr = line s.inside_string = inside_string } - -fn (p mut Parser) create_type_string(T Type, name string) { - p.scanner.create_type_string(T, name) -} diff --git a/compiler/token.v b/compiler/token.v index 364299ebed..e1d3cec95c 100644 --- a/compiler/token.v +++ b/compiler/token.v @@ -7,7 +7,7 @@ module main enum Token { eof name // user - integer // 123 + number // 123 str // 'foo' str_inter // 'name=$user.name' chartoken // `A` @@ -127,7 +127,7 @@ fn build_token_str() []string { s[Token.keyword_end] = '' s[Token.eof] = '.eof' s[Token.name] = '.name' - s[Token.integer] = '.integer' + s[Token.number] = '.number' s[Token.str] = 'STR' s[Token.chartoken] = '.chartoken' s[Token.plus] = '+' diff --git a/vlib/builtin/string.v b/vlib/builtin/string.v index f5788863ae..33d4534f9e 100644 --- a/vlib/builtin/string.v +++ b/vlib/builtin/string.v @@ -714,6 +714,14 @@ pub fn (c byte) is_digit() bool { return c >= `0` && c <= `9` } +pub fn (c byte) is_hex_digit() bool { + return c.is_digit() || (c >= `a` && c <= `f`) || (c >= `A` && c <= `F`) +} + +pub fn (c byte) is_oct_digit() bool { + return c >= `0` && c <= `7` +} + pub fn (c byte) is_letter() bool { return (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`) }