From ca284482cb12d9a9a40833cabe232609f793ab50 Mon Sep 17 00:00:00 2001 From: Alexander Medvednikov Date: Sun, 22 Dec 2019 04:34:37 +0300 Subject: [PATCH] new AST built with sum types --- vlib/compiler/expression.v | 10 +- vlib/compiler/fn.v | 8 +- vlib/compiler/if_match.v | 29 +- vlib/compiler/parser.v | 26 +- vlib/compiler/scanner.v | 2 +- vlib/compiler/struct.v | 2 +- vlib/compiler/table.v | 13 +- vlib/compiler/tests/type_test.v | 7 +- vlib/compiler2/ast/ast.v | 85 +++ vlib/compiler2/fmt/fmt.v | 15 + vlib/compiler2/parser/parser.v | 81 +++ vlib/compiler2/parser/parser_test.v | 35 + vlib/compiler2/scanner/scanner.v | 911 ++++++++++++++++++++++++++ vlib/compiler2/scanner/scanner_test.v | 30 + vlib/compiler2/token/token.v | 305 +++++++++ 15 files changed, 1521 insertions(+), 38 deletions(-) create mode 100644 vlib/compiler2/ast/ast.v create mode 100644 vlib/compiler2/fmt/fmt.v create mode 100644 vlib/compiler2/parser/parser.v create mode 100644 vlib/compiler2/parser/parser_test.v create mode 100644 vlib/compiler2/scanner/scanner.v create mode 100644 vlib/compiler2/scanner/scanner_test.v create mode 100644 vlib/compiler2/token/token.v diff --git a/vlib/compiler/expression.v b/vlib/compiler/expression.v index de11e6c2dc..bfbca33853 100644 --- a/vlib/compiler/expression.v +++ b/vlib/compiler/expression.v @@ -47,8 +47,12 @@ fn (p mut Parser) bool_expression() string { p.error('expr() returns empty type') } if expected != typ && expected in p.table.sum_types { // TODO perf - p.cgen.set_placeholder(start_ph, '/*KUK*/($expected) { .obj = ($typ[]) { ') - p.gen('}, .typ = 1}')//${val}_type }') + p.cgen.set_placeholder(start_ph, + //'/*SUM TYPE CAST*/($expected) { .obj = &($typ[]) { ') + '/*SUM TYPE CAST*/($expected) { .obj = memdup(& ') + tt := typ.all_after('_') // TODO + //p.gen('}, .typ = SumType_${tt} }')//${val}_type }') + p.gen(', sizeof($typ) ), .typ = SumType_${tt} }')//${val}_type }') } return typ @@ -369,7 +373,7 @@ fn (p mut Parser) name_expr() string { 
//println(q) //println(q[idx]) arg_type := q[idx] - p.gen('($enum_type.name) { .obj = ($arg_type[]) { ') + p.gen('($enum_type.name) { .obj = ($arg_type[]) { ') p.bool_expression() p.check(.rpar) p.gen('}, .typ = ${val}_type }') diff --git a/vlib/compiler/fn.v b/vlib/compiler/fn.v index abf32dc210..8be88b659e 100644 --- a/vlib/compiler/fn.v +++ b/vlib/compiler/fn.v @@ -220,7 +220,7 @@ fn (p mut Parser) fn_decl() { mut f := Fn{ mod: p.mod is_public: is_pub || p.is_vh // functions defined in .vh are always public - + is_unsafe: p.attr == 'unsafe_fn' is_deprecated: p.attr == 'deprecated' comptime_define: if p.attr.starts_with('if ') { p.attr[3..] } else { '' } @@ -799,7 +799,7 @@ fn (p mut Parser) fn_call(f mut Fn, method_ph int, receiver_var, receiver_type s if f.is_method { receiver := f.args.first() mut receiver_is_interface := false - if receiver.typ.ends_with('er') { + if receiver.typ.ends_with('er') || receiver.typ[0] == `I` { // I absolutely love this syntax // `s.speak()` => // `((void (*)())(Speaker_name_table[s._interface_idx][1]))(s._object); @@ -893,7 +893,7 @@ fn (p mut Parser) fn_args(f mut Fn) { typ: typ is_arg: true // is_mut: is_mut - + line_nr: p.scanner.line_nr token_idx: p.cur_tok_index() } @@ -1083,7 +1083,7 @@ fn (p mut Parser) fn_call_args(f mut Fn, generic_param_types []string) { // fn run(r Animal) { ... } // `run(dog)` adds `Dog` to the `Animal` interface. // This is needed to generate an interface table. 
- if arg.typ.ends_with('er') { + if arg.typ.ends_with('er') || arg.typ[0] == `I` { t := p.table.find_type(arg.typ) if t.cat == .interface_ { // perform((Speaker) { ._object = &dog, diff --git a/vlib/compiler/if_match.v b/vlib/compiler/if_match.v index 6024c48ce1..72d4b18302 100644 --- a/vlib/compiler/if_match.v +++ b/vlib/compiler/if_match.v @@ -16,6 +16,9 @@ fn (p mut Parser) match_statement(is_expr bool) string { if typ.starts_with('array_') { p.error('arrays cannot be compared') } + is_sum_type := typ in p.table.sum_types + mut sum_child_type := '' + // is it safe to use p.cgen.insert_before ??? tmp_var := p.get_tmp() p.cgen.insert_before('$typ $tmp_var = $expr;') @@ -111,6 +114,7 @@ fn (p mut Parser) match_statement(is_expr bool) string { } ph := p.cgen.add_placeholder() // Multiple checks separated by comma + p.open_scope() mut got_comma := false for { if got_comma { @@ -121,11 +125,26 @@ fn (p mut Parser) match_statement(is_expr bool) string { got_string = true p.gen('string_eq($tmp_var, ') } + else if is_sum_type { + p.gen('${tmp_var}.typ == ') + } else { p.gen('$tmp_var == ') } p.expected_type = typ - p.check_types(p.bool_expression(), typ) + // `match node { ast.BoolExpr { it := node as BoolExpr ... 
} }` + if is_sum_type { + sum_child_type = p.get_type2().name + tt := sum_child_type.all_after('_') + p.gen('SumType_$tt') + //println('got child $sum_child_type') + p.register_var(Var{ + name: 'it' + typ: sum_child_type + }) + } else { + p.check_types(p.bool_expression(), typ) + } p.expected_type = '' if got_string { p.gen(')') @@ -169,12 +188,16 @@ fn (p mut Parser) match_statement(is_expr bool) string { p.fspace() p.check(.lcbr) p.genln('{ ') + if is_sum_type { + p.genln(' $sum_child_type it = *($sum_child_type*)$tmp_var .obj ;') + } p.statements() all_cases_return = all_cases_return && p.returns // p.gen(')') } i++ p.fgen_nl() + p.close_scope() } p.error('match must be exhaustive') // p.returns = false // only get here when no default, so return is not guaranteed @@ -229,12 +252,12 @@ fn (p mut Parser) if_statement(is_expr bool, elif_depth int) string { name: var_name typ: typ is_mut: false // TODO - + is_used: true // TODO // is_alloc: p.is_alloc || typ.starts_with('array_') // line_nr: p.tokens[ var_token_idx ].line_nr // token_idx: var_token_idx - + }) p.statements() p.close_scope() diff --git a/vlib/compiler/parser.v b/vlib/compiler/parser.v index 6562792291..7afd30acf4 100644 --- a/vlib/compiler/parser.v +++ b/vlib/compiler/parser.v @@ -787,7 +787,7 @@ fn (p mut Parser) type_decl() { } p.check(.key_type) p.fspace() - name := p.check_name() + mut name := p.check_name() p.fspace() // V used to have 'type Foo struct', many Go users might use this syntax if p.tok == .key_struct { @@ -801,6 +801,9 @@ fn (p mut Parser) type_decl() { // Sum type is_sum := p.tok == .pipe if is_sum { + if !p.builtin_mod && p.mod != 'main' { + name = p.prepend_mod(name) + } // Register the first child (name we already parsed) /* p.table.register_type(Type{ @@ -811,26 +814,21 @@ fn (p mut Parser) type_decl() { }) */ // Register the rest of them + mut idx := 0 for p.tok == .pipe { + idx++ p.next() - child := p.check_name() + child_type_name := p.check_name() if p.pass == .main { // 
Update the type's parent - println('child=$child parent=$name') - mut t := p.table.find_type(child) + //println('child=$child_type_name parent=$name') + mut t := p.find_type(child_type_name) if t.name == '' { - p.error('unknown type `$child`') + p.error('qunknown type `$child_type_name`') } t.parent = name p.table.rewrite_type(t) - /* - p.table.register_type(Type{ - parent: name - name: child - mod: p.mod - is_public: is_pub - }) - */ + p.cgen.consts << '#define SumType_$child_type_name $idx // DEF2' } } if p.pass == .decl { @@ -838,7 +836,7 @@ fn (p mut Parser) type_decl() { println(p.table.sum_types) } // Register the actual sum type - println('reging sum $name') + //println('registering sum $name') p.table.register_type(Type{ name: name mod: p.mod diff --git a/vlib/compiler/scanner.v b/vlib/compiler/scanner.v index 1bfebd1538..fc9ba3c351 100644 --- a/vlib/compiler/scanner.v +++ b/vlib/compiler/scanner.v @@ -15,7 +15,7 @@ const ( error_context_after = 2 // ^^^ same, but after ) -struct Scanner { +pub struct Scanner { mut: file_path string text string diff --git a/vlib/compiler/struct.v b/vlib/compiler/struct.v index 6f6076304a..db8c4dfaad 100644 --- a/vlib/compiler/struct.v +++ b/vlib/compiler/struct.v @@ -38,7 +38,7 @@ fn (p mut Parser) struct_decl(generic_param_types []string) { if !p.builtin_mod && !name[0].is_capital() { p.error('mod=$p.mod struct names must be capitalized: use `struct ${name.capitalize()}`') } - if is_interface && !name.ends_with('er') { + if is_interface && !name.ends_with('er') && name[0] != `I` { p.error('interface names temporarily have to end with `er` (e.g. 
`Speaker`, `Reader`)') } mut generic_types := map[string]string diff --git a/vlib/compiler/table.v b/vlib/compiler/table.v index f74103cbea..ed71a3658f 100644 --- a/vlib/compiler/table.v +++ b/vlib/compiler/table.v @@ -594,9 +594,9 @@ fn (t &Table) find_type(name_ string) Type { } fn (p mut Parser) check_types2(got_, expected_ string, throw bool) bool { - if p.fileis('type_test') { - println('got=$got_ exp=$expected_') - } + //if p.fileis('type_test') { + //println('got=$got_ exp=$expected_') + //} mut got := got_ mut expected := expected_ // p.log('check types got="$got" exp="$expected" ') @@ -724,18 +724,17 @@ fn (p mut Parser) check_types2(got_, expected_ string, throw bool) bool { got = got.replace('*', '').replace('ptr', '') if got != expected { // Interface check - if expected.ends_with('er') { + if expected.ends_with('er') || expected[0] == `I` { if p.satisfies_interface(expected, got, throw) { return true } } // Sum type - println(expected) if expected in p.table.sum_types { - println('checking sum') + //println('checking sum') child := p.table.find_type(got) if child.parent == expected { - println('yep $expected') + //println('yep $expected') return true } } diff --git a/vlib/compiler/tests/type_test.v b/vlib/compiler/tests/type_test.v index 4509b11f7f..0a0ae2acc4 100644 --- a/vlib/compiler/tests/type_test.v +++ b/vlib/compiler/tests/type_test.v @@ -20,9 +20,7 @@ fn test_person_str() { struct Foo {} -struct WTF { - wtf int -} +type Expr = Foo | BoolExpr | BinExpr | UnaryExpr struct BoolExpr { foo int @@ -37,7 +35,6 @@ struct UnaryExpr { } -type Expr = Foo | BoolExpr | BinExpr | UnaryExpr fn handle_expr(e Expr) { @@ -47,7 +44,7 @@ fn parse_bool() BoolExpr { return BoolExpr{} } -fn test_sum() { +fn test_sum_types() { b := parse_bool() handle_expr(b) } diff --git a/vlib/compiler2/ast/ast.v b/vlib/compiler2/ast/ast.v new file mode 100644 index 0000000000..5960117e5b --- /dev/null +++ b/vlib/compiler2/ast/ast.v @@ -0,0 +1,85 @@ +// Copyright (c) 2019 Alexander 
Medvednikov. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module ast + +import ( + compiler2.token +) + + +struct Foo {} + +pub type Expr = Foo | IfExpr | BinaryExpr | IntegerExpr + +pub struct IntegerExpr { +pub: + val int +} + +/* +pub enum Expr { + Binary(BinaryExpr) + If(IfExpr) + Integer(IntegerExpr) +} +*/ + +pub struct Stmt { + pos int + //end int +} + +// A single identifier +struct Ident { + token token.Token + value string +} + +pub struct BinaryExpr { +pub: + token token.Token + //op BinaryOp + op token.Token + left Expr + right Expr +} + +struct IfExpr { + token token.Token + cond Expr + body []Stmt + else_ []Stmt +} + +struct ReturnStmt { + token token.Token // or pos + results []Expr +} + +enum BinaryOp { + sum + difference + product + quotient + remainder + bitwise_and + bitwise_or + bitwise_xor + left_shift + right_shift + + equality + inequality + less_than + less_than_or_equal + more_than + more_than_or_equal + + in_check + + //These are suffixed with `bool` to prevent conflict with the keyword `or` + and_bool + or_bool +} + diff --git a/vlib/compiler2/fmt/fmt.v b/vlib/compiler2/fmt/fmt.v new file mode 100644 index 0000000000..f2628d8a5e --- /dev/null +++ b/vlib/compiler2/fmt/fmt.v @@ -0,0 +1,15 @@ +// Copyright (c) 2019 Alexander Medvednikov. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module fmt + +struct Fmt { + // vfmt fields TODO move to a separate struct + // fmt_out strings.Builder + fmt_lines []string + // fmt_line string + fmt_indent int + fmt_line_empty bool + // fmt_needs_nl bool + +} diff --git a/vlib/compiler2/parser/parser.v b/vlib/compiler2/parser/parser.v new file mode 100644 index 0000000000..d3b403fe45 --- /dev/null +++ b/vlib/compiler2/parser/parser.v @@ -0,0 +1,81 @@ +// Copyright (c) 2019 Alexander Medvednikov. All rights reserved. 
+// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module parser + +import ( + compiler2.scanner + compiler2.ast + compiler2.token +) + +struct Parser { + scanner &scanner.Scanner +mut: + tok token.Token + lit string +} + +pub fn parse_expr(text string) ast.Expr { + mut s := scanner.new_scanner(text) + res := s.scan() + mut p := Parser{ + scanner: s + tok: res.tok + lit: res.lit + } + return p.expr() +} + +fn (p mut Parser) next() { + res := p.scanner.scan() + p.tok = res.tok + //println(p.tok.str()) + p.lit = res.lit +} + +fn (p mut Parser) expr() ast.Expr { + //println('\n\nexpr()') + mut node := p.term() + for p.tok == .plus || p.tok == .minus { + op := p.tok + p.next() + node = ast.BinaryExpr { + left: node + op: op + right: p.term() + } + } + return node +} + +fn (p mut Parser) term() ast.Expr { + mut node := p.factor() + for p.tok == .mul || p.tok == .div || p.tok == .mod { + op := p.tok + p.next() + node = ast.BinaryExpr { + left: node + op: op + right: p.factor() + } + } + return node + //return ast.BinaryExpr{} + //return ast.Expr.Binary(ast.BinaryExpr{}) +} + +fn (p mut Parser) factor() ast.Expr { + if p.tok == .number { + val := p.lit.int() + p.next() + return ast.IntegerExpr { val: val } + } else { + println('bad factor token') + println(p.tok) + exit(1) + } +} + + + diff --git a/vlib/compiler2/parser/parser_test.v b/vlib/compiler2/parser/parser_test.v new file mode 100644 index 0000000000..2d5231692d --- /dev/null +++ b/vlib/compiler2/parser/parser_test.v @@ -0,0 +1,35 @@ +module parser + +import compiler2.ast + +fn test_parser() { + //expr := ast.IntegerExpr {val:10} + //expr := ast.BinaryExpr{} + expr := parse_expr('3 + 7') + walk(expr) + println('') +} + +fn walk(node ast.Expr) { + //println('walk()') + match node { + ast.IntegerExpr { + print(it.val) + } + ast.BinaryExpr { + walk(it.left) + match it.op { + .plus { + print(' + ') + } + .minus { + print(' - ') + } + else {} + + } + walk(it.right) + } 
+ else {} + } +} diff --git a/vlib/compiler2/scanner/scanner.v b/vlib/compiler2/scanner/scanner.v new file mode 100644 index 0000000000..86ff4f3dbd --- /dev/null +++ b/vlib/compiler2/scanner/scanner.v @@ -0,0 +1,911 @@ +// Copyright (c) 2019 Alexander Medvednikov. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module scanner + +import ( + os + compiler2.token + // strings +) + +const ( + single_quote = `\'` + double_quote = `"` + error_context_before = 2 // how many lines of source context to print before the pointer line + error_context_after = 2 // ^^^ same, but after +) + +pub struct Scanner { +mut: + file_path string + text string + pos int + line_nr int + last_nl_pos int // for calculating column + inside_string bool + inter_start bool // for hacky string interpolation TODO simplify + inter_end bool + debug bool + line_comment string + //prev_tok TokenKind + started bool + fn_name string // needed for @FN + print_line_on_error bool + print_colored_error bool + print_rel_paths_on_error bool + quote byte // which quote is used to denote current string: ' or " + line_ends []int // the positions of source lines ends (i.e. \n signs) + nr_lines int // total number of lines in the source file that were scanned + is_vh bool // Keep newlines + is_fmt bool // Used only for skipping ${} in strings, since we need literal + // string values when generating formatted code. +} +// new scanner from file. 
+fn new_scanner_file(file_path string) &Scanner { + if !os.exists(file_path) { + verror("$file_path doesn't exist") + } + mut raw_text := os.read_file(file_path)or{ + verror('scanner: failed to open $file_path') + return 0 + } + // BOM check + if raw_text.len >= 3 { + c_text := raw_text.str + if c_text[0] == 0xEF && c_text[1] == 0xBB && c_text[2] == 0xBF { + // skip three BOM bytes + offset_from_begin := 3 + raw_text = tos(c_text[offset_from_begin], vstrlen(c_text) - offset_from_begin) + } + } + mut s := new_scanner(raw_text) + //s.init_fmt() + s.file_path = file_path + return s +} + +// new scanner from string. +pub fn new_scanner(text string) &Scanner { + return &Scanner{ + text: text + print_line_on_error: true + print_colored_error: true + print_rel_paths_on_error: true + } +} + +// TODO remove once multiple return values are implemented +pub struct ScanRes { +pub: + tok token.Token + lit string +} + +fn scan_res(tok token.Token, lit string) ScanRes { + return ScanRes{ + tok,lit} +} + +fn (s mut Scanner) ident_name() string { + start := s.pos + for { + s.pos++ + if s.pos >= s.text.len { + break + } + c := s.text[s.pos] + if !is_name_char(c) && !c.is_digit() { + break + } + } + name := s.text[start..s.pos] + s.pos-- + return name +} + +fn (s mut Scanner) ident_hex_number() string { + start_pos := s.pos + s.pos += 2 // skip '0x' + for { + if s.pos >= s.text.len { + break + } + c := s.text[s.pos] + if !c.is_hex_digit() { + break + } + s.pos++ + } + number := s.text[start_pos..s.pos] + s.pos-- + return number +} + +fn (s mut Scanner) ident_oct_number() string { + start_pos := s.pos + for { + if s.pos >= s.text.len { + break + } + c := s.text[s.pos] + if c.is_digit() { + if !c.is_oct_digit() { + s.error('malformed octal constant') + } + } + else { + break + } + s.pos++ + } + number := s.text[start_pos..s.pos] + s.pos-- + return number +} + +fn (s mut Scanner) ident_dec_number() string { + start_pos := s.pos + // scan integer part + for s.pos < s.text.len && 
s.text[s.pos].is_digit() { + s.pos++ + } + // e.g. 1..9 + // we just return '1' and don't scan '..9' + if s.expect('..', s.pos) { + number := s.text[start_pos..s.pos] + s.pos-- + return number + } + // scan fractional part + if s.pos < s.text.len && s.text[s.pos] == `.` { + s.pos++ + for s.pos < s.text.len && s.text[s.pos].is_digit() { + s.pos++ + } + if !s.inside_string && s.pos < s.text.len && s.text[s.pos] == `f` { + s.error('no `f` is needed for floats') + } + } + // scan exponential part + mut has_exponential_part := false + if s.expect('e+', s.pos) || s.expect('e-', s.pos) { + exp_start_pos := s.pos += 2 + for s.pos < s.text.len && s.text[s.pos].is_digit() { + s.pos++ + } + if exp_start_pos == s.pos { + s.error('exponent has no digits') + } + has_exponential_part = true + } + // error check: 1.23.4, 123.e+3.4 + if s.pos < s.text.len && s.text[s.pos] == `.` { + if has_exponential_part { + s.error('exponential part should be integer') + } + else { + s.error('too many decimal points in number') + } + } + number := s.text[start_pos..s.pos] + s.pos-- + return number +} + +fn (s mut Scanner) ident_number() string { + if s.expect('0x', s.pos) { + return s.ident_hex_number() + } + if s.expect('0.', s.pos) || s.expect('0e', s.pos) { + return s.ident_dec_number() + } + if s.text[s.pos] == `0` { + return s.ident_oct_number() + } + return s.ident_dec_number() +} + +fn (s mut Scanner) skip_whitespace() { + // if s.is_vh { println('vh') return } + for s.pos < s.text.len && s.text[s.pos].is_white() { + if is_nl(s.text[s.pos]) && s.is_vh { + return + } + // Count \r\n as one line + if is_nl(s.text[s.pos]) && !s.expect('\r\n', s.pos - 1) { + s.inc_line_number() + } + s.pos++ + } +} + +fn (s mut Scanner) end_of_file() ScanRes { + s.pos = s.text.len + s.inc_line_number() + return scan_res(.eof, '') +} + +pub fn (s mut Scanner) scan() ScanRes { + // if s.line_comment != '' { + // s.fgenln('// LC "$s.line_comment"') + // s.line_comment = '' + // } + if s.started { + s.pos++ + } + 
s.started = true + if s.pos >= s.text.len { + return s.end_of_file() + } + if !s.inside_string { + s.skip_whitespace() + } + // End of $var, start next string + if s.inter_end { + if s.text[s.pos] == s.quote { + s.inter_end = false + return scan_res(.str, '') + } + s.inter_end = false + return scan_res(.str, s.ident_string()) + } + s.skip_whitespace() + // end of file + if s.pos >= s.text.len { + return s.end_of_file() + } + // handle each char + c := s.text[s.pos] + mut nextc := `\0` + if s.pos + 1 < s.text.len { + nextc = s.text[s.pos + 1] + } + // name or keyword + if is_name_char(c) { + name := s.ident_name() + // tmp hack to detect . in ${} + // Check if not .eof to prevent panic + next_char := if s.pos + 1 < s.text.len { s.text[s.pos + 1] } else { `\0` } + if token.is_key(name) { + return scan_res(token.key_to_token(name), '') + } + // 'asdf $b' => "b" is the last name in the string, dont start parsing string + // at the next ', skip it + if s.inside_string { + if next_char == s.quote { + s.inter_end = true + s.inter_start = false + s.inside_string = false + } + } + // end of `$expr` + // allow `'$a.b'` and `'$a.c()'` + if s.inter_start && next_char != `.` && next_char != `(` { + s.inter_end = true + s.inter_start = false + } + if s.pos == 0 && next_char == ` ` { + // If a single letter name at the start of the file, increment + // Otherwise the scanner would be stuck at s.pos = 0 + s.pos++ + } + return scan_res(.name, name) + } + // `123`, `.123` + else if c.is_digit() || (c == `.` && nextc.is_digit()) { + num := s.ident_number() + return scan_res(.number, num) + } + // Handle `'$fn()'` + if c == `)` && s.inter_start { + s.inter_end = true + s.inter_start = false + next_char := if s.pos + 1 < s.text.len { s.text[s.pos + 1] } else { `\0` } + if next_char == s.quote { + s.inside_string = false + } + return scan_res(.rpar, '') + } + // all other tokens + match c { + `+` { + if nextc == `+` { + s.pos++ + return scan_res(.inc, '') + } + else if nextc == `=` { + 
s.pos++ + return scan_res(.plus_assign, '') + } + return scan_res(.plus, '') + } + `-` { + if nextc == `-` { + s.pos++ + return scan_res(.dec, '') + } + else if nextc == `=` { + s.pos++ + return scan_res(.minus_assign, '') + } + return scan_res(.minus, '') + } + `*` { + if nextc == `=` { + s.pos++ + return scan_res(.mult_assign, '') + } + return scan_res(.mul, '') + } + `^` { + if nextc == `=` { + s.pos++ + return scan_res(.xor_assign, '') + } + return scan_res(.xor, '') + } + `%` { + if nextc == `=` { + s.pos++ + return scan_res(.mod_assign, '') + } + return scan_res(.mod, '') + } + `?` { + return scan_res(.question, '') + } + single_quote, double_quote { + return scan_res(.str, s.ident_string()) + } + `\`` { + // ` // apostrophe balance comment. do not remove + return scan_res(.chartoken, s.ident_char()) + } + `(` { + return scan_res(.lpar, '') + } + `)` { + return scan_res(.rpar, '') + } + `[` { + return scan_res(.lsbr, '') + } + `]` { + return scan_res(.rsbr, '') + } + `{` { + // Skip { in `${` in strings + if s.inside_string { + return s.scan() + } + return scan_res(.lcbr, '') + } + `$` { + if s.inside_string { + return scan_res(.str_dollar, '') + } + else { + return scan_res(.dollar, '') + } + } + `}` { + // s = `hello $name !` + // s = `hello ${name} !` + if s.inside_string { + s.pos++ + if s.text[s.pos] == s.quote { + s.inside_string = false + return scan_res(.str, '') + } + return scan_res(.str, s.ident_string()) + } + else { + return scan_res(.rcbr, '') + } + } + `&` { + if nextc == `=` { + s.pos++ + return scan_res(.and_assign, '') + } + if nextc == `&` { + s.pos++ + return scan_res(.and, '') + } + return scan_res(.amp, '') + } + `|` { + if nextc == `|` { + s.pos++ + return scan_res(.logical_or, '') + } + if nextc == `=` { + s.pos++ + return scan_res(.or_assign, '') + } + return scan_res(.pipe, '') + } + `,` { + return scan_res(.comma, '') + } + `@` { + s.pos++ + name := s.ident_name() + // @FN => will be substituted with the name of the current V 
function + // @FILE => will be substituted with the path of the V source file + // @LINE => will be substituted with the V line number where it appears (as a string). + // @COLUMN => will be substituted with the column where it appears (as a string). + // @VHASH => will be substituted with the shortened commit hash of the V compiler (as a string). + // This allows things like this: + // println( 'file: ' + @FILE + ' | line: ' + @LINE + ' | fn: ' + @FN) + // ... which is useful while debugging/tracing + if name == 'FN' { + return scan_res(.str, s.fn_name) + } + if name == 'FILE' { + return scan_res(.str, cescaped_path(os.realpath(s.file_path))) + } + if name == 'LINE' { + return scan_res(.str, (s.line_nr + 1).str()) + } + if name == 'COLUMN' { + return scan_res(.str, (s.current_column()).str()) + } + if name == 'VHASH' { + return scan_res(.str, vhash()) + } + if !token.is_key(name) { + s.error('@ must be used before keywords (e.g. `@type string`)') + } + return scan_res(.name, name) + } + /* + case `\r`: + if nextc == `\n` { + s.pos++ + s.last_nl_pos = s.pos + return scan_res(.nl, '') + } + } + case `\n`: + s.last_nl_pos = s.pos + return scan_res(.nl, '') + } + */ + + `.` { + if nextc == `.` { + s.pos++ + if s.text[s.pos + 1] == `.` { + s.pos++ + return scan_res(.ellipsis, '') + } + return scan_res(.dotdot, '') + } + return scan_res(.dot, '') + } + `#` { + start := s.pos + 1 + s.ignore_line() + if nextc == `!` { + // treat shebang line (#!) 
as a comment + s.line_comment = s.text[start + 1..s.pos].trim_space() + // s.fgenln('// shebang line "$s.line_comment"') + return s.scan() + } + hash := s.text[start..s.pos] + return scan_res(.hash, hash.trim_space()) + } + `>` { + if nextc == `=` { + s.pos++ + return scan_res(.ge, '') + } + else if nextc == `>` { + if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` { + s.pos += 2 + return scan_res(.righ_shift_assign, '') + } + s.pos++ + return scan_res(.righ_shift, '') + } + else { + return scan_res(.gt, '') + } + } + 0xE2 { + // case `≠`: + if nextc == 0x89 && s.text[s.pos + 2] == 0xA0 { + s.pos += 2 + return scan_res(.ne, '') + } + // ⩽ + else if nextc == 0x89 && s.text[s.pos + 2] == 0xBD { + s.pos += 2 + return scan_res(.le, '') + } + // ⩾ + else if nextc == 0xA9 && s.text[s.pos + 2] == 0xBE { + s.pos += 2 + return scan_res(.ge, '') + } + } + `<` { + if nextc == `=` { + s.pos++ + return scan_res(.le, '') + } + else if nextc == `<` { + if s.pos + 2 < s.text.len && s.text[s.pos + 2] == `=` { + s.pos += 2 + return scan_res(.left_shift_assign, '') + } + s.pos++ + return scan_res(.left_shift, '') + } + else { + return scan_res(.lt, '') + } + } + `=` { + if nextc == `=` { + s.pos++ + return scan_res(.eq, '') + } + else if nextc == `>` { + s.pos++ + return scan_res(.arrow, '') + } + else { + return scan_res(.assign, '') + } + } + `:` { + if nextc == `=` { + s.pos++ + return scan_res(.decl_assign, '') + } + else { + return scan_res(.colon, '') + } + } + `;` { + return scan_res(.semicolon, '') + } + `!` { + if nextc == `=` { + s.pos++ + return scan_res(.ne, '') + } + else { + return scan_res(.not, '') + } + } + `~` { + return scan_res(.bit_not, '') + } + `/` { + if nextc == `=` { + s.pos++ + return scan_res(.div_assign, '') + } + if nextc == `/` { + start := s.pos + 1 + s.ignore_line() + s.line_comment = s.text[start + 1..s.pos] + s.line_comment = s.line_comment.trim_space() + if s.is_fmt { + s.pos-- // fix line_nr, \n was read, and the comment is marked on the next 
line + s.line_nr-- + return scan_res(.line_comment, s.line_comment) + } + // s.fgenln('// ${s.prev_tok.str()} "$s.line_comment"') + // Skip the comment (return the next token) + return s.scan() + } + // Multiline comments + if nextc == `*` { + start := s.pos + mut nest_count := 1 + // Skip comment + for nest_count > 0 { + s.pos++ + if s.pos >= s.text.len { + s.line_nr-- + s.error('comment not terminated') + } + if s.text[s.pos] == `\n` { + s.inc_line_number() + continue + } + if s.expect('/*', s.pos) { + nest_count++ + continue + } + if s.expect('*/', s.pos) { + nest_count-- + } + } + s.pos++ + end := s.pos + 1 + comment := s.text[start..end] + if s.is_fmt { + s.line_comment = comment + return scan_res(.mline_comment, s.line_comment) + } + // Skip if not in fmt mode + return s.scan() + } + return scan_res(.div, '') + } + else { + }} + $if windows { + if c == `\0` { + return s.end_of_file() + } + } + s.error('invalid character `${c.str()}`') + return s.end_of_file() +} + +fn (s &Scanner) current_column() int { + return s.pos - s.last_nl_pos +} + +fn (s Scanner) count_symbol_before(p int, sym byte) int { + mut count := 0 + for i := p; i >= 0; i-- { + if s.text[i] != sym { + break + } + count++ + } + return count +} + +fn (s mut Scanner) ident_string() string { + q := s.text[s.pos] + is_quote := q == single_quote || q == double_quote + is_raw := is_quote && s.text[s.pos - 1] == `r` + if is_quote && !s.inside_string { + s.quote = q + } + // if s.file_path.contains('string_test') { + // println('\nident_string() at char=${s.text[s.pos].str()}') + // println('linenr=$s.line_nr quote= $qquote ${qquote.str()}') + // } + mut start := s.pos + s.inside_string = false + slash := `\\` + for { + s.pos++ + if s.pos >= s.text.len { + break + } + c := s.text[s.pos] + prevc := s.text[s.pos - 1] + // end of string + if c == s.quote && (prevc != slash || (prevc == slash && s.text[s.pos - 2] == slash)) { + // handle '123\\' slash at the end + break + } + if c == `\n` { + 
s.inc_line_number() + } + // Don't allow \0 + if c == `0` && s.pos > 2 && s.text[s.pos - 1] == slash { + if s.pos < s.text.len - 1 && s.text[s.pos + 1].is_digit() { + } + else { + s.error('0 character in a string literal') + } + } + // Don't allow \x00 + if c == `0` && s.pos > 5 && s.expect('\\x0', s.pos - 3) { + s.error('0 character in a string literal') + } + // ${var} (ignore in vfmt mode) + if c == `{` && prevc == `$` && !is_raw && !s.is_fmt && s.count_symbol_before(s.pos - 2, slash) % 2 == 0 { + s.inside_string = true + // so that s.pos points to $ at the next step + s.pos -= 2 + break + } + // $var + if is_name_char(c) && prevc == `$` && !s.is_fmt && !is_raw && s.count_symbol_before(s.pos - 2, slash) % 2 == 0 { + s.inside_string = true + s.inter_start = true + s.pos -= 2 + break + } + } + mut lit := '' + if s.text[start] == s.quote { + start++ + } + mut end := s.pos + if s.inside_string { + end++ + } + if start > s.pos { + } + else { + lit = s.text[start..end] + } + return lit +} + +fn (s mut Scanner) ident_char() string { + start := s.pos + slash := `\\` + mut len := 0 + for { + s.pos++ + if s.pos >= s.text.len { + break + } + if s.text[s.pos] != slash { + len++ + } + double_slash := s.expect('\\\\', s.pos - 2) + if s.text[s.pos] == `\`` && (s.text[s.pos - 1] != slash || double_slash) { + // ` // apostrophe balance comment. do not remove + if double_slash { + len++ + } + break + } + } + len-- // the closing backtick was counted by the loop as well + c := s.text[start + 1..s.pos] + if len != 1 { // more than one byte: accepted only if it is a single ustring character + u := c.ustring() + if u.len != 1 { + s.error('invalid character literal (more than one character)\n' + 'use quotes for strings, backticks for characters') + } + } + if c == '\\`' { + return '`' + } + // A lone `'` is returned with a backslash in front of it + return if c == "\'" { '\\' + c } else { c } +} + +fn (s &Scanner) expect(want string, start_pos int) bool { + end_pos := start_pos + want.len // exclusive end of the compared range + if start_pos < 0 || start_pos >= s.text.len { + return false + } + if end_pos < 0 || end_pos > s.text.len { + return false + } + for pos in start_pos .. 
end_pos { + if s.text[pos] != want[pos - start_pos] { + return false + } + } + return true +} + +fn (s mut Scanner) debug_tokens() { + s.pos = 0 + s.started = false + s.debug = true + fname := s.file_path.all_after(os.path_separator) + println('\n===DEBUG TOKENS $fname===') + for { + res := s.scan() + tok := res.tok + lit := res.lit + print(tok.str()) + if lit != '' { + println(' `$lit`') + } + else { + println('') + } + if tok == .eof { + println('============ END OF DEBUG TOKENS ==================') + break + } + } +} + +fn (s mut Scanner) ignore_line() { + s.eat_to_end_of_line() + s.inc_line_number() +} + +fn (s mut Scanner) eat_to_end_of_line() { + for s.pos < s.text.len && s.text[s.pos] != `\n` { + s.pos++ + } +} + +fn (s mut Scanner) inc_line_number() { + s.last_nl_pos = s.pos + s.line_nr++ + s.line_ends << s.pos // read back by line() to slice s.text + if s.line_nr > s.nr_lines { + s.nr_lines = s.line_nr + } +} + +fn (s Scanner) line(n int) string { + mut res := '' + if n >= 0 && n < s.line_ends.len { + nline_start := if n == 0 { 0 } else { s.line_ends[n - 1] } // end of the previous line, or 0 for the first one + nline_end := s.line_ends[n] + if nline_start <= nline_end { + res = s.text[nline_start..nline_end] + } + } + return res.trim_right('\r\n').trim_left('\r\n') // drop the line break characters at both ends +} + +fn is_name_char(c byte) bool { + return c == `_` || c.is_letter() +} + +[inline] +fn is_nl(c byte) bool { + return c == `\r` || c == `\n` +} + +fn contains_capital(s string) bool { + for c in s { + if c >= `A` && c <= `Z` { + return true + } + } + return false +} + +// good_type_name rejects names with three capitals in a row: HTTPRequest bad, +// HttpRequest good (names shorter than 4 bytes are always accepted) +fn good_type_name(s string) bool { + if s.len < 4 { + return true + } + for i in 2 .. 
s.len { + if s[i].is_capital() && s[i - 1].is_capital() && s[i - 2].is_capital() { + return false + } + } + return true +} + +// validate_var_name rejects names longer than 15 bytes that contain no `_`: +// registration_date good, registrationdate bad +fn (s &Scanner) validate_var_name(name string) { + if name.len > 15 && !name.contains('_') { + s.error('bad variable name `$name`\n' + 'looks like you have a multi-word name without separating them with `_`' + '\nfor example, use `registration_date` instead of `registrationdate` ') + } +} + +pub fn (s &Scanner) error(msg string) { + println('$s.line_nr : $msg') + exit(1) +} + +pub fn verror(s string) { + println('V error: $s') + os.flush_stdout() + exit(1) +} + +pub fn vhash() string { + mut buf := [50]byte + buf[0] = 0 + C.snprintf(charptr(buf), 50, '%s', C.V_COMMIT_HASH) + return tos_clone(buf) +} + +pub fn cescaped_path(s string) string { + return s.replace('\\', '\\\\') // doubles every backslash +} + + + diff --git a/vlib/compiler2/scanner/scanner_test.v b/vlib/compiler2/scanner/scanner_test.v new file mode 100644 index 0000000000..8681251551 --- /dev/null +++ b/vlib/compiler2/scanner/scanner_test.v @@ -0,0 +1,30 @@ +// Copyright (c) 2019 Alexander Medvednikov. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module scanner + +import ( + compiler2.token +) + +fn test_scan() { + text := 'println(2 + 3)' + mut scanner := new_scanner(text) + mut tokens := []token.Token + for { + res := scanner.scan() + if res.tok == .eof { + break + } + tokens << res.tok + } + assert tokens.len == 6 + assert tokens[0] == .name + assert tokens[1] == .lpar + assert tokens[2] == .number + assert tokens[3] == .plus + assert tokens[4] == .number + assert tokens[5] == .rpar + +} + diff --git a/vlib/compiler2/token/token.v b/vlib/compiler2/token/token.v new file mode 100644 index 0000000000..ab6446b58c --- /dev/null +++ b/vlib/compiler2/token/token.v @@ -0,0 +1,305 @@ +// Copyright (c) 2019 Alexander Medvednikov. All rights reserved. 
+// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module token + +/* +struct Token { + tok TokenKind // the token number/enum; for quick comparisons + lit string // literal representation of the token + line_nr int // the line number in the source where the token occurred + //name_idx int // name table index for O(1) lookup + pos int // the position of the token in scanner text +} +*/ + +pub enum Token { + eof + name // user + number // 123 + str // 'foo' + str_inter // 'name=$user.name' + chartoken // `A` + plus + minus + mul + div + mod + xor // ^ + pipe // | + inc // ++ + dec // -- + and // && + logical_or // || + not // ! + bit_not // ~ + question // ? + comma + semicolon + colon + arrow // => + amp // & + hash // # + dollar // $ + str_dollar + left_shift // << + righ_shift // >> + // at // @ + assign // = + decl_assign // := + plus_assign // += + minus_assign // -= + div_assign // /= + mult_assign // *= + xor_assign // ^= + mod_assign // %= + or_assign // |= + and_assign // &= + righ_shift_assign // >>= + left_shift_assign // <<= + // {} () [] + lcbr + rcbr + lpar + rpar + lsbr + rsbr + // == != > < >= <= + eq + ne + gt + lt + ge + le + // comments + line_comment + mline_comment + nl + dot // . + dotdot // .. + ellipsis // ... + // keywords + keyword_beg + key_as + key_asm + key_assert + key_atomic + key_break + key_const + key_continue + key_defer + key_else + key_embed + key_enum + key_false + key_for + key_fn + key_global // __global + key_go + key_goto + key_if + key_import + key_import_const + key_in + key_interface + // key_it + key_match + key_module + key_mut + key_none + key_return + key_select + key_sizeof + key_offsetof // __offsetof + key_struct + key_switch + key_true + key_type + // typeof + key_orelse // or + key_union + key_pub + key_static + key_unsafe + keyword_end +} + +const ( + assign_tokens = [Token.assign, .plus_assign, .minus_assign, .mult_assign, + .div_assign, .xor_assign, .mod_assign, .or_assign, .and_assign, + .righ_shift_assign, .left_shift_assign] + + nr_tokens = 141 // length of the table built by build_token_str() +) + + +// build_keys generates a map with keywords' 
string values: +// keywords['return'] == int(Token.key_return) +// Only the names between keyword_beg and keyword_end are registered. +fn build_keys() map[string]int { + mut res := map[string]int + for t := int(Token.keyword_beg) + 1; t < int(Token.keyword_end); t++ { + key := token_str[t] + res[key] = t + } + return res +} + +// build_token_str returns the table that maps every Token value to its text. +// Slots that are never assigned below keep the empty string. +// TODO remove once we have `enum Token { name('name') if('if') ... }` +fn build_token_str() []string { + mut s := [''].repeat(nr_tokens) + s[Token.keyword_beg] = '' + s[Token.keyword_end] = '' + s[Token.eof] = 'eof' + s[Token.name] = 'name' + s[Token.number] = 'number' + s[Token.str] = 'STR' + s[Token.chartoken] = 'char' + s[Token.plus] = '+' + s[Token.minus] = '-' + s[Token.mul] = '*' + s[Token.div] = '/' + s[Token.mod] = '%' + s[Token.xor] = '^' + s[Token.bit_not] = '~' + s[Token.pipe] = '|' + s[Token.hash] = '#' + s[Token.amp] = '&' + s[Token.inc] = '++' + s[Token.dec] = '--' + s[Token.and] = '&&' + s[Token.logical_or] = '||' + s[Token.not] = '!' + s[Token.dot] = '.' + s[Token.dotdot] = '..' + s[Token.ellipsis] = '...' + s[Token.comma] = ',' + // s[Token.at] = '@' + s[Token.semicolon] = ';' + s[Token.colon] = ':' + s[Token.arrow] = '=>' + s[Token.assign] = '=' + s[Token.decl_assign] = ':=' + s[Token.plus_assign] = '+=' + s[Token.minus_assign] = '-=' + s[Token.mult_assign] = '*=' + s[Token.div_assign] = '/=' + s[Token.xor_assign] = '^=' + s[Token.mod_assign] = '%=' + s[Token.or_assign] = '|=' + s[Token.and_assign] = '&=' + s[Token.righ_shift_assign] = '>>=' + s[Token.left_shift_assign] = '<<=' + s[Token.lcbr] = '{' + s[Token.rcbr] = '}' + s[Token.lpar] = '(' + s[Token.rpar] = ')' + s[Token.lsbr] = '[' + s[Token.rsbr] = ']' + s[Token.eq] = '==' + s[Token.ne] = '!=' + s[Token.gt] = '>' + s[Token.lt] = '<' + s[Token.ge] = '>=' + s[Token.le] = '<=' + s[Token.question] = '?' 
+ s[Token.left_shift] = '<<' + s[Token.righ_shift] = '>>' + s[Token.line_comment] = '// line comment' + s[Token.mline_comment] = '/* mline comment */' + s[Token.nl] = 'NLL' + s[Token.dollar] = '$' + s[Token.str_dollar] = '$2' + s[Token.key_assert] = 'assert' + s[Token.key_struct] = 'struct' + s[Token.key_if] = 'if' + // s[Token.key_it] = 'it' + s[Token.key_else] = 'else' + s[Token.key_asm] = 'asm' + s[Token.key_return] = 'return' + s[Token.key_module] = 'module' + s[Token.key_sizeof] = 'sizeof' + s[Token.key_go] = 'go' + s[Token.key_goto] = 'goto' + s[Token.key_const] = 'const' + s[Token.key_mut] = 'mut' + s[Token.key_type] = 'type' + s[Token.key_for] = 'for' + s[Token.key_switch] = 'switch' + s[Token.key_fn] = 'fn' + s[Token.key_true] = 'true' + s[Token.key_false] = 'false' + s[Token.key_continue] = 'continue' + s[Token.key_break] = 'break' + s[Token.key_import] = 'import' + s[Token.key_embed] = 'embed' + s[Token.key_unsafe] = 'unsafe' + // Tokens[key_typeof] = 'typeof' + s[Token.key_enum] = 'enum' + s[Token.key_interface] = 'interface' + s[Token.key_pub] = 'pub' + s[Token.key_import_const] = 'import_const' + s[Token.key_in] = 'in' + s[Token.key_atomic] = 'atomic' + s[Token.key_orelse] = 'or' + s[Token.key_global] = '__global' + s[Token.key_union] = 'union' + s[Token.key_static] = 'static' + s[Token.key_as] = 'as' + s[Token.key_defer] = 'defer' + s[Token.key_match] = 'match' + s[Token.key_select] = 'select' + s[Token.key_none] = 'none' + s[Token.key_offsetof] = '__offsetof' + return s +} + +const ( + token_str = build_token_str() + keywords = build_keys() +) + +// key_to_token returns the keyword token named by `key`. +// A name that is not in `keywords` maps to 0, i.e. Token.eof. +pub fn key_to_token(key string) Token { + a := Token(keywords[key]) + return a +} + +// is_key reports whether `key` is registered in `keywords`. +pub fn is_key(key string) bool { + return int(key_to_token(key)) > 0 +} + +// is_decl reports whether `t` is one of the declaration keywords listed +// below; .eof is accepted as well. +pub fn is_decl(t Token) bool { + return t in [.key_enum, +.key_interface, .key_fn, .key_struct, .key_type, .key_const, .key_import_const, +.key_pub, .eof] +} + +// is_assign reports whether `t` is one of the `assign_tokens`. +fn (t Token) is_assign() bool { + return t in assign_tokens +} + +// contains reports whether `val` is an element of `t`. +fn (t []Token) 
contains(val Token) bool { + for tt in t { + if tt == val { + return true + } + } + return false +} + +// str returns the text stored for `t` in `token_str`: the operator or keyword +// spelling, or a tag such as `name`, `number` or `STR` for value tokens. +// The enum holds no literal, so none can be printed from here. +pub fn (t Token) str() string { + return token_str[int(t)] +} +