diff --git a/vlib/v/ast/ast.v b/vlib/v/ast/ast.v index 49a11e0019..b1654381ac 100644 --- a/vlib/v/ast/ast.v +++ b/vlib/v/ast/ast.v @@ -2276,11 +2276,3 @@ pub fn type_can_start_with_token(tok &token.Token) bool { } return false } - -fn build_builtin_type_names_matcher() token.KeywordsMatcher { - mut m := map[string]int{} - for i, name in builtin_type_names { - m[name] = i - } - return token.new_keywords_matcher(m) -} diff --git a/vlib/v/ast/types.v b/vlib/v/ast/types.v index dedf336e27..9ae69217f5 100644 --- a/vlib/v/ast/types.v +++ b/vlib/v/ast/types.v @@ -13,6 +13,7 @@ module ast import strings import v.pref +import v.token pub type Type = int @@ -455,7 +456,7 @@ pub const builtin_type_names = ['void', 'voidptr', 'byteptr', 'charptr', 'i8', ' 'isize', 'u8', 'u16', 'u32', 'u64', 'usize', 'f32', 'f64', 'char', 'bool', 'none', 'string', 'rune', 'array', 'map', 'chan', 'any', 'float_literal', 'int_literal', 'thread', 'Error', 'nil'] -pub const builtin_type_names_matcher = build_builtin_type_names_matcher() +pub const builtin_type_names_matcher = token.new_keywords_matcher_from_array_trie(builtin_type_names) pub const ( integer_type_idxs = [i8_type_idx, i16_type_idx, int_type_idx, i64_type_idx, u8_type_idx, diff --git a/vlib/v/pref/default.v b/vlib/v/pref/default.v index 0d89451a6a..4c5cdc4c29 100644 --- a/vlib/v/pref/default.v +++ b/vlib/v/pref/default.v @@ -158,7 +158,7 @@ pub fn (mut p Preferences) fill_with_defaults() { } $if prealloc { - if !p.no_parallel { + if !p.no_parallel && p.is_verbose { eprintln('disabling parallel cgen, since V was built with -prealloc') } p.no_parallel = true diff --git a/vlib/v/scanner/scanner.v b/vlib/v/scanner/scanner.v index 130c9eab71..e30c15cb3c 100644 --- a/vlib/v/scanner/scanner.v +++ b/vlib/v/scanner/scanner.v @@ -665,7 +665,7 @@ fn (mut s Scanner) text_scan() token.Token { // tmp hack to detect . 
in ${} // Check if not .eof to prevent panic next_char := s.look_ahead(1) - kind := token.matcher.find(name) + kind := token.scanner_matcher.find(name) if kind != -1 { return s.new_token(token.Kind(kind), name, name.len) } diff --git a/vlib/v/tests/bench/bench_compare_tokens.v b/vlib/v/tests/bench/bench_compare_tokens.v index 5bb3d1ec1c..b9727babb0 100644 --- a/vlib/v/tests/bench/bench_compare_tokens.v +++ b/vlib/v/tests/bench/bench_compare_tokens.v @@ -4,19 +4,21 @@ import benchmark const max_repetitions = 4_000_000 fn main() { - km := token.new_keywords_matcher(token.keywords) + mut res := token.Kind{} + km_trie := token.new_keywords_matcher_trie(token.keywords) for kw in ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else', - 'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a'] { - mut res := token.Kind{} + 'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a', 'assez', 'returned'] { mut bmark := benchmark.start() for _ in 0 .. max_repetitions { res = token.keywords[kw] } bmark.measure('$max_repetitions repetitions of token.keywords["$kw"] = $res') + for _ in 0 .. max_repetitions { - res = km.find(kw) + res = token.Kind(km_trie.find(kw)) } - bmark.measure('$max_repetitions repetitions of km.find("$kw") = $res') + bmark.measure('$max_repetitions repetitions of km_trie.find("$kw") = $res') + println('--------------------------------') } } diff --git a/vlib/v/token/README.md b/vlib/v/token/README.md new file mode 100644 index 0000000000..2222645868 --- /dev/null +++ b/vlib/v/token/README.md @@ -0,0 +1,57 @@ +## Description: + +`v.token` is a module providing the basic building blocks of the V +syntax - the tokens, as well as utilities for working with them. + +## KeywordsMatcherTrie +KeywordsMatcherTrie provides a faster way of determining whether a given name is a reserved +word (belongs to a given set of previously known words `R`). 
It works by exploiting the fact +that the set of reserved words is small, and the words short. + +KeywordsMatcherTrie uses an ordered set of [tries](https://en.wikipedia.org/wiki/Trie), +one per each word length, that was added, so that rejecting that something is a reserved +word can be done in constant time for words smaller or larger in length than all the +reserved ones. + +After a word `w` is confirmed by this initial check by length `n`, that it could belong +to a trie `Tn`, responsible for all known reserved words of that length, then `Tn` is used +to further verify or reject the word quickly. In order to do so, `Tn` prepares in advance +an array of all possible continuations (letters), at each index of the words `R`, after +any given prefix, belonging to `R`. + +For example, if we have added the word `asm` to the trie T3, its tree (its nodes) may look +like this (note that the 0 pointers in children, mean that there was no word in `R`, that had +that corresponding letter at that specific index): +``` +TrieNode 0: a b c d e f g h i j k l m n o p q r s t u v w x y z ... | +| children: 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | children[`a`] = 1 -> TrieNode 1 +| prefix so far: '' | value: 0 | +| +TrieNode 1: a b c d e f g h i j k l m n o p q r s t u v w x y z ... | +| children: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 ... | children[`s`] = 2 -> TrieNode 2 +| prefix so far: 'a' | value: 0 | +| +TrieNode 2: a b c d e f g h i j k l m n o p q r s t u v w x y z ... | +| children: 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | children[`m`] = 3 -> TrieNode 3 +| prefix so far: 'as' | value: 0 | Note: `as` is a keyword with length 2, +| but we are searching in T3 trie. +| +TrieNode 3: a b c d e f g h i j k l m n o p q r s t u v w x y z ... | +| children: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 
| all of children are 0 +| prefix so far: 'asm' | value: int(token.Kind.asm) | +``` + +Matching any given `word` in the trie, after you have prepared it, is then simple: +just read each character of the `word`, and follow the corresponding pointer from +the `children` array (indexed by character). When the pointer is nil, there was NO +match, and the word is rejected, which happens very often, and early for most words +that are not in the set of the previously added reserved words. One significant +benefit compared to just comparing the checked `word` against a linear list of all +known words, is that once you have found that a word is not a match at any given +level/trie node, then you know that it is not a match to *any* of them. + +Note: benchmarking shows that it is ~300% to 400% faster, compared to just using +`token.keywords[name]` on average, when there is a match, but it can be 17x faster +in the case, where there is a length mismatch. After changes to KeywordsMatcherTrie, +please do `v -prod run vlib/v/tests/bench/bench_compare_tokens.v` to verify, +that there is no performance regression. diff --git a/vlib/v/token/keywords_matcher.v b/vlib/v/token/keywords_matcher.v deleted file mode 100644 index 772d0ea85f..0000000000 --- a/vlib/v/token/keywords_matcher.v +++ /dev/null @@ -1,92 +0,0 @@ -module token - -// bump token.max_keyword_len, if you add a longer keyword -const max_keyword_len = 20 - -// KeywordsMatcher provides a faster way of determinining whether a given name -// is a reserved word, by doing a comparison with only the keywords that -// have exactly the same length as `name`. -// Benchmarking shows that with -prod, it is 20-25% slower in the worst case -// compared to just using token.keywords[name], but can be 20x faster -// in the case, where there is a length mismatch, and 2x-3x faster in most -// cases, where there is a match. 
-// Without -prod, with tcc, using KeywordsMatcher is always faster -// (2x to 14x times), compared to using a hash of all the keywords. -pub struct KeywordsMatcher { -mut: - len_min int = 9999 - len_max int = -1 - words [max_keyword_len][]WIndex -} - -struct WIndex { -mut: - word string - index int -} - -pub fn new_keywords_matcher(kw_map map[string]T) KeywordsMatcher { - mut km := KeywordsMatcher{} - // TODO: remove this loop. It is currently needed, because a - // fixed array of arrays is not initialised properly automatically - // as of 2021/10/28 - for i in 0 .. token.max_keyword_len { - km.words[i] = []WIndex{} - } - for k, v in kw_map { - km.add_word(k, int(v)) - } - for i in 0 .. token.max_keyword_len { - if km.words[i].len > 0 { - km.words[i].sort(a.word < b.word) - $if trace_keyword_matcher_initialisation ? { - print('word len: ${i:3} | words: ') - for w in km.words[i] { - print('$w.word, ') - } - println('') - } - } - } - return km -} - -fn (mut km KeywordsMatcher) add_word(word string, kind int) { - if word.len >= token.max_keyword_len { - panic('increase max_keyword_len to > $word.len') - } - if km.len_max < word.len { - km.len_max = word.len - } - if word.len < km.len_min { - km.len_min = word.len - } - km.words[word.len] << WIndex{word, kind} -} - -// find returns the int index, given a word, by doing a binary search -// on the sorted list of words for each bin -[direct_array_access] -pub fn (km &KeywordsMatcher) find(word string) int { - wlen := word.len - if wlen < km.len_min || wlen > km.len_max { - return -1 - } - list_len := km.words[wlen].len - if list_len == 0 { - return -1 - } - mut lo := 0 - mut hi := list_len - 1 - for lo <= hi { - mid := lo + (hi - lo) / 2 - cmp := km.words[wlen][mid].word.compare(word) - match cmp { - 0 { return km.words[wlen][mid].index } - -1 { lo = mid + 1 } - 1 { hi = mid - 1 } - else {} - } - } - return -1 -} diff --git a/vlib/v/token/keywords_matcher_trie.v b/vlib/v/token/keywords_matcher_trie.v new file mode 100644 
index 0000000000..2c7484126a --- /dev/null +++ b/vlib/v/token/keywords_matcher_trie.v @@ -0,0 +1,152 @@ +module token + +// KeywordsMatcherTrie provides a faster way of determining whether a given name +// is a reserved word (belongs to a given set of previously known words `R`). +// See the module description for more details. +[heap] +pub struct KeywordsMatcherTrie { +mut: + nodes []&TrieNode + min_len int = 999999 + max_len int +} + +// TrieNode is a single node from a trie, used by KeywordsMatcherTrie +pub struct TrieNode { +mut: + children [123]&TrieNode + value int // when positive, it is a leaf node representing a match +} + +// find tries to find the given `word` in the set of all previously added words +// to the KeywordsMatcherTrie instance. It returns -1 if the word was NOT found +// there at all. If the word was found, find will return the `value` (value => 0), +// associated with the word, when it was added. +[direct_array_access] +pub fn (km &KeywordsMatcherTrie) find(word string) int { + wlen := word.len + if wlen < km.min_len { + return -1 + } + if wlen > km.max_len { + return -1 + } + node := km.nodes[wlen] + if node == unsafe { nil } { + return -1 + } + return node.find(word) } + +// add_word adds the given word to the KeywordsMatcherTrie instance. It associates a non +// negative integer value to it, so later `find` could return the value, when it succeeds. +[direct_array_access] +pub fn (mut km KeywordsMatcherTrie) add_word(word string, value int) { + wlen := word.len + if km.max_len < wlen { + km.max_len = wlen + } + if km.min_len > wlen { + km.min_len = wlen + } + if km.nodes[wlen] == unsafe { nil } { + km.nodes[wlen] = new_trie_node() + } + km.nodes[wlen].add_word(word, value, 0) +} + +// new_keywords_matcher_trie creates a new KeywordsMatcherTrie instance from a given map +// with string keys, and integer or enum values. 
+pub fn new_keywords_matcher_trie(kw_map map[string]T) KeywordsMatcherTrie { + mut km := KeywordsMatcherTrie{ + nodes: []&TrieNode{cap: 20} + } + for _ in 0 .. 20 { + km.nodes << &TrieNode(0) + } + for k, v in kw_map { + km.add_word(k, v) + } + // dump(km.min_len) + // dump(km.max_len) + // for idx,x in km.nodes { if x != unsafe { nil } { eprintln('>> idx: $idx | ${ptr_str(x)}') } } + return km +} + +// new_keywords_matcher_from_array_trie creates a new KeywordsMatcherTrie instance from a given array +// of strings. The values for the strings, that `find` will return, will be the indexes in that array. +pub fn new_keywords_matcher_from_array_trie(names []string) KeywordsMatcherTrie { + mut m := map[string]int{} + for i, name in names { + m[name] = i + } + return new_keywords_matcher_trie(m) +} + +// + +// new_trie_node creates a new TrieNode instance +pub fn new_trie_node() &TrieNode { + return &TrieNode{} +} + +// show displays the information in `node`, in a more compact/readable format (recursively) +pub fn (node &TrieNode) show(level int) { + mut non_nil_children := 0 + for x in node.children { + if x != unsafe { nil } { + non_nil_children++ + } + } + eprintln('> level: ${level:2} | value: ${node.value:12} | non_nil_children: ${non_nil_children:2}') + for x in node.children { + if x != unsafe { nil } { + x.show(level + 1) + } + } +} + +// add_word adds another `word` and `value` pair into the trie, starting from `node` (recursively). +// `word_idx` is just used as an accumulator, and starts from 0 at the root of the tree. 
+pub fn (mut node TrieNode) add_word(word string, value int, word_idx int) { + first := u8(word[word_idx] or { + node.value = value + return + }) + // eprintln('>> node: ${ptr_str(node)} | first: $first | word_idx: $word_idx') + mut child_node := node.children[first] + if child_node == unsafe { nil } { + child_node = new_trie_node() + node.children[first] = child_node + } + child_node.add_word(word, value, word_idx + 1) +} + +// find tries to find a match for `word` to the trie (the set of all previously added words). +// It returns -1 if there is no match, or the value associated with the previously added +// matching word by `add_word`. +[direct_array_access] +pub fn (root &TrieNode) find(word string) int { + wlen := word.len + mut node := unsafe { &TrieNode(root) } + mut idx := 0 + for { + // eprintln('> match_keyword: `${word:20}` | node: ${ptr_str(node)} | idx: ${idx:3}') + if idx == wlen { + k := node.value + if k > 0 { + // node.show(0) + return k + } + return -1 + } + c := word[idx] + child := node.children[c] + if child == unsafe { nil } { + return -1 + } + node = child + idx++ + } + return -1 +} diff --git a/vlib/v/token/token.v b/vlib/v/token/token.v index 625b128e97..bde02d6e70 100644 --- a/vlib/v/token/token.v +++ b/vlib/v/token/token.v @@ -186,10 +186,10 @@ pub const ( token_str = build_token_str() keywords = build_keys() - - matcher = new_keywords_matcher(keywords) ) +pub const scanner_matcher = new_keywords_matcher_trie(keywords) + // build_keys genereates a map with keywords' string values: // Keywords['return'] == .key_return fn build_keys() map[string]Kind { @@ -351,7 +351,11 @@ pub fn (t Kind) is_assign() bool { // note: used for some code generation, so no quoting [inline] pub fn (t Kind) str() string { - return token.token_str[int(t)] + idx := int(t) + if idx < 0 || token.token_str.len <= idx { + return 'unknown' + } + return token.token_str[idx] } pub fn (t Token) str() string {