v.token: use a more performant KeywordMatcher in the scanner and parser (#15196)

2023-08-10 21:13:21 +03:00 · 2022-07-25 06:32:06 +03:00 · 2022-07-25 06:32:06 +03:00 · 6a5db0df61
commit 6a5db0df61
parent 297cb5f89c
9 changed files with 227 additions and 111 deletions
--- a/vlib/v/ast/ast.v
+++ b/vlib/v/ast/ast.v
@ -2276,11 +2276,3 @@ pub fn type_can_start_with_token(tok &token.Token) bool {
 	}
 	return false
 }
-
-fn build_builtin_type_names_matcher() token.KeywordsMatcher {
-	mut m := map[string]int{}
-	for i, name in builtin_type_names {
-		m[name] = i
-	}
-	return token.new_keywords_matcher<int>(m)
-}
--- a/vlib/v/ast/types.v
+++ b/vlib/v/ast/types.v
@ -13,6 +13,7 @@ module ast

 import strings
 import v.pref
+import v.token

 pub type Type = int

@ -455,7 +456,7 @@ pub const builtin_type_names = ['void', 'voidptr', 'byteptr', 'charptr', 'i8', '
 	'isize', 'u8', 'u16', 'u32', 'u64', 'usize', 'f32', 'f64', 'char', 'bool', 'none', 'string',
 	'rune', 'array', 'map', 'chan', 'any', 'float_literal', 'int_literal', 'thread', 'Error', 'nil']

-pub const builtin_type_names_matcher = build_builtin_type_names_matcher()
+pub const builtin_type_names_matcher = token.new_keywords_matcher_from_array_trie(builtin_type_names)

 pub const (
 	integer_type_idxs          = [i8_type_idx, i16_type_idx, int_type_idx, i64_type_idx, u8_type_idx,
--- a/vlib/v/pref/default.v
+++ b/vlib/v/pref/default.v
@ -158,7 +158,7 @@ pub fn (mut p Preferences) fill_with_defaults() {
 	}

 	$if prealloc {
-		if !p.no_parallel {
+		if !p.no_parallel && p.is_verbose {
 			eprintln('disabling parallel cgen, since V was built with -prealloc')
 		}
 		p.no_parallel = true
--- a/vlib/v/scanner/scanner.v
+++ b/vlib/v/scanner/scanner.v
@ -665,7 +665,7 @@ fn (mut s Scanner) text_scan() token.Token {
 			// tmp hack to detect . in ${}
 			// Check if not .eof to prevent panic
 			next_char := s.look_ahead(1)
-			kind := token.matcher.find(name)
+			kind := token.scanner_matcher.find(name)
 			if kind != -1 {
 				return s.new_token(token.Kind(kind), name, name.len)
 			}
--- a/vlib/v/tests/bench/bench_compare_tokens.v
+++ b/vlib/v/tests/bench/bench_compare_tokens.v
@ -4,19 +4,21 @@ import benchmark
 const max_repetitions = 4_000_000

 fn main() {
-	km := token.new_keywords_matcher(token.keywords)
+	mut res := token.Kind{}
+	km_trie := token.new_keywords_matcher_trie(token.keywords)
 	for kw in ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else',
-		'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a'] {
-		mut res := token.Kind{}
+		'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a', 'assez', 'returned'] {
 		mut bmark := benchmark.start()
 		for _ in 0 .. max_repetitions {
 			res = token.keywords[kw]
 		}
 		bmark.measure('$max_repetitions repetitions of token.keywords["$kw"] = $res')
+
 		for _ in 0 .. max_repetitions {
-			res = km.find(kw)
+			res = token.Kind(km_trie.find(kw))
 		}
-		bmark.measure('$max_repetitions repetitions of km.find("$kw") = $res')
+		bmark.measure('$max_repetitions repetitions of km_trie.find("$kw") = $res')
+
 		println('--------------------------------')
 	}
 }
--- a/vlib/v/token/README.md
+++ b/vlib/v/token/README.md
@ -0,0 +1,57 @@
+## Description:
+
+`v.token` is a module providing the basic building blocks of the V
+syntax - the tokens, as well as utilities for working with them.
+
+## KeywordsMatcherTrie 
+KeywordsMatcherTrie provides a faster way of determining whether a given name is a reserved
+word (belongs to a given set of previously known words `R`). It works by exploiting the fact,
+that the set of reserved words is small, and the words short.
+
+KeywordsMatcherTrie uses an ordered set of [tries](https://en.wikipedia.org/wiki/Trie),
+one for each length of the added words, so that a word which is shorter or longer
+than all of the reserved ones, can be rejected in constant time, just by checking
+its length.
+
+After this initial check confirms that a word `w` of length `n` could belong to the
+trie `Tn`, responsible for all known reserved words of that length, `Tn` is used to
+further verify or reject the word quickly. In order to do so, `Tn` prepares in advance
+an array of all possible continuations (letters), at each index of the words `R`, after
+any given prefix, belonging to `R`.
+
+For example, if we have added the word `asm` to the trie T3, its tree (its nodes) may look
+like this (note that the 0 pointers in children, mean that there was no word in `R`, that had
+that corresponding letter at that specific index):
+```
+TrieNode 0:  a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
+| children:  1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | children[`a`] = 1 -> TrieNode 1
+|   prefix so far: ''    | value: 0                                  |
+|
+TrieNode 1:  a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
+| children:  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 ... | children[`s`] = 2 -> TrieNode 2
+|   prefix so far: 'a'   | value: 0                                  |
+|
+TrieNode 2:  a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
+| children:  0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | children[`m`] = 3 -> TrieNode 3
+|   prefix so far: 'as'  | value: 0                                  | Note: `as` is a keyword with length 2,
+|                                                                      but we are searching in T3 trie.
+|
+TrieNode 3:  a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
+| children:  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | all of children are 0
+|   prefix so far: 'asm' | value: int(token.Kind.asm)                |
+```
+
+Matching any given `word` in the trie, after you have prepared it, is then simple:
+just read each character of the `word`, and follow the corresponding pointer from
+the `children` array (indexed by character). When the pointer is nil, there was NO
+match, and the word is rejected, which happens very often, and early for most words
+that are not in the set of the previously added reserved words. One significant 
+benefit compared to just comparing the checked `word` against a linear list of all
+known words, is that once you have found that a word is not a match at any given
+level/trie node, then you know that it is not a match to *any* of them.
+
+Note: benchmarking shows that it is ~300% to 400% faster, compared to just using 
+`token.keywords[name]` on average, when there is a match, but it can be 17x faster
+in the case, where there is a length mismatch. After changes to KeywordsMatcherTrie,
+please do `v -prod run vlib/v/tests/bench/bench_compare_tokens.v` to verify, 
+that there is no performance regression.
--- a/vlib/v/token/keywords_matcher.v
+++ b/vlib/v/token/keywords_matcher.v
@ -1,92 +0,0 @@
-module token
-
-// bump token.max_keyword_len, if you add a longer keyword
-const max_keyword_len = 20
-
-// KeywordsMatcher provides a faster way of determinining whether a given name
-// is a reserved word, by doing a comparison with only the keywords that
-// have exactly the same length as `name`.
-// Benchmarking shows that with -prod, it is 20-25% slower in the worst case
-// compared to just using token.keywords[name], but can be 20x faster
-// in the case, where there is a length mismatch, and 2x-3x faster in most
-// cases, where there is a match.
-// Without -prod, with tcc, using KeywordsMatcher is always faster
-// (2x to 14x times), compared to using a hash of all the keywords.
-pub struct KeywordsMatcher {
-mut:
-	len_min int = 9999
-	len_max int = -1
-	words   [max_keyword_len][]WIndex
-}
-
-struct WIndex {
-mut:
-	word  string
-	index int
-}
-
-pub fn new_keywords_matcher<T>(kw_map map[string]T) KeywordsMatcher {
-	mut km := KeywordsMatcher{}
-	// TODO: remove this loop. It is currently needed, because a
-	// fixed array of arrays is not initialised properly automatically
-	// as of 2021/10/28
-	for i in 0 .. token.max_keyword_len {
-		km.words[i] = []WIndex{}
-	}
-	for k, v in kw_map {
-		km.add_word(k, int(v))
-	}
-	for i in 0 .. token.max_keyword_len {
-		if km.words[i].len > 0 {
-			km.words[i].sort(a.word < b.word)
-			$if trace_keyword_matcher_initialisation ? {
-				print('word len: ${i:3} | words: ')
-				for w in km.words[i] {
-					print('$w.word, ')
-				}
-				println('')
-			}
-		}
-	}
-	return km
-}
-
-fn (mut km KeywordsMatcher) add_word(word string, kind int) {
-	if word.len >= token.max_keyword_len {
-		panic('increase max_keyword_len to > $word.len')
-	}
-	if km.len_max < word.len {
-		km.len_max = word.len
-	}
-	if word.len < km.len_min {
-		km.len_min = word.len
-	}
-	km.words[word.len] << WIndex{word, kind}
-}
-
-// find returns the int index, given a word, by doing a binary search
-// on the sorted list of words for each bin
-[direct_array_access]
-pub fn (km &KeywordsMatcher) find(word string) int {
-	wlen := word.len
-	if wlen < km.len_min || wlen > km.len_max {
-		return -1
-	}
-	list_len := km.words[wlen].len
-	if list_len == 0 {
-		return -1
-	}
-	mut lo := 0
-	mut hi := list_len - 1
-	for lo <= hi {
-		mid := lo + (hi - lo) / 2
-		cmp := km.words[wlen][mid].word.compare(word)
-		match cmp {
-			0 { return km.words[wlen][mid].index }
-			-1 { lo = mid + 1 }
-			1 { hi = mid - 1 }
-			else {}
-		}
-	}
-	return -1
-}
--- a/vlib/v/token/keywords_matcher_trie.v
+++ b/vlib/v/token/keywords_matcher_trie.v
@ -0,0 +1,152 @@
+module token
+
+// KeywordsMatcherTrie provides a faster way of determining whether a given name
+// is a reserved word (belongs to a given set of previously known words `R`).
+// See the module description for more details.
+[heap]
+pub struct KeywordsMatcherTrie {
+mut:
+	nodes   []&TrieNode // indexed by word length: nodes[n] is the root of the trie for the added words with n bytes
+	min_len int = 999999 // length of the shortest added word; starts large, so that `find` rejects every word before the first add_word
+	max_len int // length of the longest added word
+}
+
+// TrieNode is a single node from a trie, used by KeywordsMatcherTrie
+pub struct TrieNode {
+mut:
+	children [123]&TrieNode // indexed by the byte value of the next character; 123 slots cover the bytes 0..122 (up to `z`)
+	value    int // when positive, it is a leaf node representing a match
+}
+
+// find tries to find the given `word` in the set of all previously added words
+// to the KeywordsMatcherTrie instance. It returns -1 if the word was NOT found
+// there at all. If the word was found, find will return the `value` (value >= 0),
+// associated with the word, when it was added.
+// Note: 0 is a valid result (for example the index of the first name, passed to
+// new_keywords_matcher_from_array_trie), so compare the result with -1, not with 0.
+[direct_array_access]
+pub fn (km &KeywordsMatcherTrie) find(word string) int {
+	wlen := word.len
+	if wlen < km.min_len || wlen > km.max_len {
+		return -1
+	}
+	mut node := unsafe { &TrieNode(km.nodes[wlen]) }
+	if node == unsafe { nil } {
+		return -1
+	}
+	// All the words in the trie `km.nodes[wlen]` have exactly `wlen` bytes, so a node
+	// reached after following `wlen` links always ends an added word. That is why the
+	// walk is done here, instead of delegating to TrieNode.find, which can not tell
+	// a stored value of 0 apart from the default value of a node.
+	for idx in 0 .. wlen {
+		c := word[idx]
+		// Bounds checks are disabled for this function, so bytes that are outside of
+		// the children table (above `z`, including all non ASCII ones) must be
+		// rejected explicitly.
+		if int(c) >= node.children.len {
+			return -1
+		}
+		child := node.children[c]
+		if child == unsafe { nil } {
+			return -1
+		}
+		node = child
+	}
+	if node.value < 0 {
+		return -1
+	}
+	return node.value
+}
+
+// add_word adds the given word to the KeywordsMatcherTrie instance. It associates a non
+// negative integer value to it, so later `find` could return the value, when it succeeds.
+// The list of per length tries is extended on demand, so words of any length can be added.
+[direct_array_access]
+pub fn (mut km KeywordsMatcherTrie) add_word(word string, value int) {
+	wlen := word.len
+	// Bounds checks are disabled for this function, so make sure that there is a slot
+	// for words of this length, before `km.nodes` is indexed with it.
+	for km.nodes.len <= wlen {
+		km.nodes << &TrieNode(0)
+	}
+	if km.max_len < wlen {
+		km.max_len = wlen
+	}
+	if km.min_len > wlen {
+		km.min_len = wlen
+	}
+	// the trie for this word length is created lazily, on the first word that needs it
+	if km.nodes[wlen] == unsafe { nil } {
+		km.nodes[wlen] = new_trie_node()
+	}
+	km.nodes[wlen].add_word(word, value, 0)
+}
+
+// new_keywords_matcher_trie builds a KeywordsMatcherTrie from the entries of `kw_map`.
+// Each key is added as a word, together with its (integer or enum) value.
+pub fn new_keywords_matcher_trie<T>(kw_map map[string]T) KeywordsMatcherTrie {
+	// one (initially nil) root slot for each of the word lengths 0..19
+	slots := 20
+	mut matcher := KeywordsMatcherTrie{
+		nodes: []&TrieNode{cap: slots}
+	}
+	for matcher.nodes.len < slots {
+		matcher.nodes << &TrieNode(0)
+	}
+	for word, kind in kw_map {
+		matcher.add_word(word, kind)
+	}
+	return matcher
+}
+
+// new_keywords_matcher_from_array_trie builds a KeywordsMatcherTrie from the strings in `names`.
+// Each string is associated with its index in `names`.
+pub fn new_keywords_matcher_from_array_trie(names []string) KeywordsMatcherTrie {
+	mut index_of := map[string]int{}
+	for idx in 0 .. names.len {
+		index_of[names[idx]] = idx
+	}
+	return new_keywords_matcher_trie<int>(index_of)
+}
+
+//
+
+// new_trie_node creates a new TrieNode instance, with the default values for all of
+// its fields, and returns a reference to it.
+pub fn new_trie_node() &TrieNode {
+	return &TrieNode{}
+}
+
+// show prints a one line summary of `node` to stderr (its nesting `level`, its value, and
+// how many of its children are set), then does the same for each of those children,
+// one level deeper.
+pub fn (node &TrieNode) show(level int) {
+	mut used_slots := 0
+	for child in node.children {
+		if child == unsafe { nil } {
+			continue
+		}
+		used_slots++
+	}
+	eprintln('> level: ${level:2} | value: ${node.value:12} | non_nil_children: ${used_slots:2}')
+	for child in node.children {
+		if child == unsafe { nil } {
+			continue
+		}
+		child.show(level + 1)
+	}
+}
+
+// add_word stores the `word` and `value` pair in the trie below `node` (recursively).
+// `word_idx` is the position in `word`, that `node` corresponds to. It is just an
+// accumulator for the recursion: pass 0 for it, when calling add_word on the root.
+pub fn (mut node TrieNode) add_word(word string, value int, word_idx int) {
+	if word_idx < 0 || word_idx >= word.len {
+		// there are no characters left => this node is the one, that ends the word
+		node.value = value
+		return
+	}
+	ch := word[word_idx]
+	mut next := node.children[ch]
+	if next == unsafe { nil } {
+		next = new_trie_node()
+		node.children[ch] = next
+	}
+	next.add_word(word, value, word_idx + 1)
+}
+
+// find tries to find a match for `word` to the trie (the set of all previously added words).
+// It returns -1 if there is no match, or the value associated with the previously added
+// matching word by `add_word`. Only positive values are reported as matches: a node with
+// a value of 0 is treated as one, that does not end a word.
+[direct_array_access]
+pub fn (root &TrieNode) find(word string) int {
+	wlen := word.len
+	mut node := unsafe { &TrieNode(root) }
+	mut idx := 0
+	for idx < wlen {
+		// eprintln('> match_keyword: `${word:20}` | node: ${ptr_str(node)} | idx: ${idx:3}')
+		c := word[idx]
+		// Bounds checks are disabled for this function, so bytes that are outside of
+		// the children table (above `z`, including all non ASCII ones) must be
+		// rejected explicitly. No added word can be reached through them anyway.
+		if int(c) >= node.children.len {
+			return -1
+		}
+		child := node.children[c]
+		if child == unsafe { nil } {
+			return -1
+		}
+		node = child
+		idx++
+	}
+	// all the characters of `word` were consumed; it is a match only if a word ends here
+	k := node.value
+	if k > 0 {
+		// node.show(0)
+		return k
+	}
+	return -1
+}
--- a/vlib/v/token/token.v
+++ b/vlib/v/token/token.v
@ -186,10 +186,10 @@ pub const (
 	token_str       = build_token_str()

 	keywords        = build_keys()
-
-	matcher         = new_keywords_matcher<Kind>(keywords)
 )

+pub const scanner_matcher = new_keywords_matcher_trie<Kind>(keywords)
+
 // build_keys genereates a map with keywords' string values:
 // Keywords['return'] == .key_return
 fn build_keys() map[string]Kind {
@ -351,7 +351,11 @@ pub fn (t Kind) is_assign() bool {
 // note: used for some code generation, so no quoting
 [inline]
 pub fn (t Kind) str() string {
-	return token.token_str[int(t)]
+	idx := int(t)
+	if idx < 0 || token.token_str.len <= idx {
+		return 'unknown'
+	}
+	return token.token_str[idx]
 }

 pub fn (t Token) str() string {