mirror of
https://github.com/vlang/v.git
synced 2023-08-10 21:13:21 +03:00
v.token: use a more performant KeywordMatcher in the scanner and parser (#15196)
This commit is contained in:
parent
297cb5f89c
commit
6a5db0df61
@ -2276,11 +2276,3 @@ pub fn type_can_start_with_token(tok &token.Token) bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
fn build_builtin_type_names_matcher() token.KeywordsMatcher {
|
||||
mut m := map[string]int{}
|
||||
for i, name in builtin_type_names {
|
||||
m[name] = i
|
||||
}
|
||||
return token.new_keywords_matcher<int>(m)
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ module ast
|
||||
|
||||
import strings
|
||||
import v.pref
|
||||
import v.token
|
||||
|
||||
pub type Type = int
|
||||
|
||||
@ -455,7 +456,7 @@ pub const builtin_type_names = ['void', 'voidptr', 'byteptr', 'charptr', 'i8', '
|
||||
'isize', 'u8', 'u16', 'u32', 'u64', 'usize', 'f32', 'f64', 'char', 'bool', 'none', 'string',
|
||||
'rune', 'array', 'map', 'chan', 'any', 'float_literal', 'int_literal', 'thread', 'Error', 'nil']
|
||||
|
||||
pub const builtin_type_names_matcher = build_builtin_type_names_matcher()
|
||||
pub const builtin_type_names_matcher = token.new_keywords_matcher_from_array_trie(builtin_type_names)
|
||||
|
||||
pub const (
|
||||
integer_type_idxs = [i8_type_idx, i16_type_idx, int_type_idx, i64_type_idx, u8_type_idx,
|
||||
|
@ -158,7 +158,7 @@ pub fn (mut p Preferences) fill_with_defaults() {
|
||||
}
|
||||
|
||||
$if prealloc {
|
||||
if !p.no_parallel {
|
||||
if !p.no_parallel && p.is_verbose {
|
||||
eprintln('disabling parallel cgen, since V was built with -prealloc')
|
||||
}
|
||||
p.no_parallel = true
|
||||
|
@ -665,7 +665,7 @@ fn (mut s Scanner) text_scan() token.Token {
|
||||
// tmp hack to detect . in ${}
|
||||
// Check if not .eof to prevent panic
|
||||
next_char := s.look_ahead(1)
|
||||
kind := token.matcher.find(name)
|
||||
kind := token.scanner_matcher.find(name)
|
||||
if kind != -1 {
|
||||
return s.new_token(token.Kind(kind), name, name.len)
|
||||
}
|
||||
|
@ -4,19 +4,21 @@ import benchmark
|
||||
const max_repetitions = 4_000_000
|
||||
|
||||
fn main() {
|
||||
km := token.new_keywords_matcher(token.keywords)
|
||||
for kw in ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else',
|
||||
'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a'] {
|
||||
mut res := token.Kind{}
|
||||
km_trie := token.new_keywords_matcher_trie(token.keywords)
|
||||
for kw in ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else',
|
||||
'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a', 'assez', 'returned'] {
|
||||
mut bmark := benchmark.start()
|
||||
for _ in 0 .. max_repetitions {
|
||||
res = token.keywords[kw]
|
||||
}
|
||||
bmark.measure('$max_repetitions repetitions of token.keywords["$kw"] = $res')
|
||||
|
||||
for _ in 0 .. max_repetitions {
|
||||
res = km.find(kw)
|
||||
res = token.Kind(km_trie.find(kw))
|
||||
}
|
||||
bmark.measure('$max_repetitions repetitions of km.find("$kw") = $res')
|
||||
bmark.measure('$max_repetitions repetitions of km_trie.find("$kw") = $res')
|
||||
|
||||
println('--------------------------------')
|
||||
}
|
||||
}
|
||||
|
57
vlib/v/token/README.md
Normal file
57
vlib/v/token/README.md
Normal file
@ -0,0 +1,57 @@
|
||||
## Description:
|
||||
|
||||
`v.token` is a module providing the basic building blocks of the V
|
||||
syntax - the tokens, as well as utilities for working with them.
|
||||
|
||||
## KeywordsMatcherTrie
|
||||
KeywordsMatcherTrie provides a faster way of determining whether a given name is a reserved
|
||||
word (belongs to a given set of previously known words `R`). It works by exploiting the fact,
|
||||
that the set of reserved words is small, and the words short.
|
||||
|
||||
KeywordsMatcherTrie uses an ordered set of [tries](https://en.wikipedia.org/wiki/Trie),
|
||||
one per each word length, that was added, so that rejecting that something is a reserved
|
||||
word, can be done in constant time for words smaller or larger in length than all the
|
||||
reserved ones.
|
||||
|
||||
After this initial length check confirms that a word `w` of length `n` could belong
|
||||
to a trie `Tn`, responsible for all known reserved words of that length, then `Tn` is used
|
||||
to further verify or reject the word quickly. In order to do so, `Tn` prepares in advance
|
||||
an array of all possible continuations (letters), at each index of the words `R`, after
|
||||
any given prefix, belonging to `R`.
|
||||
|
||||
For example, if we have added the word `asm` to the trie T3, its tree (its nodes) may look
|
||||
like this (note that the 0 pointers in children, mean that there was no word in `R`, that had
|
||||
that corresponding letter at that specific index):
|
||||
```
|
||||
TrieNode 0: a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
|
||||
| children: 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | children[`a`] = 1 -> TrieNode 1
|
||||
| prefix so far: '' | value: 0 |
|
||||
|
|
||||
TrieNode 1: a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
|
||||
| children: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 ... | children[`s`] = 2 -> TrieNode 2
|
||||
| prefix so far: 'a' | value: 0 |
|
||||
|
|
||||
TrieNode 2: a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
|
||||
| children: 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | children[`m`] = 3 -> TrieNode 3
|
||||
| prefix so far: 'as' | value: 0 | Note: `as` is a keyword with length 2,
|
||||
| but we are searching in T3 trie.
|
||||
|
|
||||
TrieNode 3: a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
|
||||
| children: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | all of children are 0
|
||||
| prefix so far: 'asm' | value: int(token.Kind.asm) |
|
||||
```
|
||||
|
||||
Matching any given `word` in the trie, after you have prepared it, is then simple:
|
||||
just read each character of the `word`, and follow the corresponding pointer from
|
||||
the `children` array (indexed by character). When the pointer is nil, there was NO
|
||||
match, and the word is rejected, which happens very often, and early for most words
|
||||
that are not in the set of the previously added reserved words. One significant
|
||||
benefit compared to just comparing the checked `word` against a linear list of all
|
||||
known words, is that once you have found that a word is not a match at any given
|
||||
level/trie node, then you know that it is not a match to *any* of them.
|
||||
|
||||
Note: benchmarking shows that it is ~300% to 400% faster, compared to just using
|
||||
`token.keywords[name]` on average, when there is a match, but it can be 17x faster
|
||||
in the case, where there is a length mismatch. After changes to KeywordsMatcherTrie,
|
||||
please do `v -prod run vlib/v/tests/bench/bench_compare_tokens.v` to verify,
|
||||
that there is no performance regression.
|
@ -1,92 +0,0 @@
|
||||
module token
|
||||
|
||||
// bump token.max_keyword_len, if you add a longer keyword
|
||||
const max_keyword_len = 20
|
||||
|
||||
// KeywordsMatcher provides a faster way of determining whether a given name
|
||||
// is a reserved word, by doing a comparison with only the keywords that
|
||||
// have exactly the same length as `name`.
|
||||
// Benchmarking shows that with -prod, it is 20-25% slower in the worst case
|
||||
// compared to just using token.keywords[name], but can be 20x faster
|
||||
// in the case, where there is a length mismatch, and 2x-3x faster in most
|
||||
// cases, where there is a match.
|
||||
// Without -prod, with tcc, using KeywordsMatcher is always faster
|
||||
// (2x to 14x times), compared to using a hash of all the keywords.
|
||||
pub struct KeywordsMatcher {
|
||||
mut:
|
||||
len_min int = 9999 // length of the shortest added word; lowered by `add_word`
|
||||
len_max int = -1 // length of the longest added word; raised by `add_word`
|
||||
words [max_keyword_len][]WIndex // one bin per word length, indexed by `word.len`; each bin is sorted by `word` in `new_keywords_matcher`
|
||||
}
|
||||
|
||||
// WIndex is a single entry in one of the length bins of KeywordsMatcher.
struct WIndex {
|
||||
mut:
|
||||
word string // the stored word; the bins are sorted and searched by it
|
||||
index int // the integer that `find` returns, when `word` matches
|
||||
}
|
||||
|
||||
// new_keywords_matcher creates a KeywordsMatcher from a map of words to values.
// Each value is converted to an int, and stored in the bin for its word's length.
// After all the words are added, every non empty bin is sorted by word, so that
// `find` can do a binary search in it.
pub fn new_keywords_matcher<T>(kw_map map[string]T) KeywordsMatcher {
	mut matcher := KeywordsMatcher{}
	// TODO: remove this loop. It is currently needed, because a
	// fixed array of arrays is not initialised properly automatically
	// as of 2021/10/28
	for bin in 0 .. token.max_keyword_len {
		matcher.words[bin] = []WIndex{}
	}
	for word, kind in kw_map {
		matcher.add_word(word, int(kind))
	}
	for i in 0 .. token.max_keyword_len {
		if matcher.words[i].len == 0 {
			continue
		}
		matcher.words[i].sort(a.word < b.word)
		// compile with `-d trace_keyword_matcher_initialisation` to dump the sorted bins
		$if trace_keyword_matcher_initialisation ? {
			print('word len: ${i:3} | words: ')
			for w in matcher.words[i] {
				print('$w.word, ')
			}
			println('')
		}
	}
	return matcher
}
|
||||
|
||||
// add_word stores `word` with its associated `kind` in the bin for words of the
// same length, and widens the [len_min, len_max] range to include that length.
// It panics, when the word does not fit in the fixed number of bins.
fn (mut km KeywordsMatcher) add_word(word string, kind int) {
	wlen := word.len
	if wlen >= token.max_keyword_len {
		panic('increase max_keyword_len to > $word.len')
	}
	if wlen < km.len_min {
		km.len_min = wlen
	}
	if wlen > km.len_max {
		km.len_max = wlen
	}
	km.words[wlen] << WIndex{word, kind}
}
|
||||
|
||||
// find returns the int index stored for `word`, or -1 when the word is unknown.
// Words shorter or longer than every added word are rejected immediately.
// Otherwise a binary search is done in the sorted bin for words of that length.
[direct_array_access]
pub fn (km &KeywordsMatcher) find(word string) int {
	wlen := word.len
	if wlen < km.len_min || wlen > km.len_max {
		return -1
	}
	// search in the half open interval [lo, hi); an empty bin skips the loop
	mut lo := 0
	mut hi := km.words[wlen].len
	for lo < hi {
		mid := lo + (hi - lo) / 2
		cmp := km.words[wlen][mid].word.compare(word)
		if cmp == 0 {
			return km.words[wlen][mid].index
		}
		if cmp < 0 {
			// the stored word sorts before `word` => continue in the upper half
			lo = mid + 1
		} else {
			hi = mid
		}
	}
	return -1
}
|
152
vlib/v/token/keywords_matcher_trie.v
Normal file
152
vlib/v/token/keywords_matcher_trie.v
Normal file
@ -0,0 +1,152 @@
|
||||
module token
|
||||
|
||||
// KeywordsMatcherTrie provides a faster way of determining whether a given name
|
||||
// is a reserved word (belongs to a given set of previously known words `R`).
|
||||
// See the module description for more details.
|
||||
[heap]
|
||||
pub struct KeywordsMatcherTrie {
|
||||
mut:
|
||||
nodes []&TrieNode // root nodes of the tries, indexed by word length; nil when no word of that length was added
|
||||
min_len int = 999999 // length of the shortest added word; `find` rejects shorter words right away
|
||||
max_len int // length of the longest added word; `find` rejects longer words right away
|
||||
}
|
||||
|
||||
// TrieNode is a single node from a trie, used by KeywordsMatcherTrie
|
||||
pub struct TrieNode {
|
||||
mut:
|
||||
children [123]&TrieNode // links to the next nodes, indexed by the byte value of the next character (0..122, `z` is 122); nil means no added word continues with that byte
|
||||
value int // when positive, it is a leaf node representing a match
|
||||
}
|
||||
|
||||
// find tries to find the given `word` in the set of all previously added words
// to the KeywordsMatcherTrie instance. It returns -1 if the word was NOT found
// there at all. Words shorter than the shortest, or longer than the longest added
// word, are rejected without looking at their content. Otherwise the lookup is
// delegated to the trie for words of that length, and its result is returned as is.
// Note: `TrieNode.find` only reports positive values as matches.
[direct_array_access]
pub fn (km &KeywordsMatcherTrie) find(word string) int {
	wlen := word.len
	if wlen < km.min_len || wlen > km.max_len {
		return -1
	}
	root := km.nodes[wlen]
	if root != unsafe { nil } {
		return root.find(word)
	}
	// no word of this length was ever added
	return -1
}
|
||||
|
||||
// add_word adds the given word to the KeywordsMatcherTrie instance. It associates an
// integer value to it, so later `find` could return the value, when it succeeds.
// Note: `TrieNode.find` only reports positive values as matches, so use values > 0.
// The word is stored in the trie for words of the same length (`km.nodes[word.len]`),
// which is created on first use. `km.nodes` is grown when needed, so words of any
// length can be added, even to an instance that was not made by a constructor.
[direct_array_access]
pub fn (mut km KeywordsMatcherTrie) add_word(word string, value int) {
	wlen := word.len
	// Bounds checks are disabled here by [direct_array_access], so make sure
	// that the slot for `wlen` exists, before indexing `km.nodes` with it.
	for km.nodes.len <= wlen {
		km.nodes << &TrieNode(0)
	}
	if km.max_len < wlen {
		km.max_len = wlen
	}
	if km.min_len > wlen {
		km.min_len = wlen
	}
	if km.nodes[wlen] == unsafe { nil } {
		km.nodes[wlen] = new_trie_node()
	}
	km.nodes[wlen].add_word(word, value, 0)
}
|
||||
|
||||
// new_keywords_matcher_trie creates a new KeywordsMatcherTrie instance from a given map
// with string keys, and integer or enum values. The instance starts with 20 empty
// (nil) trie slots, one per word length, and then every key/value pair of `kw_map`
// is added to it with `add_word`.
pub fn new_keywords_matcher_trie<T>(kw_map map[string]T) KeywordsMatcherTrie {
	slots := 20
	mut matcher := KeywordsMatcherTrie{
		nodes: []&TrieNode{cap: slots}
	}
	// fill all the slots with nil, so that they can be indexed by word length
	for matcher.nodes.len < slots {
		matcher.nodes << &TrieNode(0)
	}
	for word, value in kw_map {
		matcher.add_word(word, value)
	}
	return matcher
}
|
||||
|
||||
// new_keywords_matcher_from_array_trie creates a new KeywordsMatcherTrie instance from a given array
// of strings. The value stored for each string is its index in that array.
// Note: the first string gets the value 0, and `TrieNode.find` only reports positive
// values as matches, so `find` returns -1 for it.
pub fn new_keywords_matcher_from_array_trie(names []string) KeywordsMatcherTrie {
	mut name_to_idx := map[string]int{}
	for idx, name in names {
		name_to_idx[name] = idx
	}
	return new_keywords_matcher_trie<int>(name_to_idx)
}
|
||||
|
||||
//
|
||||
|
||||
// new_trie_node creates a new TrieNode instance, and returns a reference to it.
// The node is created with the default values for all of its fields.
|
||||
pub fn new_trie_node() &TrieNode {
|
||||
return &TrieNode{}
|
||||
}
|
||||
|
||||
// show prints a compact, one line summary of `node` to stderr (its `level`, its
// `value`, and how many of its children are not nil), and then does the same
// for each of its non nil children, with `level + 1`.
pub fn (node &TrieNode) show(level int) {
	mut non_nil_children := 0
	for child in node.children {
		if child == unsafe { nil } {
			continue
		}
		non_nil_children++
	}
	eprintln('> level: ${level:2} | value: ${node.value:12} | non_nil_children: ${non_nil_children:2}')
	for child in node.children {
		if child == unsafe { nil } {
			continue
		}
		child.show(level + 1)
	}
}
|
||||
|
||||
// add_word adds another `word` and `value` pair into the trie, starting from `node` (recursively).
// `word_idx` is just used as an accumulator, and starts from 0 at the root of the tree.
// When `word_idx` is not a valid index in `word` (i.e. the whole word was consumed),
// `value` is stored in the current node. Otherwise the child for the byte at
// `word_idx` is created, if it is missing, and the rest of the word is added to it.
pub fn (mut node TrieNode) add_word(word string, value int, word_idx int) {
	if word_idx < 0 || word_idx >= word.len {
		node.value = value
		return
	}
	ch := word[word_idx]
	mut next := node.children[ch]
	if next == unsafe { nil } {
		next = new_trie_node()
		node.children[ch] = next
	}
	next.add_word(word, value, word_idx + 1)
}
|
||||
|
||||
// find tries to find a match for `word` to the trie (the set of all previously added words).
// It follows one `children` link per byte of `word`, starting from `root`, and returns -1
// as soon as a link is missing. When all the bytes are consumed, it returns the `value`
// of the reached node, if that value is positive, and -1 otherwise. This means that a word
// that was added with the value 0, is reported as not found.
// Bytes that are too big to be an index in the fixed size `children` array, can not lead
// to a match, so they are rejected explicitly, instead of reading outside of the array.
[direct_array_access]
pub fn (root &TrieNode) find(word string) int {
	mut node := unsafe { &TrieNode(root) }
	for c in word {
		// [direct_array_access] disables the automatic bounds checks, so guard manually
		if int(c) >= node.children.len {
			return -1
		}
		child := node.children[c]
		if child == unsafe { nil } {
			return -1
		}
		node = child
	}
	if node.value > 0 {
		return node.value
	}
	return -1
}
|
@ -186,10 +186,10 @@ pub const (
|
||||
token_str = build_token_str()
|
||||
|
||||
keywords = build_keys()
|
||||
|
||||
matcher = new_keywords_matcher<Kind>(keywords)
|
||||
)
|
||||
|
||||
pub const scanner_matcher = new_keywords_matcher_trie<Kind>(keywords)
|
||||
|
||||
// build_keys generates a map with keywords' string values:
|
||||
// Keywords['return'] == .key_return
|
||||
fn build_keys() map[string]Kind {
|
||||
@ -351,7 +351,11 @@ pub fn (t Kind) is_assign() bool {
|
||||
// note: used for some code generation, so no quoting
|
||||
[inline]
|
||||
pub fn (t Kind) str() string {
|
||||
return token.token_str[int(t)]
|
||||
idx := int(t)
|
||||
if idx < 0 || token.token_str.len <= idx {
|
||||
return 'unknown'
|
||||
}
|
||||
return token.token_str[idx]
|
||||
}
|
||||
|
||||
pub fn (t Token) str() string {
|
||||
|
Loading…
Reference in New Issue
Block a user