1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00

v.token: use a more performant KeywordMatcher in the scanner and parser (#15196)

This commit is contained in:
Delyan Angelov 2022-07-25 06:32:06 +03:00 committed by GitHub
parent 297cb5f89c
commit 6a5db0df61
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 227 additions and 111 deletions

View File

@ -2276,11 +2276,3 @@ pub fn type_can_start_with_token(tok &token.Token) bool {
}
return false
}
// build_builtin_type_names_matcher creates a token.KeywordsMatcher for the
// names in `builtin_type_names`. The value associated with each name is its
// index in that array.
fn build_builtin_type_names_matcher() token.KeywordsMatcher {
	mut name_to_idx := map[string]int{}
	for idx, type_name in builtin_type_names {
		name_to_idx[type_name] = idx
	}
	matcher_for_names := token.new_keywords_matcher<int>(name_to_idx)
	return matcher_for_names
}

View File

@ -13,6 +13,7 @@ module ast
import strings
import v.pref
import v.token
pub type Type = int
@ -455,7 +456,7 @@ pub const builtin_type_names = ['void', 'voidptr', 'byteptr', 'charptr', 'i8', '
'isize', 'u8', 'u16', 'u32', 'u64', 'usize', 'f32', 'f64', 'char', 'bool', 'none', 'string',
'rune', 'array', 'map', 'chan', 'any', 'float_literal', 'int_literal', 'thread', 'Error', 'nil']
pub const builtin_type_names_matcher = build_builtin_type_names_matcher()
pub const builtin_type_names_matcher = token.new_keywords_matcher_from_array_trie(builtin_type_names)
pub const (
integer_type_idxs = [i8_type_idx, i16_type_idx, int_type_idx, i64_type_idx, u8_type_idx,

View File

@ -158,7 +158,7 @@ pub fn (mut p Preferences) fill_with_defaults() {
}
$if prealloc {
if !p.no_parallel {
if !p.no_parallel && p.is_verbose {
eprintln('disabling parallel cgen, since V was built with -prealloc')
}
p.no_parallel = true

View File

@ -665,7 +665,7 @@ fn (mut s Scanner) text_scan() token.Token {
// tmp hack to detect . in ${}
// Check if not .eof to prevent panic
next_char := s.look_ahead(1)
kind := token.matcher.find(name)
kind := token.scanner_matcher.find(name)
if kind != -1 {
return s.new_token(token.Kind(kind), name, name.len)
}

View File

@ -4,19 +4,21 @@ import benchmark
const max_repetitions = 4_000_000
fn main() {
km := token.new_keywords_matcher(token.keywords)
mut res := token.Kind{}
km_trie := token.new_keywords_matcher_trie(token.keywords)
for kw in ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else',
'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a'] {
mut res := token.Kind{}
'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a', 'assez', 'returned'] {
mut bmark := benchmark.start()
for _ in 0 .. max_repetitions {
res = token.keywords[kw]
}
bmark.measure('$max_repetitions repetitions of token.keywords["$kw"] = $res')
for _ in 0 .. max_repetitions {
res = km.find(kw)
res = token.Kind(km_trie.find(kw))
}
bmark.measure('$max_repetitions repetitions of km.find("$kw") = $res')
bmark.measure('$max_repetitions repetitions of km_trie.find("$kw") = $res')
println('--------------------------------')
}
}

57
vlib/v/token/README.md Normal file
View File

@ -0,0 +1,57 @@
## Description:
`v.token` is a module providing the basic building blocks of the V
syntax - the tokens, as well as utilities for working with them.
## KeywordsMatcherTrie
KeywordsMatcherTrie provides a faster way of determining whether a given name is a reserved
word (belongs to a given set of previously known words `R`). It works by exploiting the fact
that the set of reserved words is small, and the words are short.
KeywordsMatcherTrie uses an ordered set of [tries](https://en.wikipedia.org/wiki/Trie),
one per each word length, that was added, so that rejecting that something is a reserved
word, can be done in constant time for words smaller or larger in length than all the
reserved ones.
After this initial length check confirms that a word `w` of length `n` could belong
to the trie `Tn`, responsible for all known reserved words of that length, `Tn` is used
to further verify or reject the word quickly. In order to do so, `Tn` prepares in advance
an array of all possible continuations (letters), at each index of the words in `R`, after
any given prefix belonging to `R`.
For example, if we have added the word `asm` to the trie T3, its tree (its nodes) may look
like this (note that the 0 pointers in children, mean that there was no word in `R`, that had
that corresponding letter at that specific index):
```
TrieNode 0: a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
| children: 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | children[`a`] = 1 -> TrieNode 1
| prefix so far: '' | value: 0 |
|
TrieNode 1: a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
| children: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 ... | children[`s`] = 2 -> TrieNode 2
| prefix so far: 'a' | value: 0 |
|
TrieNode 2: a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
| children: 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | children[`m`] = 3 -> TrieNode 3
| prefix so far: 'as' | value: 0 | Note: `as` is a keyword with length 2,
| but we are searching in T3 trie.
|
TrieNode 3: a b c d e f g h i j k l m n o p q r s t u v w x y z ... |
| children: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... | all of children are 0
| prefix so far: 'asm' | value: int(token.Kind.asm) |
```
Matching any given `word` in the trie, after you have prepared it, is then simple:
just read each character of the `word`, and follow the corresponding pointer from
the `children` array (indexed by character). When the pointer is nil, there was NO
match, and the word is rejected, which happens very often, and early for most words
that are not in the set of the previously added reserved words. One significant
benefit compared to just comparing the checked `word` against a linear list of all
known words, is that once you have found that a word is not a match at any given
level/trie node, then you know that it is not a match to *any* of them.
Note: benchmarking shows that it is ~300% to 400% faster, compared to just using
`token.keywords[name]` on average, when there is a match, but it can be 17x faster
in the case, where there is a length mismatch. After changes to KeywordsMatcherTrie,
please do `v -prod run vlib/v/tests/bench/bench_compare_tokens.v` to verify,
that there is no performance regression.

View File

@ -1,92 +0,0 @@
module token
// max_keyword_len is the number of per-length bins in `KeywordsMatcher.words`.
// `add_word` panics for any word whose length is >= this value, so
// bump token.max_keyword_len, if you add a longer keyword
const max_keyword_len = 20
// KeywordsMatcher provides a faster way of determining whether a given name
// is a reserved word, by doing a comparison with only the keywords that
// have exactly the same length as `name`.
// Benchmarking shows that with -prod, it is 20-25% slower in the worst case
// compared to just using token.keywords[name], but can be 20x faster
// in the case, where there is a length mismatch, and 2x-3x faster in most
// cases, where there is a match.
// Without -prod, with tcc, using KeywordsMatcher is always faster
// (2x to 14x times), compared to using a hash of all the keywords.
pub struct KeywordsMatcher {
mut:
// len_min is the length of the shortest word added so far (9999 until a word is added)
len_min int = 9999
// len_max is the length of the longest word added so far (-1 until a word is added)
len_max int = -1
// words[n] holds the entries for all added words of length n
words [max_keyword_len][]WIndex
}
// WIndex pairs an added word with the integer value, that
// `KeywordsMatcher.find` returns, when that word is matched.
struct WIndex {
mut:
// word is the keyword text, as passed to add_word
word string
// index is the int value associated with `word`
index int
}
// new_keywords_matcher creates a KeywordsMatcher from the given map. Each key is
// added as a word, with its value converted to int. After all the words are added,
// every non empty length bin is sorted by word, so that `find` can do a binary search.
pub fn new_keywords_matcher<T>(kw_map map[string]T) KeywordsMatcher {
	mut res := KeywordsMatcher{}
	// TODO: remove this loop. It is currently needed, because a
	// fixed array of arrays is not initialised properly automatically
	// as of 2021/10/28
	for bin in 0 .. token.max_keyword_len {
		res.words[bin] = []WIndex{}
	}
	for kw, kw_value in kw_map {
		res.add_word(kw, int(kw_value))
	}
	for bin in 0 .. token.max_keyword_len {
		if res.words[bin].len == 0 {
			// nothing to sort or trace for lengths, that have no words
			continue
		}
		res.words[bin].sort(a.word < b.word)
		$if trace_keyword_matcher_initialisation ? {
			print('word len: ${bin:3} | words: ')
			for entry in res.words[bin] {
				print('$entry.word, ')
			}
			println('')
		}
	}
	return res
}
// add_word stores `word` with its associated `kind` value in the bin for words
// of the same length, and widens the known [len_min, len_max] range if needed.
// It panics, when the word length is >= token.max_keyword_len.
fn (mut km KeywordsMatcher) add_word(word string, kind int) {
	wlen := word.len
	if wlen >= token.max_keyword_len {
		panic('increase max_keyword_len to > $word.len')
	}
	if wlen > km.len_max {
		km.len_max = wlen
	}
	if wlen < km.len_min {
		km.len_min = wlen
	}
	km.words[wlen] << WIndex{word, kind}
}
// find returns the int value associated with `word`, or -1 when `word` was not
// added to the matcher. Words with a length outside of the known range are
// rejected right away; otherwise a binary search is done over the sorted bin
// of words, that have the same length as `word`.
[direct_array_access]
pub fn (km &KeywordsMatcher) find(word string) int {
	wlen := word.len
	if wlen < km.len_min || wlen > km.len_max {
		return -1
	}
	bin_len := km.words[wlen].len
	if bin_len == 0 {
		return -1
	}
	mut lo := 0
	mut hi := bin_len - 1
	for lo <= hi {
		mid := lo + (hi - lo) / 2
		cmp := km.words[wlen][mid].word.compare(word)
		if cmp == 0 {
			return km.words[wlen][mid].index
		} else if cmp == -1 {
			// the middle word sorts before `word` => continue in the upper half
			lo = mid + 1
		} else if cmp == 1 {
			// the middle word sorts after `word` => continue in the lower half
			hi = mid - 1
		}
	}
	return -1
}

View File

@ -0,0 +1,152 @@
module token
// KeywordsMatcherTrie provides a faster way of determining whether a given name
// is a reserved word (belongs to a given set of previously known words `R`).
// See the module description for more details.
[heap]
pub struct KeywordsMatcherTrie {
mut:
	nodes   []&TrieNode // nodes[n] is the root of the trie for the words of length n, or nil
	min_len int = 999999 // the length of the shortest added word
	max_len int // the length of the longest added word
}

// TrieNode is a single node from a trie, used by KeywordsMatcherTrie
pub struct TrieNode {
mut:
	children [123]&TrieNode // indexed by the byte value of the next character; nil => no continuation
	value    int = -1 // -1 => no word ends at this node; otherwise the value passed to add_word
}

// find tries to find the given `word` in the set of all previously added words
// to the KeywordsMatcherTrie instance. It returns -1 if the word was NOT found
// there at all. If the word was found, find will return the `value` (value >= 0),
// associated with the word, when it was added.
[direct_array_access]
pub fn (km &KeywordsMatcherTrie) find(word string) int {
	wlen := word.len
	if wlen < km.min_len {
		return -1
	}
	if wlen > km.max_len {
		return -1
	}
	// add_word guarantees that km.nodes.len > km.max_len, so this index is in range
	node := km.nodes[wlen]
	if node == unsafe { nil } {
		return -1
	}
	return node.find(word)
}

// add_word adds the given word to the KeywordsMatcherTrie instance. It associates a non
// negative integer value to it, so later `find` could return the value, when it succeeds.
[direct_array_access]
pub fn (mut km KeywordsMatcherTrie) add_word(word string, value int) {
	wlen := word.len
	// Grow the table of per-length tries on demand. Array accesses are not bounds
	// checked here, so without this, a word that is longer than the preallocated
	// number of slots would write outside of `km.nodes`.
	for km.nodes.len <= wlen {
		km.nodes << &TrieNode(0)
	}
	if km.max_len < wlen {
		km.max_len = wlen
	}
	if km.min_len > wlen {
		km.min_len = wlen
	}
	if km.nodes[wlen] == unsafe { nil } {
		km.nodes[wlen] = new_trie_node()
	}
	km.nodes[wlen].add_word(word, value, 0)
}

// new_keywords_matcher_trie creates a new KeywordsMatcherTrie instance from a given map
// with string keys, and integer or enum values.
pub fn new_keywords_matcher_trie<T>(kw_map map[string]T) KeywordsMatcherTrie {
	mut km := KeywordsMatcherTrie{
		nodes: []&TrieNode{cap: 20}
	}
	// preallocate nil roots for the common short word lengths;
	// add_word will append more slots, if a longer word is added
	for _ in 0 .. 20 {
		km.nodes << &TrieNode(0)
	}
	for k, v in kw_map {
		km.add_word(k, v)
	}
	// dump(km.min_len)
	// dump(km.max_len)
	// for idx,x in km.nodes { if x != unsafe { nil } { eprintln('>> idx: $idx | ${ptr_str(x)}') } }
	return km
}

// new_keywords_matcher_from_array_trie creates a new KeywordsMatcherTrie instance from a given array
// of strings. The values for the strings, that `find` will return, will be the indexes in that array.
pub fn new_keywords_matcher_from_array_trie(names []string) KeywordsMatcherTrie {
	mut m := map[string]int{}
	for i, name in names {
		m[name] = i
	}
	return new_keywords_matcher_trie<int>(m)
}

// new_trie_node creates a new TrieNode instance
pub fn new_trie_node() &TrieNode {
	return &TrieNode{}
}

// show displays the information in `node`, in a more compact/readable format (recursively)
pub fn (node &TrieNode) show(level int) {
	mut non_nil_children := 0
	for x in node.children {
		if x != unsafe { nil } {
			non_nil_children++
		}
	}
	eprintln('> level: ${level:2} | value: ${node.value:12} | non_nil_children: ${non_nil_children:2}')
	for x in node.children {
		if x != unsafe { nil } {
			x.show(level + 1)
		}
	}
}

// add_word adds another `word` and `value` pair into the trie, starting from `node` (recursively).
// `word_idx` is just used as an accumulator, and starts from 0 at the root of the tree.
// `value` should be non negative, since `find` uses -1 to signal that there was no match.
pub fn (mut node TrieNode) add_word(word string, value int, word_idx int) {
	first := u8(word[word_idx] or {
		// the end of the word is reached => this node represents the whole word
		node.value = value
		return
	})
	// eprintln('>> node: ${ptr_str(node)} | first: $first | word_idx: $word_idx')
	mut child_node := node.children[first]
	if child_node == unsafe { nil } {
		child_node = new_trie_node()
		node.children[first] = child_node
	}
	child_node.add_word(word, value, word_idx + 1)
}

// find tries to find a match for `word` to the trie (the set of all previously added words).
// It returns -1 if there is no match, or the value associated with the previously added
// matching word by `add_word`.
[direct_array_access]
pub fn (root &TrieNode) find(word string) int {
	wlen := word.len
	mut node := unsafe { &TrieNode(root) }
	mut idx := 0
	for {
		// eprintln('> match_keyword: `${word:20}` | node: ${ptr_str(node)} | idx: ${idx:3}')
		if idx == wlen {
			k := node.value
			// nodes where no word ends keep the default value of -1,
			// so a word that was stored with the value 0 is still a match
			if k >= 0 {
				// node.show(0)
				return k
			}
			return -1
		}
		c := word[idx]
		// Array accesses are not bounds checked here, so bytes that are
		// outside of the `children` table have to be rejected explicitly.
		if int(c) >= node.children.len {
			return -1
		}
		child := node.children[c]
		if child == unsafe { nil } {
			return -1
		}
		node = child
		idx++
	}
	return -1
}

View File

@ -186,10 +186,10 @@ pub const (
token_str = build_token_str()
keywords = build_keys()
matcher = new_keywords_matcher<Kind>(keywords)
)
pub const scanner_matcher = new_keywords_matcher_trie<Kind>(keywords)
// build_keys generates a map with keywords' string values:
// Keywords['return'] == .key_return
fn build_keys() map[string]Kind {
@ -351,7 +351,11 @@ pub fn (t Kind) is_assign() bool {
// note: used for some code generation, so no quoting
[inline]
pub fn (t Kind) str() string {
return token.token_str[int(t)]
idx := int(t)
if idx < 0 || token.token_str.len <= idx {
return 'unknown'
}
return token.token_str[idx]
}
pub fn (t Token) str() string {