From 8014235e0e2eb0e26d2aa8d4f858956711638f03 Mon Sep 17 00:00:00 2001 From: Delyan Angelov Date: Thu, 28 Oct 2021 15:09:41 +0300 Subject: [PATCH] scanner: speed up text_scan by using a specialised keywords matcher, instead of a generic V map of keywords --- vlib/builtin/string.v | 23 ++++++ vlib/v/scanner/scanner.v | 2 +- vlib/v/tests/bench/bench_compare_tokens.v | 22 ++++++ vlib/v/token/keywords_matcher.v | 92 +++++++++++++++++++++++ vlib/v/token/token.v | 38 ++++------ 5 files changed, 151 insertions(+), 26 deletions(-) create mode 100644 vlib/v/tests/bench/bench_compare_tokens.v create mode 100644 vlib/v/token/keywords_matcher.v diff --git a/vlib/builtin/string.v b/vlib/builtin/string.v index 33620e0b39..2182ecf553 100644 --- a/vlib/builtin/string.v +++ b/vlib/builtin/string.v @@ -499,6 +499,28 @@ fn (s string) == (a string) bool { } } +// compare returns -1 if `s` < `a`, 0 if `s` == `a`, and 1 if `s` > `a` +[direct_array_access] +pub fn (s string) compare(a string) int { + min_len := if s.len < a.len { s.len } else { a.len } + for i in 0 .. min_len { + if s[i] < a[i] { + return -1 + } + if s[i] > a[i] { + return 1 + } + } + if s.len < a.len { + return -1 + } + if s.len > a.len { + return 1 + } + return 0 +} + +[direct_array_access] fn (s string) < (a string) bool { for i in 0 .. s.len { if i >= a.len || s[i] > a[i] { @@ -513,6 +535,7 @@ fn (s string) < (a string) bool { return false } +[direct_array_access] fn (s string) + (a string) string { new_len := a.len + s.len mut res := string{ diff --git a/vlib/v/scanner/scanner.v b/vlib/v/scanner/scanner.v index 811c8f4a6d..8cb8c21496 100644 --- a/vlib/v/scanner/scanner.v +++ b/vlib/v/scanner/scanner.v @@ -675,7 +675,7 @@ fn (mut s Scanner) text_scan() token.Token { // tmp hack to detect . 
in ${} // Check if not .eof to prevent panic next_char := s.look_ahead(1) - kind := token.keywords[name] + kind := token.matcher.find(name) if kind != .unknown { return s.new_token(kind, name, name.len) } diff --git a/vlib/v/tests/bench/bench_compare_tokens.v b/vlib/v/tests/bench/bench_compare_tokens.v new file mode 100644 index 0000000000..5bb3d1ec1c --- /dev/null +++ b/vlib/v/tests/bench/bench_compare_tokens.v @@ -0,0 +1,22 @@ +import v.token +import benchmark + +const max_repetitions = 4_000_000 + +fn main() { + km := token.new_keywords_matcher(token.keywords) + for kw in ['for', 'val', 'int', 'f32', 'struct', 'return', 'if', 'in', 'as', 'or', 'else', + 'unsafe', 'return', 'assert', 'Abc', 'my_identifier', 'a'] { + mut res := token.Kind{} + mut bmark := benchmark.start() + for _ in 0 .. max_repetitions { + res = token.keywords[kw] + } + bmark.measure('$max_repetitions repetitions of token.keywords["$kw"] = $res') + for _ in 0 .. max_repetitions { + res = km.find(kw) + } + bmark.measure('$max_repetitions repetitions of km.find("$kw") = $res') + println('--------------------------------') + } +} diff --git a/vlib/v/token/keywords_matcher.v b/vlib/v/token/keywords_matcher.v new file mode 100644 index 0000000000..50d737c528 --- /dev/null +++ b/vlib/v/token/keywords_matcher.v @@ -0,0 +1,92 @@ +module token + +// bump token.max_keyword_len, if you add a longer keyword +const max_keyword_len = 11 + +// KeywordsMatcher provides a faster way of determining whether a given name +// is a reserved keyword, by doing a comparison with only the keywords that +// have exactly the same length as `name`. +// Benchmarking shows that with -prod, it is 20-25% slower in the worst case +// compared to just using token.keywords[name], but can be 20x faster +// in the case, where there is a length mismatch, and 2x-3x faster in most +// cases, where there is a match. 
+// Without -prod, with tcc, using KeywordsMatcher is always faster +// (2x to 14x times), compared to using a hash of all the keywords. +pub struct KeywordsMatcher { +mut: + len_min int = 9999 + len_max int = -1 + words [max_keyword_len][]WKind +} + +struct WKind { +mut: + word string + kind Kind +} + +pub fn new_keywords_matcher(kw_map map[string]Kind) KeywordsMatcher { + mut km := KeywordsMatcher{} + // TODO: remove this loop. It is currently needed, because a + // fixed array of arrays is not initialised properly automatically + // as of 2021/10/28 + for i in 0 .. token.max_keyword_len { + km.words[i] = []WKind{} + } + for k, v in kw_map { + km.add_word(k, v) + } + for i in 0 .. token.max_keyword_len { + if km.words[i].len > 0 { + km.words[i].sort(a.word < b.word) + $if trace_keyword_matcher_initialisation ? { + print('word len: ${i:3} | words: ') + for w in km.words[i] { + print('$w.word, ') + } + println('') + } + } + } + return km +} + +fn (mut km KeywordsMatcher) add_word(word string, kind Kind) { + if word.len >= token.max_keyword_len { + panic('increase max_keyword_len to > $word.len') + } + if km.len_max < word.len { + km.len_max = word.len + } + if word.len < km.len_min { + km.len_min = word.len + } + km.words[word.len] << WKind{word, kind} +} + +// find returns the Kind given a word, by doing a binary search +// on the sorted list of words for each bin +[direct_array_access] +pub fn (km &KeywordsMatcher) find(word string) Kind { + wlen := word.len + if wlen < km.len_min || wlen > km.len_max { + return Kind.unknown + } + list_len := km.words[wlen].len + if list_len == 0 { + return Kind.unknown + } + mut lo := 0 + mut hi := list_len - 1 + for lo <= hi { + mid := lo + (hi - lo) / 2 + cmp := km.words[wlen][mid].word.compare(word) + match cmp { + 0 { return km.words[wlen][mid].kind } + -1 { lo = mid + 1 } + 1 { hi = mid - 1 } + else {} + } + } + return Kind.unknown +} diff --git a/vlib/v/token/token.v b/vlib/v/token/token.v index 1f787ee107..e164c40101 
100644 --- a/vlib/v/token/token.v +++ b/vlib/v/token/token.v @@ -132,15 +132,11 @@ pub enum Kind { _end_ } -pub const ( - assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign, .xor_assign, - .mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign, - .unsigned_right_shift_assign] -) +pub const assign_tokens = [Kind.assign, .plus_assign, .minus_assign, .mult_assign, .div_assign, + .xor_assign, .mod_assign, .or_assign, .and_assign, .right_shift_assign, .left_shift_assign, + .unsigned_right_shift_assign] -const ( - nr_tokens = int(Kind._end_) -) +const nr_tokens = int(Kind._end_) // @FN => will be substituted with the name of the current V function // @METHOD => will be substituted with ReceiverType.MethodName @@ -182,10 +178,8 @@ pub enum AtKind { vexeroot_path } -pub const ( - valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT', - '@VEXE', '@FILE', '@LINE', '@COLUMN', '@VHASH', '@VMOD_FILE'] -) +pub const valid_at_tokens = ['@VROOT', '@VMODROOT', '@VEXEROOT', '@FN', '@METHOD', '@MOD', '@STRUCT', + '@VEXE', '@FILE', '@LINE', '@COLUMN', '@VHASH', '@VMOD_FILE'] // build_keys genereates a map with keywords' string values: // Keywords['return'] == .key_return @@ -315,13 +309,11 @@ fn build_token_str() []string { return s } -const ( - token_str = build_token_str() -) +const token_str = build_token_str() -pub const ( - keywords = build_keys() -) +pub const keywords = build_keys() + +pub const matcher = new_keywords_matcher(keywords) [inline] pub fn is_key(key string) bool { @@ -365,10 +357,8 @@ pub fn (t Token) str() string { // Representation of highest and lowest precedence /* -pub const ( - lowest_prec = 0 - highest_prec = 8 -) +pub const lowest_prec = 0 +pub const highest_prec = 8 */ pub enum Precedence { lowest @@ -439,9 +429,7 @@ pub fn build_precedences() []Precedence { return p } -const ( - precedences = build_precedences() -) +const precedences = build_precedences() // 
precedence returns a tokens precedence if defined, otherwise lowest_prec [inline]