From fc8e3d09717eaac3f2854640f91edfe55ede923f Mon Sep 17 00:00:00 2001 From: Delyan Angelov Date: Thu, 8 Sep 2022 11:09:13 +0300 Subject: [PATCH] builtin,strconv: speedup `str.int()` conversions (without -prod) --- vlib/builtin/string.v | 2 + vlib/strconv/atof.c.v | 5 ++- vlib/strconv/atofq.c.v | 1 + vlib/strconv/atoi.v | 49 +++++++++++++----------- vlib/strconv/number_to_base.c.v | 2 +- vlib/strconv/utilities.c.v | 4 +- vlib/strconv/vprintf.c.v | 4 +- vlib/strings/strings.c.v | 2 + vlib/v/tests/bench/bench_string_int.v | 17 ++++++++ vlib/v/tests/big_array_allocation_test.v | 7 ++++ 10 files changed, 63 insertions(+), 30 deletions(-) create mode 100644 vlib/v/tests/bench/bench_string_int.v diff --git a/vlib/builtin/string.v b/vlib/builtin/string.v index b2bf9c1946..da1b25eb2d 100644 --- a/vlib/builtin/string.v +++ b/vlib/builtin/string.v @@ -1818,6 +1818,7 @@ pub fn (s []string) join_lines() string { // reverse returns a reversed string. // Example: assert 'Hello V'.reverse() == 'V olleH' +[direct_array_access] pub fn (s string) reverse() string { if s.len == 0 || s.len == 1 { return s.clone() @@ -1870,6 +1871,7 @@ pub fn (s string) bytes() []u8 { } // repeat returns a new string with `count` number of copies of the string it was called on. +[direct_array_access] pub fn (s string) repeat(count int) string { if count < 0 { panic('string.repeat: count is negative: $count') diff --git a/vlib/strconv/atof.c.v b/vlib/strconv/atof.c.v index c51bd3f294..14241cdcd3 100644 --- a/vlib/strconv/atof.c.v +++ b/vlib/strconv/atof.c.v @@ -101,7 +101,7 @@ fn sub96(s2 u32, s1 u32, s0 u32, d2 u32, d1 u32, d0 u32) (u32, u32, u32) { // Utility functions fn is_digit(x u8) bool { - return (x >= strconv.c_zero && x <= strconv.c_nine) == true + return x >= strconv.c_zero && x <= strconv.c_nine } fn is_space(x u8) bool { @@ -109,7 +109,7 @@ fn is_space(x u8) bool { } fn is_exp(x u8) bool { - return (x == `E` || x == `e`) == true + return x == `E` || x == `e` } // Possible parser return values. @@ -124,6 +124,7 @@ enum ParserState { // parser tries to parse the given string into a number // NOTE: #TOFIX need one char after the last char of the number +[direct_array_access] fn parser(s string) (ParserState, PrepNumber) { mut digx := 0 mut result := ParserState.ok diff --git a/vlib/strconv/atofq.c.v b/vlib/strconv/atofq.c.v index 07bb7b33bd..d7b2b5e6a0 100644 --- a/vlib/strconv/atofq.c.v +++ b/vlib/strconv/atofq.c.v @@ -16,6 +16,7 @@ Know limitation: */ // atof_quick return a f64 number from a string in a quick way +[direct_array_access] pub fn atof_quick(s string) f64 { mut f := Float64u{} // result mut sign := f64(1.0) // result sign diff --git a/vlib/strconv/atoi.v b/vlib/strconv/atoi.v index 2babb2c1a2..6418d1989d 100644 --- a/vlib/strconv/atoi.v +++ b/vlib/strconv/atoi.v @@ -12,8 +12,9 @@ const ( max_u64 = u64(18446744073709551615) // as u64 // use this until we add support ) +[inline] pub fn byte_to_lower(c u8) u8 { - return c | (`x` - `X`) + return c | 32 } // common_parse_uint is called by parse_uint and allows the parsing @@ -34,14 +35,14 @@ pub fn common_parse_uint(s string, _base int, _bit_size int, error_on_non_digit // the first returned value contains the parsed value, // the second returned value contains the error code (0 = OK, >1 = index of first non-parseable character + 1, -1 = wrong base, -2 = wrong bit size, -3 = overflow) +[direct_array_access] pub fn common_parse_uint2(s string, _base int, _bit_size int) (u64, int) { - mut bit_size := _bit_size - mut base := _base if s.len < 1 || !underscore_ok(s) { // return error('parse_uint: syntax error $s') return u64(0), 1 } - base0 := base == 0 + mut bit_size := _bit_size + mut base := _base mut start_index := 0 if 2 <= base && base <= 36 { // valid base; nothing to do @@ -49,13 +50,13 @@ pub fn common_parse_uint2(s string, _base int, _bit_size int) (u64, int) { // Look for octal, hex prefix. base = 10 if s[0] == `0` { - if s.len >= 3 && byte_to_lower(s[1]) == `b` { + if s.len >= 3 && s[1] | 32 == `b` { base = 2 start_index += 2 - } else if s.len >= 3 && byte_to_lower(s[1]) == `o` { + } else if s.len >= 3 && s[1] | 32 == `o` { base = 8 start_index += 2 - } else if s.len >= 3 && byte_to_lower(s[1]) == `x` { + } else if s.len >= 3 && s[1] | 32 == `x` { base = 16 start_index += 2 } @@ -85,10 +86,10 @@ pub fn common_parse_uint2(s string, _base int, _bit_size int) (u64, int) { mut n := u64(0) for i in start_index .. s.len { c := s[i] - cl := byte_to_lower(c) + cl := c | 32 mut d := u8(0) - if c == `_` && base0 { + if c == `_` && _base == 0 { // underscore_ok already called continue } else if `0` <= c && c <= `9` { @@ -125,13 +126,17 @@ pub fn parse_uint(s string, _base int, _bit_size int) ?u64 { // common_parse_int is called by parse int and allows the parsing // to stop on non or invalid digit characters and return with an error +[direct_array_access] pub fn common_parse_int(_s string, base int, _bit_size int, error_on_non_digit bool, error_on_high_digit bool) ?i64 { - mut s := _s - mut bit_size := _bit_size - if s.len < 1 { + if _s.len < 1 { // return error('parse_int: syntax error $s') return i64(0) } + mut bit_size := _bit_size + if bit_size == 0 { + bit_size = strconv.int_size + } + mut s := _s // Pick off leading sign. mut neg := false if s[0] == `+` { @@ -148,9 +153,6 @@ pub fn common_parse_int(_s string, base int, _bit_size int, error_on_non_digit b if un == 0 { return i64(0) } - if bit_size == 0 { - bit_size = strconv.int_size - } // TODO: check should u64(bit_size-1) be size of int (32)? cutoff := u64(1) << u64(bit_size - 1) if !neg && un >= cutoff { @@ -181,9 +183,10 @@ pub fn parse_int(_s string, base int, _bit_size int) ?i64 { } // atoi is equivalent to parse_int(s, 10, 0), converted to type int. +[direct_array_access] pub fn atoi(s string) ?int { if s == '' { - return error('strconv.atoi: parsing "$s": invalid syntax ') + return error('strconv.atoi: parsing "": invalid syntax') } if (strconv.int_size == 32 && (0 < s.len && s.len < 10)) || (strconv.int_size == 64 && (0 < s.len && s.len < 19)) { @@ -193,7 +196,7 @@ pub fn atoi(s string) ?int { start_idx++ if s.len - start_idx < 1 { // return 0, &NumError{fnAtoi, s0, ErrSyntax} - return error('strconv.atoi: parsing "$s": invalid syntax ') + return error('strconv.atoi: parsing "$s": invalid syntax') } } mut n := 0 @@ -201,7 +204,7 @@ pub fn atoi(s string) ?int { ch := s[i] - `0` if ch > 9 { // return 0, &NumError{fnAtoi, s0, ErrSyntax} - return error('strconv.atoi: parsing "$s": invalid syntax ') + return error('strconv.atoi: parsing "$s": invalid syntax') } n = n * 10 + int(ch) } @@ -215,6 +218,7 @@ pub fn atoi(s string) ?int { // underscore_ok reports whether the underscores in s are allowed. // Checking them in this one function lets all the parsers skip over them simply. // Underscore must appear only between digits or between a base prefix and a digit. +[direct_array_access] fn underscore_ok(s string) bool { // saw tracks the last character (class) we saw: // ^ for beginning of number, @@ -229,17 +233,16 @@ fn underscore_ok(s string) bool { } // Optional base prefix. mut hex := false - if s.len - i >= 2 && s[i] == `0` && (byte_to_lower(s[i + 1]) == `b` - || byte_to_lower(s[i + 1]) == `o` || byte_to_lower(s[i + 1]) == `x`) { + if (s.len - i >= 2) && (s[i] == `0`) && (((s[i + 1] | 32) == `b`) + || ((s[i + 1] | 32) == `o`) || ((s[i + 1] | 32) == `x`)) { saw = `0` // base prefix counts as a digit for "underscore as digit separator" - hex = byte_to_lower(s[i + 1]) == `x` + hex = (s[i + 1] | 32) == `x` i += 2 } // Number proper. for ; i < s.len; i++ { // Digits are always okay. - if (`0` <= s[i] && s[i] <= `9`) || (hex && `a` <= byte_to_lower(s[i]) - && byte_to_lower(s[i]) <= `f`) { + if (`0` <= s[i] && s[i] <= `9`) || ((hex && `a` <= (s[i] | 32)) && ((s[i] | 32) <= `f`)) { saw = `0` continue } diff --git a/vlib/strconv/number_to_base.c.v b/vlib/strconv/number_to_base.c.v index afce53be79..69409f7e21 100644 --- a/vlib/strconv/number_to_base.c.v +++ b/vlib/strconv/number_to_base.c.v @@ -41,7 +41,7 @@ pub fn format_int(n i64, radix int) string { // format_uint returns the string representation of the number n in base `radix` // for digit values > 10, this function uses the small latin leters a-z. -[manualfree] +[direct_array_access; manualfree] pub fn format_uint(n u64, radix int) string { unsafe { if radix < 2 || radix > 36 { diff --git a/vlib/strconv/utilities.c.v b/vlib/strconv/utilities.c.v index 5cdfbf153d..d2c33a3a0f 100644 --- a/vlib/strconv/utilities.c.v +++ b/vlib/strconv/utilities.c.v @@ -75,7 +75,7 @@ pub fn f64_to_str_l_no_dot(f f64) string { // floating-point `string` in scientific notation. // // Example: assert strconv.fxx_to_str_l_parse('34.22e+00') == '34.22' -[manualfree] +[direct_array_access; manualfree] pub fn fxx_to_str_l_parse(s string) string { // check for +inf -inf Nan if s.len > 2 && (s[0] == `n` || s[1] == `i`) { @@ -202,7 +202,7 @@ pub fn fxx_to_str_l_parse(s string) string { // The decimal digits after the dot can be omitted. // // Example: assert strconv.fxx_to_str_l_parse_no_dot ('34.e+01') == '340' -[manualfree] +[direct_array_access; manualfree] pub fn fxx_to_str_l_parse_no_dot(s string) string { // check for +inf -inf Nan if s.len > 2 && (s[0] == `n` || s[1] == `i`) { diff --git a/vlib/strconv/vprintf.c.v b/vlib/strconv/vprintf.c.v index dbea0e1490..85b68a2fcb 100644 --- a/vlib/strconv/vprintf.c.v +++ b/vlib/strconv/vprintf.c.v @@ -36,7 +36,7 @@ pub fn v_printf(str string, pt ...voidptr) { // assert strconv.v_sprintf('aaa %G', x) == 'aaa 3.141516' // ``` [deprecated: 'use string interpolation instead'] -[manualfree] +[direct_array_access; manualfree] pub fn v_sprintf(str string, pt ...voidptr) string { mut res := strings.new_builder(pt.len * 16) defer { @@ -560,7 +560,7 @@ fn fabs(x f64) f64 { } // strings.Builder version of format_fl -[manualfree] +[direct_array_access; manualfree] pub fn format_fl_old(f f64, p BF_param) string { unsafe { mut s := '' diff --git a/vlib/strings/strings.c.v b/vlib/strings/strings.c.v index b475786ba9..ba14afcd74 100644 --- a/vlib/strings/strings.c.v +++ b/vlib/strings/strings.c.v @@ -1,6 +1,7 @@ module strings // strings.repeat - fill a string with `n` repetitions of the character `c` +[direct_array_access] pub fn repeat(c u8, n int) string { if n <= 0 { return '' @@ -16,6 +17,7 @@ pub fn repeat(c u8, n int) string { // strings.repeat_string - gives you `n` repetitions of the substring `s` // Note: strings.repeat, that repeats a single byte, is between 2x // and 24x faster than strings.repeat_string called for a 1 char string. +[direct_array_access] pub fn repeat_string(s string, n int) string { if n <= 0 || s.len == 0 { return '' diff --git a/vlib/v/tests/bench/bench_string_int.v b/vlib/v/tests/bench/bench_string_int.v new file mode 100644 index 0000000000..0b780012d0 --- /dev/null +++ b/vlib/v/tests/bench/bench_string_int.v @@ -0,0 +1,17 @@ +import benchmark + +const maxn = 999_999 + +fn main() { + mut snumbers := []string{cap: maxn} + for i in 0 .. maxn { + snumbers << i.str() + } + mut sum := i64(0) + mut bmark := benchmark.start() + for s in snumbers { + sum += s.int() + } + bmark.measure('s.int()') + dump(sum) +} diff --git a/vlib/v/tests/big_array_allocation_test.v b/vlib/v/tests/big_array_allocation_test.v index 1a6e5a1581..fc34a6f89e 100644 --- a/vlib/v/tests/big_array_allocation_test.v +++ b/vlib/v/tests/big_array_allocation_test.v @@ -1,3 +1,5 @@ +import os + [direct_array_access] fn test_big_int_array() { dump(sizeof(isize)) @@ -5,6 +7,11 @@ fn test_big_int_array() { if sizeof(isize) > 4 { maxn = 1_000_000_000 // 1 billion integers, when each is 4 bytes => require ~4GB } + // NB: this test requires RAM that many people do not have, so only run it in full, when VTEST_BIGMEM is 1 + vtest_bigmem := os.getenv('VTEST_BIGMEM').int() + if vtest_bigmem == 0 { + maxn = 10_000_000 + } dump(maxn) mut data := []int{len: maxn}