Try to use avx_hs() - it is slow and requires 4096-bit alignment when 4096 < (l.size*l.size*l.c)

It may be faster only for sizes of 8192 bits or more.
2023-08-10 21:13:14 +03:00 · 2018-08-08 19:07:10 +03:00
parent 0a326e7afe
commit a284a7da8d
4 changed files with 978 additions and 870 deletions
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@ -684,6 +684,8 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
            // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits)
            {
                size_t ldb_align = 256;// 8;
+                if (k > 4096)ldb_align = 4096;
+
                size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
                size_t t_intput_size = new_ldb * n;
                size_t t_bit_input_size = t_intput_size / 8;// +1;