Minor speedup on CPU

AlexeyAB
2019-01-26 19:12:46 +03:00
parent 630f441e08
commit 090d934c0f
6 changed files with 168 additions and 37 deletions

src/activations.c

@@ -102,10 +102,17 @@ void activate_array(float *x, const int n, const ACTIVATION a)
int i;
if (a == LINEAR) {}
else if (a == LEAKY) {
#pragma omp parallel for
for (i = 0; i < n; ++i) {
x[i] = leaky_activate(x[i]);
}
}
else if (a == LOGISTIC) {
#pragma omp parallel for
for (i = 0; i < n; ++i) {
x[i] = logistic_activate(x[i]);
}
}
else {
for (i = 0; i < n; ++i) {
x[i] = activate(x[i], a);
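The speedup here comes from threading the element-wise LEAKY and LOGISTIC loops with OpenMP; each iteration touches only x[i], so the loops parallelize safely. A minimal, self-contained sketch of the same pattern (leaky_relu() stands in for darknet's leaky_activate(), slope 0.1; compile with -fopenmp):

#include <stdio.h>

static inline float leaky_relu(float x) { return (x > 0) ? x : 0.1f * x; }

void activate_leaky(float *x, int n)
{
    int i;
    #pragma omp parallel for    // iterations are independent, so threading is safe
    for (i = 0; i < n; ++i) {
        x[i] = leaky_relu(x[i]);
    }
}

int main(void)
{
    float v[4] = { -2.0f, -0.5f, 0.5f, 2.0f };
    activate_leaky(v, 4);
    printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);   // -0.2 -0.05 0.5 2
    return 0;
}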

src/blas.c

@@ -172,8 +172,13 @@ void scal_cpu(int N, float ALPHA, float *X, int INCX)
void fill_cpu(int N, float ALPHA, float *X, int INCX)
{
int i;
if (INCX == 1 && ALPHA == 0) {
memset(X, 0, N * sizeof(float));
}
else {
for (i = 0; i < N; ++i) X[i*INCX] = ALPHA;
}
}
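A note on the new fill_cpu() fast path: it is valid because IEEE-754 +0.0f is represented by all-zero bytes, so memset() reproduces it exactly; that holds for no other ALPHA (and a strided INCX must skip bytes), which is why the generic loop is kept. A minimal check of the assumption:

#include <assert.h>
#include <string.h>

int main(void)
{
    float a[8];
    int i;
    memset(a, 0, sizeof(a));                        // all-zero bytes
    for (i = 0; i < 8; ++i) assert(a[i] == 0.0f);   // +0.0f has bit pattern 0x00000000
    return 0;
}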
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
{

src/convolutional_layer.c

@@ -776,7 +776,12 @@ size_t binary_transpose_align_input(int k, int n, float *b, char **t_bit_input,
size_t t_intput_size = new_ldb * bit_align;// n;
size_t t_bit_input_size = t_intput_size / 8;// +1;
*t_bit_input = calloc(t_bit_input_size, sizeof(char));
static int last_t_bit_input_size = 0;
if (last_t_bit_input_size < t_bit_input_size) {
last_t_bit_input_size = t_bit_input_size;
*t_bit_input = realloc(*t_bit_input, last_t_bit_input_size * sizeof(char));
}
memset(*t_bit_input, 0, t_bit_input_size * sizeof(char));
int src_size = k * bit_align;
// b - [bit_align, k] - [l.bit_align, l.size*l.size*l.c] = src_size
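This hunk shows the commit's core pattern, repeated below for re_packed_input, bin_re_packed_input, and t_bit_input: a per-call calloc()/free() pair becomes a static buffer that only ever grows, with memset() zeroing just the bytes the current call uses. A generic sketch of the idea (the helper name is illustrative, not from the commit; realloc-failure handling is omitted, as in the original):

#include <stdlib.h>
#include <string.h>

static void *grow_only_buffer(void **buf, size_t *capacity, size_t needed)
{
    if (*capacity < needed) {
        *capacity = needed;
        *buf = realloc(*buf, needed);   // realloc(NULL, n) behaves like malloc(n)
    }
    memset(*buf, 0, needed);            // zero only the region this call touches
    return *buf;
}

The buffers are deliberately never freed and the statics make these paths non-reentrant; the commit trades a small, bounded memory footprint for removing allocator calls from every forward pass.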
@@ -798,7 +803,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
if(l.xnor){
if (l.xnor && (!l.align_bit_weights || state.train)) {
if (!l.align_bit_weights || state.train) {
binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
//printf("\n binarize_weights l.align_bit_weights = %p \n", l.align_bit_weights);
@@ -838,8 +843,26 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
const int new_c = l.c / 32;
float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float));
uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));
static float *re_packed_input = NULL;
static int last_re_packed_input_size = 0;
int re_packed_input_size = l.c * l.w * l.h;
if (last_re_packed_input_size < re_packed_input_size) {
last_re_packed_input_size = re_packed_input_size;
re_packed_input = realloc(re_packed_input, last_re_packed_input_size * sizeof(float));
}
memset(re_packed_input, 0, re_packed_input_size * sizeof(float));
static uint32_t *bin_re_packed_input = NULL;
static int last_bin_re_packed_input_size = 0;
int in_re_packed_input_size = new_c * l.w * l.h + 1;
if (last_bin_re_packed_input_size < in_re_packed_input_size) {
last_bin_re_packed_input_size = in_re_packed_input_size;
bin_re_packed_input = realloc(bin_re_packed_input, last_bin_re_packed_input_size * sizeof(uint32_t));
}
memset(bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t));
//float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float));
//uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));
// float32x4 by channel (as in cuDNN)
repack_input(state.input, re_packed_input, l.w, l.h, l.c);
@@ -847,7 +870,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
// 32 x floats -> 1 x uint32_t
float_to_bit(re_packed_input, (char *)bin_re_packed_input, l.c * l.w * l.h);
free(re_packed_input);
//free(re_packed_input);
// slow - convolve the packed inputs and weights: float x 32 by channel (as in cuDNN)
//convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output,
@@ -859,7 +882,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
im2col_cpu_custom((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);
//im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);
free(bin_re_packed_input);
//free(bin_re_packed_input);
int new_k = l.size*l.size*l.c / 32;
@@ -876,7 +899,14 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
//size_t t_intput_size = new_ldb * l.bit_align;// n;
//size_t t_bit_input_size = t_intput_size / 8;// +1;
char *t_bit_input = calloc(t_bit_input_size, sizeof(char));
//char *t_bit_input = calloc(t_bit_input_size, sizeof(char));
static char *t_bit_input = NULL;
static int last_t_bit_input_size = 0;
if (last_t_bit_input_size < t_bit_input_size) {
last_t_bit_input_size = t_bit_input_size;
t_bit_input = realloc(t_bit_input, last_t_bit_input_size * sizeof(char));
}
memset(t_bit_input, 0, t_bit_input_size * sizeof(char));
transpose_uint32((uint32_t *)b, t_bit_input, new_k, n, n, new_ldb);
@@ -889,10 +919,11 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
// t_bit_input, new_ldb / 32,
// c, n, l.mean_arr);
free(t_bit_input);
//free(t_bit_input);
}
else { // else (l.c % 32 != 0)
else
{ // else (l.c % 32 != 0)
//--------------------------------------------------------
//printf(" l.index = %d - old XNOR \n", l.index);
@@ -919,7 +950,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
//size_t ldb_align = 256; // 256 bit for AVX2
int ldb_align = l.lda_align;
size_t new_ldb = k + (ldb_align - k%ldb_align);
char *t_bit_input = NULL;
static char *t_bit_input = NULL;
size_t t_intput_size = binary_transpose_align_input(k, n, b, &t_bit_input, ldb_align, l.bit_align);
//char *t_bit_input = calloc(new_ldb * n, sizeof(char)); // for im2col_cpu_custom_transpose() only
//float_to_bit(t_input, t_bit_input, new_ldb * n); // for im2col_cpu_custom_transpose() only
@@ -930,12 +961,18 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
//gemm_nn_custom_bin_mean_transposed(m, n, k, 1, bit_weights, k, t_bit_input, new_ldb, c, n, mean_arr);
//free(t_input);
free(t_bit_input);
//free(t_bit_input);
//}
}
}
add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
//activate_array(l.output, m*n*l.batch, l.activation);
activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);
return;
}
else {
//printf(" l.index = %d - FP32 \n", l.index);

src/gemm.c

@@ -1151,6 +1151,23 @@ static inline int popcnt256_custom(__m256i n) {
+ _mm256_extract_epi64(val, 3);
}
static inline void xnor_avx2_popcnt(__m256i a_bit256, __m256i b_bit256, __m256i *count_sum) {
__m256i c_bit256 = _mm256_set1_epi8(255);
__m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
c_bit256 = _mm256_andnot_si256(xor256, c_bit256); // can be optimized - the weights could be NOT-ed once up front, making this NOT unnecessary
*count_sum = _mm256_add_epi64(count256(c_bit256), *count_sum); // 1st part of Mula's popcount algorithm
}
// 2nd part of Mula's popcount algorithm
static inline int get_count_mula(__m256i count_sum) {
return _mm256_extract_epi64(count_sum, 0)
+ _mm256_extract_epi64(count_sum, 1)
+ _mm256_extract_epi64(count_sum, 2)
+ _mm256_extract_epi64(count_sum, 3);
}
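A scalar mental model for the two helpers above (a sketch, not part of the commit; __builtin_popcountll is GCC/Clang-specific): XNOR leaves a 1 wherever the two bit-vectors agree, and the popcount counts those agreements. xnor_avx2_popcnt() does this 256 bits at a time, accumulating four 64-bit partial sums in the lanes of count_sum, and get_count_mula() folds the four lanes into one integer.

#include <stdint.h>

static inline int xnor_popcnt64(uint64_t a, uint64_t b)
{
    return __builtin_popcountll(~(a ^ b));   // count bit positions where a and b agree
}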
// 5x faster than the float32 gemm()
// further optimizations: do mean-mult only for the last layer
void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
@@ -1168,45 +1185,101 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
}
#endif
//#pragma omp parallel for
//for (i = 0; i < M; ++i)
#pragma omp parallel for
for (i = 0; i < M; ++i)
for (i = 0; i < (M/2)*2; i += 2)
{ // l.n - filters [16 - 55 - 1024]
float mean_val = mean_arr[i];
float mean_val_0 = mean_arr[i + 0];
float mean_val_1 = mean_arr[i + 1];
int j, k;
__m256i all_1 = _mm256_set1_epi8(255);
for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
int count = 0;
//for (j = 0; j < N; ++j)
for (j = 0; j < (N/2)*2; j += 2)
{ // out_h*out_w - one channel output size [169 - 173056]
//int count = 0;
const int bit_step = 256;
__m256i count_sum = _mm256_set1_epi8(0);
__m256i count_sum_0 = _mm256_set1_epi8(0);
__m256i count_sum_1 = _mm256_set1_epi8(0);
__m256i count_sum_2 = _mm256_set1_epi8(0);
__m256i count_sum_3 = _mm256_set1_epi8(0);
for (k = 0; k < K; k += bit_step) { // l.size*l.size*l.c - one filter size [27 - 9216]
__m256i a_bit256 = _mm256_loadu_si256((__m256i *)(A + (i*lda + k) / 8));
__m256i b_bit256 = _mm256_loadu_si256((__m256i *)(B + (j*ldb + k) / 8));
__m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
__m256i c_bit256 = _mm256_andnot_si256(xor256, all_1); // can be optimized - the weights could be NOT-ed once up front, making this NOT unnecessary
count_sum = _mm256_add_epi64(count256(c_bit256), count_sum); // Mula's algorithm
__m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + 0)*lda + k) / 8));
__m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
__m256i a_bit256_1 = _mm256_loadu_si256((__m256i *)(A + ((i + 1)*lda + k) / 8));
__m256i b_bit256_1 = _mm256_loadu_si256((__m256i *)(B + ((j + 1)*ldb + k) / 8));
xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum_0);
xnor_avx2_popcnt(a_bit256_0, b_bit256_1, &count_sum_1);
xnor_avx2_popcnt(a_bit256_1, b_bit256_0, &count_sum_2);
xnor_avx2_popcnt(a_bit256_1, b_bit256_1, &count_sum_3);
//count += popcnt256(c_bit256);
//binary_int64_printf(c_bit64);
//printf(", count = %d \n\n", tmp_count);
}
// count of 1 bits
//count = count_sum.m256i_i64[0] +
// count_sum.m256i_i64[1] +
// count_sum.m256i_i64[2] +
// count_sum.m256i_i64[3];
count = _mm256_extract_epi64(count_sum, 0)
+ _mm256_extract_epi64(count_sum, 1)
+ _mm256_extract_epi64(count_sum, 2)
+ _mm256_extract_epi64(count_sum, 3);
int count_0 = get_count_mula(count_sum_0);
int count_1 = get_count_mula(count_sum_1);
int count_2 = get_count_mula(count_sum_2);
int count_3 = get_count_mula(count_sum_3);
int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
count_0 = count_0 - f1; // remove extra bits (from empty space for align only)
count_1 = count_1 - f1;
count_2 = count_2 - f1;
count_3 = count_3 - f1;
C[i*ldc + (j + 0)] = (2 * count_0 - K) * mean_val_0;
C[i*ldc + (j + 1)] = (2 * count_1 - K) * mean_val_0;
C[(i + 1)*ldc + (j + 0)] = (2 * count_2 - K) * mean_val_1;
C[(i + 1)*ldc + (j + 1)] = (2 * count_3 - K) * mean_val_1;
}
int i_d;
for (i_d = 0; i_d < 2; ++i_d)
{
float mean_val = mean_arr[i + i_d];
for (j = (N / 2) * 2; j < N; j += 1)
{ // out_h*out_w - one channel output size [169 - 173056]
const int bit_step = 256;
__m256i count_sum = _mm256_set1_epi8(0);
for (k = 0; k < K; k += bit_step) { // l.size*l.size*l.c - one filter size [27 - 9216]
__m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + i_d + 0)*lda + k) / 8));
__m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum);
}
int count = get_count_mula(count_sum);
const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
count = count - f1; // remove extra bits (from empty space for align only)
C[(i + i_d)*ldc + j] = (2 * count - K) * mean_val;
}
}
}
for (i = (M / 2) * 2; i < M; i += 1)
{
float mean_val = mean_arr[i];
int j, k;
for (j = 0; j < N; j += 1)
{ // out_h*out_w - one channel output size [169 - 173056]
const int bit_step = 256;
__m256i count_sum = _mm256_set1_epi8(0);
for (k = 0; k < K; k += bit_step) { // l.size*l.size*l.c - one filter size [27 - 9216]
__m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + 0)*lda + k) / 8));
__m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum);
}
int count = get_count_mula(count_sum);
const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
count = count - f1; // remove extra bits (from empty space for align only)
C[i*ldc + j] = (2 * count - K) * mean_val;
}
}
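Two things are going on in this rewrite. First, 2x2 register blocking: each pair of A rows and B columns is loaded once and feeds four accumulators (count_sum_0..3), roughly halving memory loads per multiply-accumulate; the trailing loops handle the odd remainders of M and N. Second, the epilogue decodes the popcount into a real dot product: each bit encodes +1 or -1, so over K positions dot = matches - mismatches = 2*count - K, and mean_val rescales the binarized filter back toward its real-valued magnitude (the XNOR-Net scaling factor). The f1 correction exists because both bit buffers are zero-padded past K, XNOR(0,0) = 1, and the padding bits would otherwise all count as matches. A worked example with illustrative numbers:

// K = 27 (a 3x3x3 binary filter), bit_step = 256
// f1  = 256 - (27 % 256) = 229 padding bits, all counted as matches
// raw popcount of the XNOR block = 250        (assumed value)
// matches    = 250 - 229 = 21;  mismatches = 27 - 21 = 6
// dot        = 21 - 6 = 2*21 - 27 = 15
// C[i*ldc+j] = 15 * mean_val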

src/network.c

@@ -201,7 +201,7 @@ void forward_network(network net, network_state state)
for(i = 0; i < net.n; ++i){
state.index = i;
layer l = net.layers[i];
if(l.delta){
if(l.delta && state.train){
scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
}
//double time = get_time_point();
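l.delta holds gradients and is only consumed by the backward pass, so zeroing it on every forward call is wasted memset traffic at inference time; gating the scal_cpu() call on state.train skips that work when no backward pass will follow.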

src/shortcut_layer.c

@@ -58,8 +58,17 @@ void resize_shortcut_layer(layer *l, int w, int h)
void forward_shortcut_layer(const layer l, network_state state)
{
if (l.w == l.out_w && l.h == l.out_h && l.c == l.out_c) {
int size = l.batch * l.w * l.h * l.c;
int i;
#pragma omp parallel for
for(i = 0; i < size; ++i)
l.output[i] = state.input[i] + state.net.layers[l.index].output[i];
}
else {
copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
}
activate_array(l.output, l.outputs*l.batch, l.activation);
}
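When the skip connection's shapes match exactly, the new fast path fuses what used to be two passes over the output (copy_cpu() writing the input, then shortcut_cpu() adding the earlier layer's output) into one parallel loop that reads each operand once. A sketch of the fused form (hypothetical helper, not from the commit):

void fused_residual_add(float *out, const float *in, const float *skip, int n)
{
    int i;
    #pragma omp parallel for        // element-wise adds are independent
    for (i = 0; i < n; ++i)
        out[i] = in[i] + skip[i];
}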