From 090d934c0f33007ca9422d92c0fdc211abf32ccc Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Sat, 26 Jan 2019 19:12:46 +0300
Subject: [PATCH] Minor speedup on CPU

---
 src/activations.c         |   7 +++
 src/blas.c                |   7 ++-
 src/convolutional_layer.c |  59 +++++++++++++++----
 src/gemm.c                | 117 +++++++++++++++++++++++++++++++-------
 src/network.c             |   2 +-
 src/shortcut_layer.c      |  13 ++++-
 6 files changed, 168 insertions(+), 37 deletions(-)

diff --git a/src/activations.c b/src/activations.c
index c42a67a7..7aba7e26 100644
--- a/src/activations.c
+++ b/src/activations.c
@@ -102,10 +102,17 @@ void activate_array(float *x, const int n, const ACTIVATION a)
     int i;
     if (a == LINEAR) {}
     else if (a == LEAKY) {
+        #pragma omp parallel for
         for (i = 0; i < n; ++i) {
             x[i] = leaky_activate(x[i]);
         }
     }
+    else if (a == LOGISTIC) {
+        #pragma omp parallel for
+        for (i = 0; i < n; ++i) {
+            x[i] = logistic_activate(x[i]);
+        }
+    }
     else {
         for (i = 0; i < n; ++i) {
             x[i] = activate(x[i], a);
diff --git a/src/blas.c b/src/blas.c
index 70e06991..ae84dc72 100644
--- a/src/blas.c
+++ b/src/blas.c
@@ -172,7 +172,12 @@ void scal_cpu(int N, float ALPHA, float *X, int INCX)
 void fill_cpu(int N, float ALPHA, float *X, int INCX)
 {
     int i;
-    for(i = 0; i < N; ++i) X[i*INCX] = ALPHA;
+    if (INCX == 1 && ALPHA == 0) {
+        memset(X, 0, N * sizeof(float));
+    }
+    else {
+        for (i = 0; i < N; ++i) X[i*INCX] = ALPHA;
+    }
 }
 
 void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index d984dd9b..fd055a37 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -776,7 +776,12 @@ size_t binary_transpose_align_input(int k, int n, float *b, char **t_bit_input,
     size_t t_intput_size = new_ldb * bit_align;// n;
     size_t t_bit_input_size = t_intput_size / 8;// +1;
 
-    *t_bit_input = calloc(t_bit_input_size, sizeof(char));
+    static int last_t_bit_input_size = 0;
+    if (last_t_bit_input_size < t_bit_input_size) {
+        last_t_bit_input_size = t_bit_input_size;
+        *t_bit_input = realloc(*t_bit_input, last_t_bit_input_size * sizeof(char));
+    }
+    memset(*t_bit_input, 0, t_bit_input_size * sizeof(char));
     int src_size = k * bit_align;
 
     // b - [bit_align, k] - [l.bit_align, l.size*l.size*l.c] = src_size
@@ -798,7 +803,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
 
     fill_cpu(l.outputs*l.batch, 0, l.output, 1);
 
-    if(l.xnor){
+    if (l.xnor && (!l.align_bit_weights || state.train)) {
         if (!l.align_bit_weights || state.train) {
             binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
             //printf("\n binarize_weights l.align_bit_weights = %p \n", l.align_bit_weights);
@@ -838,8 +843,26 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
 
             const int new_c = l.c / 32;
 
-            float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float));
-            uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));
+            static float *re_packed_input = NULL;
+            static int last_re_packed_input_size = 0;
+            int re_packed_input_size = l.c * l.w * l.h;
+            if (last_re_packed_input_size < re_packed_input_size) {
+                last_re_packed_input_size = re_packed_input_size;
+                re_packed_input = realloc(re_packed_input, last_re_packed_input_size * sizeof(float));
+            }
+            memset(re_packed_input, 0, re_packed_input_size * sizeof(float));
+
+            static uint32_t *bin_re_packed_input = NULL;
+            static int last_bin_re_packed_input_size = 0;
+            int in_re_packed_input_size = new_c * l.w * l.h + 1;
+            if (last_bin_re_packed_input_size < in_re_packed_input_size) {
+                last_bin_re_packed_input_size = in_re_packed_input_size;
+                bin_re_packed_input = realloc(bin_re_packed_input, last_bin_re_packed_input_size * sizeof(uint32_t));
+            }
+            memset(bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t));
+
+            //float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float));
+            //uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));
 
             // float32x4 by channel (as in cuDNN)
             repack_input(state.input, re_packed_input, l.w, l.h, l.c);
@@ -847,7 +870,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             // 32 x floats -> 1 x uint32_t
             float_to_bit(re_packed_input, (char *)bin_re_packed_input, l.c * l.w * l.h);
 
-            free(re_packed_input);
+            //free(re_packed_input);
 
             // slow - convolution the packed inputs and weights: float x 32 by channel (as in cuDNN)
             //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output,
@@ -859,7 +882,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             im2col_cpu_custom((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);
             //im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);
 
-            free(bin_re_packed_input);
+            //free(bin_re_packed_input);
 
             int new_k = l.size*l.size*l.c / 32;
@@ -876,7 +899,14 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             //size_t t_intput_size = new_ldb * l.bit_align;// n;
             //size_t t_bit_input_size = t_intput_size / 8;// +1;
-            char *t_bit_input = calloc(t_bit_input_size, sizeof(char));
+            //char *t_bit_input = calloc(t_bit_input_size, sizeof(char));
+            static char *t_bit_input = NULL;
+            static int last_t_bit_input_size = 0;
+            if (last_t_bit_input_size < t_bit_input_size) {
+                last_t_bit_input_size = t_bit_input_size;
+                t_bit_input = realloc(t_bit_input, last_t_bit_input_size * sizeof(char));
+            }
+            memset(t_bit_input, 0, t_bit_input_size * sizeof(char));
 
             transpose_uint32((uint32_t *)b, t_bit_input, new_k, n, n, new_ldb);
@@ -889,10 +919,11 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             //    t_bit_input, new_ldb / 32,
             //    c, n, l.mean_arr);
 
-            free(t_bit_input);
+            //free(t_bit_input);
 
         }
-        else { // else (l.c % 32 != 0)
+        else
+        { // else (l.c % 32 != 0)
             //--------------------------------------------------------
             //printf(" l.index = %d - old XNOR \n", l.index);
 
@@ -919,7 +950,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             //size_t ldb_align = 256; // 256 bit for AVX2
             int ldb_align = l.lda_align;
             size_t new_ldb = k + (ldb_align - k%ldb_align);
-            char *t_bit_input = NULL;
+            static char *t_bit_input = NULL;
             size_t t_intput_size = binary_transpose_align_input(k, n, b, &t_bit_input, ldb_align, l.bit_align);
             //char *t_bit_input = calloc(new_ldb * n, sizeof(char)); // for im2col_cpu_custom_transpose() only
             //float_to_bit(t_input, t_bit_input, new_ldb * n); // for im2col_cpu_custom_transpose() only
@@ -930,12 +961,18 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, bit_weights, k, t_bit_input, new_ldb, c, n, mean_arr);
 
             //free(t_input);
-            free(t_bit_input);
+            //free(t_bit_input);
             //}
             }
         }
 
+        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
+
+        //activate_array(l.output, m*n*l.batch, l.activation);
+        activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);
+        return;
+
     }
     else {
         //printf(" l.index = %d - FP32 \n", l.index);
diff --git a/src/gemm.c b/src/gemm.c
index 8fa0f767..1b1d8fe2 100644
--- a/src/gemm.c
+++ b/src/gemm.c
@@ -1151,6 +1151,23 @@ static inline int popcnt256_custom(__m256i n) {
         + _mm256_extract_epi64(val, 3);
 }
 
+static inline void xnor_avx2_popcnt(__m256i a_bit256, __m256i b_bit256, __m256i *count_sum) {
+    __m256i c_bit256 = _mm256_set1_epi8(255);
+
+    __m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
+    c_bit256 = _mm256_andnot_si256(xor256, c_bit256);      // can be optimized: NOT the weights once in advance, making this NOT unnecessary
+
+    *count_sum = _mm256_add_epi64(count256(c_bit256), *count_sum);  // 1st part of Mula's popcount algorithm
+}
+
+// 2nd part of Mula's popcount algorithm
+static inline int get_count_mula(__m256i count_sum) {
+    return _mm256_extract_epi64(count_sum, 0)
+        + _mm256_extract_epi64(count_sum, 1)
+        + _mm256_extract_epi64(count_sum, 2)
+        + _mm256_extract_epi64(count_sum, 3);
+}
+
 // 5x times faster than gemm()-float32
 // further optimizations: do mean-mult only for the last layer
 void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
@@ -1168,45 +1185,101 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
     }
 #endif
 
+    //#pragma omp parallel for
+    //for (i = 0; i < M; ++i)
     #pragma omp parallel for
-    for (i = 0; i < M; ++i)
+    for (i = 0; i < (M/2)*2; i += 2)
     {   // l.n - filters [16 - 55 - 1024]
-        float mean_val = mean_arr[i];
+        float mean_val_0 = mean_arr[i + 0];
+        float mean_val_1 = mean_arr[i + 1];
         int j, k;
         __m256i all_1 = _mm256_set1_epi8(255);
 
-        for (j = 0; j < N; ++j)
-        { // out_h*out_w - one channel output size [169 - 173056]
-            int count = 0;
+        //for (j = 0; j < N; ++j)
+        for (j = 0; j < (N/2)*2; j += 2)
+        { // out_h*out_w - one channel output size [169 - 173056]
+            //int count = 0;
             const int bit_step = 256;
-            __m256i count_sum = _mm256_set1_epi8(0);
+            __m256i count_sum_0 = _mm256_set1_epi8(0);
+            __m256i count_sum_1 = _mm256_set1_epi8(0);
+            __m256i count_sum_2 = _mm256_set1_epi8(0);
+            __m256i count_sum_3 = _mm256_set1_epi8(0);
 
             for (k = 0; k < K; k += bit_step) {   // l.size*l.size*l.c - one filter size [27 - 9216]
-                __m256i a_bit256 = _mm256_loadu_si256((__m256i *)(A + (i*lda + k) / 8));
-                __m256i b_bit256 = _mm256_loadu_si256((__m256i *)(B + (j*ldb + k) / 8));
-                __m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
-                __m256i c_bit256 = _mm256_andnot_si256(xor256, all_1); // can be optimized - we can do other NOT for wegihts once and do not do this NOT
-                count_sum = _mm256_add_epi64(count256(c_bit256), count_sum); // Mula's algorithm
+                __m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + 0)*lda + k) / 8));
+                __m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
+
+                __m256i a_bit256_1 = _mm256_loadu_si256((__m256i *)(A + ((i + 1)*lda + k) / 8));
+                __m256i b_bit256_1 = _mm256_loadu_si256((__m256i *)(B + ((j + 1)*ldb + k) / 8));
+
+
+                xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum_0);
+                xnor_avx2_popcnt(a_bit256_0, b_bit256_1, &count_sum_1);
+
+                xnor_avx2_popcnt(a_bit256_1, b_bit256_0, &count_sum_2);
+                xnor_avx2_popcnt(a_bit256_1, b_bit256_1, &count_sum_3);
 
                 //count += popcnt256(c_bit256);
-
                 //binary_int64_printf(c_bit64);
                 //printf(", count = %d \n\n", tmp_count);
             }
 
-            // count of 1 bits
-            //count = count_sum.m256i_i64[0] +
-            //    count_sum.m256i_i64[1] +
-            //    count_sum.m256i_i64[2] +
-            //    count_sum.m256i_i64[3];
-            count = _mm256_extract_epi64(count_sum, 0)
-                + _mm256_extract_epi64(count_sum, 1)
-                + _mm256_extract_epi64(count_sum, 2)
-                + _mm256_extract_epi64(count_sum, 3);
+            int count_0 = get_count_mula(count_sum_0);
+            int count_1 = get_count_mula(count_sum_1);
+            int count_2 = get_count_mula(count_sum_2);
+            int count_3 = get_count_mula(count_sum_3);
 
-            int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+            const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+            count_0 = count_0 - f1;    // remove extra bits (from the empty space added only for alignment)
+            count_1 = count_1 - f1;
+            count_2 = count_2 - f1;
+            count_3 = count_3 - f1;
+
+            C[i*ldc + (j + 0)] = (2 * count_0 - K) * mean_val_0;
+            C[i*ldc + (j + 1)] = (2 * count_1 - K) * mean_val_0;
+            C[(i + 1)*ldc + (j + 0)] = (2 * count_2 - K) * mean_val_1;
+            C[(i + 1)*ldc + (j + 1)] = (2 * count_3 - K) * mean_val_1;
+        }
+
+        int i_d;
+        for (i_d = 0; i_d < 2; ++i_d)
+        {
+            float mean_val = mean_arr[i + i_d];
+            for (j = (N / 2) * 2; j < N; j += 1)
+            { // out_h*out_w - one channel output size [169 - 173056]
+                const int bit_step = 256;
+                __m256i count_sum = _mm256_set1_epi8(0);
+
+                for (k = 0; k < K; k += bit_step) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+                    __m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + i_d + 0)*lda + k) / 8));
+                    __m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
+                    xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum);
+                }
+                int count = get_count_mula(count_sum);
+                const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+                count = count - f1;    // remove extra bits (from the empty space added only for alignment)
+                C[(i + i_d)*ldc + j] = (2 * count - K) * mean_val;
+            }
+        }
+    }
+
+    for (i = (M / 2) * 2; i < M; i += 1)
+    {
+        float mean_val = mean_arr[i];
+        int j, k;
+        for (j = 0; j < N; j += 1)
+        { // out_h*out_w - one channel output size [169 - 173056]
+            const int bit_step = 256;
+            __m256i count_sum = _mm256_set1_epi8(0);
+
+            for (k = 0; k < K; k += bit_step) {   // l.size*l.size*l.c - one filter size [27 - 9216]
+                __m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + 0)*lda + k) / 8));
+                __m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
+                xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum);
+            }
+            int count = get_count_mula(count_sum);
+            const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
             count = count - f1;    // remove extra bits (from empty space for align only)
-
             C[i*ldc + j] = (2 * count - K) * mean_val;
         }
     }
diff --git a/src/network.c b/src/network.c
index 8eb34b3f..b960f27f 100644
--- a/src/network.c
+++ b/src/network.c
@@ -201,7 +201,7 @@ void forward_network(network net, network_state state)
     for(i = 0; i < net.n; ++i){
         state.index = i;
         layer l = net.layers[i];
-        if(l.delta){
+        if(l.delta && state.train){
             scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
         }
         //double time = get_time_point();
diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c
index 7cdc5368..263ee1f4 100644
--- a/src/shortcut_layer.c
+++ b/src/shortcut_layer.c
@@ -58,8 +58,17 @@ void resize_shortcut_layer(layer *l, int w, int h)
 
 void forward_shortcut_layer(const layer l, network_state state)
 {
-    copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
-    shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
+    if (l.w == l.out_w && l.h == l.out_h && l.c == l.out_c) {
+        int size = l.batch * l.w * l.h * l.c;
+        int i;
+        #pragma omp parallel for
+        for(i = 0; i < size; ++i)
+            l.output[i] = state.input[i] + state.net.layers[l.index].output[i];
+    }
+    else {
+        copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
+        shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
+    }
     activate_array(l.output, l.outputs*l.batch, l.activation);
 }
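
The convolutional_layer.c hunks above replace calloc/free pairs that ran on every forward pass with grow-only static scratch buffers: each buffer is reallocated only when a larger size is requested and zeroed before use. A minimal sketch of the pattern, using the hypothetical name get_scratch (the realloc return value is left unchecked here exactly as in the patch, and the static state makes the function non-reentrant across threads):

    #include <stdlib.h>
    #include <string.h>

    // Grow-only scratch buffer: reallocate only when the request exceeds
    // the largest size seen so far, then zero the requested prefix.
    // The buffer is intentionally never freed; the OS reclaims it at exit.
    static float *get_scratch(size_t n)
    {
        static float *buf = NULL;
        static size_t cap = 0;
        if (cap < n) {
            cap = n;
            buf = (float *)realloc(buf, cap * sizeof(float));
        }
        memset(buf, 0, n * sizeof(float));
        return buf;
    }

This removes allocator traffic from the per-layer inner loop at the cost of one bounded, long-lived allocation, which is one source of the speedup named in the subject line.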
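In gemm_nn_custom_bin_mean_transposed, weights and inputs are binarized with +1 stored as bit 1 and -1 as bit 0, so a K-element dot product reduces to XNOR and popcount: each matching bit pair contributes +1 and each mismatch -1, hence dot = matches - mismatches = 2*popcount(xnor(a, b)) - K. Rows are padded up to the 256-bit step, and the padding bits (zero in both operands) are counted as matches by XNOR, which is why the kernel subtracts the correction f1. A scalar sketch of the same arithmetic on 64-bit words (binary_dot is a hypothetical helper; __builtin_popcountll is a GCC/Clang builtin):

    #include <stdint.h>

    // Dot product of K values in {-1, +1}, packed one value per bit.
    static int binary_dot(const uint64_t *a, const uint64_t *b, int K)
    {
        int words = (K + 63) / 64;   // K rounded up to whole 64-bit words
        int matches = 0;
        int k;
        for (k = 0; k < words; ++k) {
            matches += __builtin_popcountll(~(a[k] ^ b[k]));  // popcount of XNOR
        }
        matches -= words * 64 - K;   // padding bits XNOR to 1; drop them (the f1 idea)
        return 2 * matches - K;      // matches - mismatches
    }

The per-filter scale mean_arr[i] then maps the integer result back to an approximate float convolution, matching C[i*ldc + j] = (2 * count - K) * mean_val in the kernel.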
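The rewritten kernel also blocks the output 2x2: each iteration loads two 256-bit rows of A and two of B and feeds all four row/column combinations, so every load is reused twice and load traffic in the inner loop roughly halves; the extra loops afterwards handle an odd last row (i = (M/2)*2) and odd trailing columns (j = (N/2)*2). A minimal float analogue of that blocking, with B stored transposed as in the kernel (a sketch of the loop structure only, not the AVX2 code itself):

    // C[M x N] = A[M x K] * transpose(B), with B stored as [N x K].
    static void gemm_2x2_blocked(int M, int N, int K,
                                 const float *A, const float *B, float *C)
    {
        int i, j, k;
        for (i = 0; i < (M / 2) * 2; i += 2) {
            for (j = 0; j < (N / 2) * 2; j += 2) {
                float c00 = 0, c01 = 0, c10 = 0, c11 = 0;
                for (k = 0; k < K; ++k) {
                    float a0 = A[(i + 0)*K + k], a1 = A[(i + 1)*K + k];
                    float b0 = B[(j + 0)*K + k], b1 = B[(j + 1)*K + k];
                    c00 += a0 * b0; c01 += a0 * b1;   // row i   with cols j, j+1
                    c10 += a1 * b0; c11 += a1 * b1;   // row i+1 with cols j, j+1
                }
                C[(i + 0)*N + j] = c00; C[(i + 0)*N + j + 1] = c01;
                C[(i + 1)*N + j] = c10; C[(i + 1)*N + j + 1] = c11;
            }
            for (; j < N; ++j) {   // odd trailing column
                float c0 = 0, c1 = 0;
                for (k = 0; k < K; ++k) {
                    c0 += A[(i + 0)*K + k] * B[j*K + k];
                    c1 += A[(i + 1)*K + k] * B[j*K + k];
                }
                C[(i + 0)*N + j] = c0;
                C[(i + 1)*N + j] = c1;
            }
        }
        for (; i < M; ++i) {   // odd last row
            for (j = 0; j < N; ++j) {
                float c = 0;
                for (k = 0; k < K; ++k) c += A[i*K + k] * B[j*K + k];
                C[i*N + j] = c;
            }
        }
    }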
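Finally, the shortcut_layer.c and network.c hunks trim inference-only overhead: when a shortcut (residual) layer's input and output dimensions match, the copy_cpu + shortcut_cpu pair collapses into a single fused, OpenMP-parallel elementwise add, and forward_network now zeroes l.delta only while training. A standalone sketch of the fused add under that same-shape assumption (hypothetical function name):

    // out[i] = a[i] + b[i] over n elements; valid only when both inputs
    // share the same w*h*c layout, as the patch checks before taking this path.
    static void fused_residual_add(const float *a, const float *b,
                                   float *out, int n)
    {
        int i;
        #pragma omp parallel for
        for (i = 0; i < n; ++i) {
            out[i] = a[i] + b[i];
        }
    }

Compiled without -fopenmp the pragma is simply ignored and the loop runs serially, which is also the correct fallback behavior.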