Minor speedup on CPU

AlexeyAB
2019-01-26 19:12:46 +03:00
parent 630f441e08
commit 090d934c0f
6 changed files with 168 additions and 37 deletions

src/activations.c

@@ -102,10 +102,17 @@ void activate_array(float *x, const int n, const ACTIVATION a)
int i;
if (a == LINEAR) {}
else if (a == LEAKY) {
#pragma omp parallel for
for (i = 0; i < n; ++i) {
x[i] = leaky_activate(x[i]);
}
}
else if (a == LOGISTIC) {
#pragma omp parallel for
for (i = 0; i < n; ++i) {
x[i] = logistic_activate(x[i]);
}
}
else {
for (i = 0; i < n; ++i) {
x[i] = activate(x[i], a);
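The speedup here comes from threading the element-wise LEAKY and LOGISTIC loops with OpenMP; each iteration touches only x[i], so the loops parallelize safely. A minimal, self-contained sketch of the same pattern (leaky_relu() stands in for darknet's leaky_activate(), slope 0.1; compile with -fopenmp):

#include <stdio.h>

static inline float leaky_relu(float x) { return (x > 0) ? x : 0.1f * x; }

void activate_leaky(float *x, int n)
{
    int i;
    #pragma omp parallel for    // iterations are independent, so threading is safe
    for (i = 0; i < n; ++i) {
        x[i] = leaky_relu(x[i]);
    }
}

int main(void)
{
    float v[4] = { -2.0f, -0.5f, 0.5f, 2.0f };
    activate_leaky(v, 4);
    printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);   // -0.2 -0.05 0.5 2
    return 0;
}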

src/blas.c

@@ -172,8 +172,13 @@ void scal_cpu(int N, float ALPHA, float *X, int INCX)
void fill_cpu(int N, float ALPHA, float *X, int INCX)
{
int i;
if (INCX == 1 && ALPHA == 0) {
memset(X, 0, N * sizeof(float));
}
else {
for (i = 0; i < N; ++i) X[i*INCX] = ALPHA;
}
}
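A note on the new fill_cpu() fast path: it is valid because IEEE-754 +0.0f is represented by all-zero bytes, so memset() reproduces it exactly; that holds for no other ALPHA (and a strided INCX must skip bytes), which is why the generic loop is kept. A minimal check of the assumption:

#include <assert.h>
#include <string.h>

int main(void)
{
    float a[8];
    int i;
    memset(a, 0, sizeof(a));                        // all-zero bytes
    for (i = 0; i < 8; ++i) assert(a[i] == 0.0f);   // +0.0f has bit pattern 0x00000000
    return 0;
}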
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
{

src/convolutional_layer.c

@@ -776,7 +776,12 @@ size_t binary_transpose_align_input(int k, int n, float *b, char **t_bit_input,
size_t t_intput_size = new_ldb * bit_align;// n;
size_t t_bit_input_size = t_intput_size / 8;// +1;
*t_bit_input = calloc(t_bit_input_size, sizeof(char));
static int last_t_bit_input_size = 0;
if (last_t_bit_input_size < t_bit_input_size) {
last_t_bit_input_size = t_bit_input_size;
*t_bit_input = realloc(*t_bit_input, last_t_bit_input_size * sizeof(char));
}
memset(*t_bit_input, 0, t_bit_input_size * sizeof(char));
int src_size = k * bit_align;
// b - [bit_align, k] - [l.bit_align, l.size*l.size*l.c] = src_size
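This hunk shows the commit's core pattern, repeated below for re_packed_input, bin_re_packed_input, and t_bit_input: a per-call calloc()/free() pair becomes a static buffer that only ever grows, with memset() zeroing just the bytes the current call uses. A generic sketch of the idea (the helper name is illustrative, not from the commit; realloc-failure handling is omitted, as in the original):

#include <stdlib.h>
#include <string.h>

static void *grow_only_buffer(void **buf, size_t *capacity, size_t needed)
{
    if (*capacity < needed) {
        *capacity = needed;
        *buf = realloc(*buf, needed);   // realloc(NULL, n) behaves like malloc(n)
    }
    memset(*buf, 0, needed);            // zero only the region this call touches
    return *buf;
}

The buffers are deliberately never freed and the statics make these paths non-reentrant; the commit trades a small, bounded memory footprint for removing allocator calls from every forward pass.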
@@ -798,7 +803,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
if(l.xnor){
if (l.xnor && (!l.align_bit_weights || state.train)) {
if (!l.align_bit_weights || state.train) {
binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
//printf("\n binarize_weights l.align_bit_weights = %p \n", l.align_bit_weights);
@@ -838,8 +843,26 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
const int new_c = l.c / 32;
float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float));
uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));
static float *re_packed_input = NULL;
static int last_re_packed_input_size = 0;
int re_packed_input_size = l.c * l.w * l.h;
if (last_re_packed_input_size < re_packed_input_size) {
last_re_packed_input_size = re_packed_input_size;
re_packed_input = realloc(re_packed_input, last_re_packed_input_size * sizeof(float));
}
memset(re_packed_input, 0, re_packed_input_size * sizeof(float));
static uint32_t *bin_re_packed_input = NULL;
static int last_bin_re_packed_input_size = 0;
int in_re_packed_input_size = new_c * l.w * l.h + 1;
if (last_bin_re_packed_input_size < in_re_packed_input_size) {
last_bin_re_packed_input_size = in_re_packed_input_size;
bin_re_packed_input = realloc(bin_re_packed_input, last_bin_re_packed_input_size * sizeof(uint32_t));
}
memset(bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t));
//float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float));
//uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));
// float32x4 by channel (as in cuDNN)
repack_input(state.input, re_packed_input, l.w, l.h, l.c);
@@ -847,7 +870,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
// 32 x floats -> 1 x uint32_t
float_to_bit(re_packed_input, (char *)bin_re_packed_input, l.c * l.w * l.h);
free(re_packed_input);
//free(re_packed_input);
// slow - convolve the packed inputs and weights: float x 32 by channel (as in cuDNN)
//convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output,
@@ -859,7 +882,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
im2col_cpu_custom((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);
//im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);
free(bin_re_packed_input);
//free(bin_re_packed_input);
int new_k = l.size*l.size*l.c / 32;
@@ -876,7 +899,14 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
//size_t t_intput_size = new_ldb * l.bit_align;// n;
//size_t t_bit_input_size = t_intput_size / 8;// +1;
char *t_bit_input = calloc(t_bit_input_size, sizeof(char));
//char *t_bit_input = calloc(t_bit_input_size, sizeof(char));
static char *t_bit_input = NULL;
static int last_t_bit_input_size = 0;
if (last_t_bit_input_size < t_bit_input_size) {
last_t_bit_input_size = t_bit_input_size;
t_bit_input = realloc(t_bit_input, last_t_bit_input_size * sizeof(char));
}
memset(t_bit_input, 0, t_bit_input_size * sizeof(char));
transpose_uint32((uint32_t *)b, t_bit_input, new_k, n, n, new_ldb);
@@ -889,10 +919,11 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
// t_bit_input, new_ldb / 32,
// c, n, l.mean_arr);
free(t_bit_input);
//free(t_bit_input);
}
else { // else (l.c % 32 != 0)
else
{ // else (l.c % 32 != 0)
//--------------------------------------------------------
//printf(" l.index = %d - old XNOR \n", l.index);
@@ -919,7 +950,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
//size_t ldb_align = 256; // 256 bit for AVX2
int ldb_align = l.lda_align;
size_t new_ldb = k + (ldb_align - k%ldb_align);
char *t_bit_input = NULL;
static char *t_bit_input = NULL;
size_t t_intput_size = binary_transpose_align_input(k, n, b, &t_bit_input, ldb_align, l.bit_align);
//char *t_bit_input = calloc(new_ldb * n, sizeof(char)); // for im2col_cpu_custom_transpose() only
//float_to_bit(t_input, t_bit_input, new_ldb * n); // for im2col_cpu_custom_transpose() only
@@ -930,12 +961,18 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
//gemm_nn_custom_bin_mean_transposed(m, n, k, 1, bit_weights, k, t_bit_input, new_ldb, c, n, mean_arr);
//free(t_input);
free(t_bit_input);
//free(t_bit_input);
//}
}
}
add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
//activate_array(l.output, m*n*l.batch, l.activation);
activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);
return;
}
else {
//printf(" l.index = %d - FP32 \n", l.index);

src/gemm.c

@@ -1151,6 +1151,23 @@ static inline int popcnt256_custom(__m256i n) {
+ _mm256_extract_epi64(val, 3);
}
static inline void xnor_avx2_popcnt(__m256i a_bit256, __m256i b_bit256, __m256i *count_sum) {
__m256i c_bit256 = _mm256_set1_epi8(255);
__m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
c_bit256 = _mm256_andnot_si256(xor256, c_bit256); // can be optimized - the weights could be NOT-ed once up front, making this NOT unnecessary
*count_sum = _mm256_add_epi64(count256(c_bit256), *count_sum); // 1st part of Mula's popcount algorithm
}
// 2nd part of Mula's popcount algorithm
static inline int get_count_mula(__m256i count_sum) {
return _mm256_extract_epi64(count_sum, 0)
+ _mm256_extract_epi64(count_sum, 1)
+ _mm256_extract_epi64(count_sum, 2)
+ _mm256_extract_epi64(count_sum, 3);
}
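A scalar mental model for the two helpers above (a sketch, not part of the commit; __builtin_popcountll is GCC/Clang-specific): XNOR leaves a 1 wherever the two bit-vectors agree, and the popcount counts those agreements. xnor_avx2_popcnt() does this 256 bits at a time, accumulating four 64-bit partial sums in the lanes of count_sum, and get_count_mula() folds the four lanes into one integer.

#include <stdint.h>

static inline int xnor_popcnt64(uint64_t a, uint64_t b)
{
    return __builtin_popcountll(~(a ^ b));   // count bit positions where a and b agree
}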
// 5x faster than the float32 gemm()
// further optimizations: do mean-mult only for the last layer
void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
@@ -1168,45 +1185,101 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
}
#endif
//#pragma omp parallel for
//for (i = 0; i < M; ++i)
#pragma omp parallel for
for (i = 0; i < M; ++i)
for (i = 0; i < (M/2)*2; i += 2)
{ // l.n - filters [16 - 55 - 1024]
float mean_val = mean_arr[i];
float mean_val_0 = mean_arr[i + 0];
float mean_val_1 = mean_arr[i + 1];
int j, k;
__m256i all_1 = _mm256_set1_epi8(255);
for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
int count = 0;
//for (j = 0; j < N; ++j)
for (j = 0; j < (N/2)*2; j += 2)
{ // out_h*out_w - one channel output size [169 - 173056]
//int count = 0;
const int bit_step = 256;
__m256i count_sum = _mm256_set1_epi8(0);
__m256i count_sum_0 = _mm256_set1_epi8(0);
__m256i count_sum_1 = _mm256_set1_epi8(0);
__m256i count_sum_2 = _mm256_set1_epi8(0);
__m256i count_sum_3 = _mm256_set1_epi8(0);
for (k = 0; k < K; k += bit_step) { // l.size*l.size*l.c - one filter size [27 - 9216]
__m256i a_bit256 = _mm256_loadu_si256((__m256i *)(A + (i*lda + k) / 8));
__m256i b_bit256 = _mm256_loadu_si256((__m256i *)(B + (j*ldb + k) / 8));
__m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
__m256i c_bit256 = _mm256_andnot_si256(xor256, all_1); // can be optimized - the weights could be NOT-ed once up front, making this NOT unnecessary
count_sum = _mm256_add_epi64(count256(c_bit256), count_sum); // Mula's algorithm
__m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + 0)*lda + k) / 8));
__m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
__m256i a_bit256_1 = _mm256_loadu_si256((__m256i *)(A + ((i + 1)*lda + k) / 8));
__m256i b_bit256_1 = _mm256_loadu_si256((__m256i *)(B + ((j + 1)*ldb + k) / 8));
xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum_0);
xnor_avx2_popcnt(a_bit256_0, b_bit256_1, &count_sum_1);
xnor_avx2_popcnt(a_bit256_1, b_bit256_0, &count_sum_2);
xnor_avx2_popcnt(a_bit256_1, b_bit256_1, &count_sum_3);
//count += popcnt256(c_bit256);
//binary_int64_printf(c_bit64);
//printf(", count = %d \n\n", tmp_count);
}
// count of 1 bits
//count = count_sum.m256i_i64[0] +
// count_sum.m256i_i64[1] +
// count_sum.m256i_i64[2] +
// count_sum.m256i_i64[3];
count = _mm256_extract_epi64(count_sum, 0)
+ _mm256_extract_epi64(count_sum, 1)
+ _mm256_extract_epi64(count_sum, 2)
+ _mm256_extract_epi64(count_sum, 3);
int count_0 = get_count_mula(count_sum_0);
int count_1 = get_count_mula(count_sum_1);
int count_2 = get_count_mula(count_sum_2);
int count_3 = get_count_mula(count_sum_3);
int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
count_0 = count_0 - f1; // remove extra bits (from empty space for align only)
count_1 = count_1 - f1;
count_2 = count_2 - f1;
count_3 = count_3 - f1;
C[i*ldc + (j + 0)] = (2 * count_0 - K) * mean_val_0;
C[i*ldc + (j + 1)] = (2 * count_1 - K) * mean_val_0;
C[(i + 1)*ldc + (j + 0)] = (2 * count_2 - K) * mean_val_1;
C[(i + 1)*ldc + (j + 1)] = (2 * count_3 - K) * mean_val_1;
}
int i_d;
for (i_d = 0; i_d < 2; ++i_d)
{
float mean_val = mean_arr[i + i_d];
for (j = (N / 2) * 2; j < N; j += 1)
{ // out_h*out_w - one channel output size [169 - 173056]
const int bit_step = 256;
__m256i count_sum = _mm256_set1_epi8(0);
for (k = 0; k < K; k += bit_step) { // l.size*l.size*l.c - one filter size [27 - 9216]
__m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + i_d + 0)*lda + k) / 8));
__m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum);
}
int count = get_count_mula(count_sum);
const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
count = count - f1; // remove extra bits (from empty space for align only)
C[(i + i_d)*ldc + j] = (2 * count - K) * mean_val;
}
}
}
for (i = (M / 2) * 2; i < M; i += 1)
{
float mean_val = mean_arr[i];
int j, k;
for (j = 0; j < N; j += 1)
{ // out_h*out_w - one channel output size [169 - 173056]
const int bit_step = 256;
__m256i count_sum = _mm256_set1_epi8(0);
for (k = 0; k < K; k += bit_step) { // l.size*l.size*l.c - one filter size [27 - 9216]
__m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + 0)*lda + k) / 8));
__m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum);
}
int count = get_count_mula(count_sum);
const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
count = count - f1; // remove extra bits (from empty space for align only)
C[i*ldc + j] = (2 * count - K) * mean_val;
}
}
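Two things are going on in this rewrite. First, 2x2 register blocking: each pair of A rows and B columns is loaded once and feeds four accumulators (count_sum_0..3), roughly halving memory loads per multiply-accumulate; the trailing loops handle the odd remainders of M and N. Second, the epilogue decodes the popcount into a real dot product: each bit encodes +1 or -1, so over K positions dot = matches - mismatches = 2*count - K, and mean_val rescales the binarized filter back toward its real-valued magnitude (the XNOR-Net scaling factor). The f1 correction exists because both bit buffers are zero-padded past K, XNOR(0,0) = 1, and the padding bits would otherwise all count as matches. A worked example with illustrative numbers:

// K = 27 (a 3x3x3 binary filter), bit_step = 256
// f1  = 256 - (27 % 256) = 229 padding bits, all counted as matches
// raw popcount of the XNOR block = 250        (assumed value)
// matches    = 250 - 229 = 21;  mismatches = 27 - 21 = 6
// dot        = 21 - 6 = 2*21 - 27 = 15
// C[i*ldc+j] = 15 * mean_val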

src/network.c

@@ -201,7 +201,7 @@ void forward_network(network net, network_state state)
for(i = 0; i < net.n; ++i){
state.index = i;
layer l = net.layers[i];
if(l.delta){
if(l.delta && state.train){
scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
}
//double time = get_time_point();
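l.delta holds gradients and is only consumed by the backward pass, so zeroing it on every forward call is wasted memset traffic at inference time; gating the scal_cpu() call on state.train skips that work when no backward pass will follow.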

src/shortcut_layer.c

@@ -58,8 +58,17 @@ void resize_shortcut_layer(layer *l, int w, int h)
void forward_shortcut_layer(const layer l, network_state state)
{
if (l.w == l.out_w && l.h == l.out_h && l.c == l.out_c) {
int size = l.batch * l.w * l.h * l.c;
int i;
#pragma omp parallel for
for(i = 0; i < size; ++i)
l.output[i] = state.input[i] + state.net.layers[l.index].output[i];
}
else {
copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
}
activate_array(l.output, l.outputs*l.batch, l.activation);
}
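When the skip connection's shapes match exactly, the new fast path fuses what used to be two passes over the output (copy_cpu() writing the input, then shortcut_cpu() adding the earlier layer's output) into one parallel loop that reads each operand once. A sketch of the fused form (hypothetical helper, not from the commit):

void fused_residual_add(float *out, const float *in, const float *skip, int n)
{
    int i;
    #pragma omp parallel for        // element-wise adds are independent
    for (i = 0; i < n; ++i)
        out[i] = in[i] + skip[i];
}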