mirror of https://github.com/pjreddie/darknet.git
synced 2023-08-10 21:13:14 +03:00

Minor speedup on CPU

src/activations.c
@@ -102,10 +102,17 @@ void activate_array(float *x, const int n, const ACTIVATION a)
 {
     int i;
-    for (i = 0; i < n; ++i) {
-        x[i] = activate(x[i], a);
+    if (a == LINEAR) {}
+    else if (a == LEAKY) {
+        #pragma omp parallel for
+        for (i = 0; i < n; ++i) {
+            x[i] = leaky_activate(x[i]);
+        }
+    }
+    else if (a == LOGISTIC) {
+        #pragma omp parallel for
+        for (i = 0; i < n; ++i) {
+            x[i] = logistic_activate(x[i]);
+        }
+    }
+    else {
+        for (i = 0; i < n; ++i) {
+            x[i] = activate(x[i], a);
+        }
     }
 }
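Hoisting the dispatch out of the loop is what buys the speedup: the generic path calls activate(), which switches on the activation type for every element, while the new code branches once on `a` and leaves each hot activation with a tight loop the compiler can inline and OpenMP can split across threads. The pragmas only take effect when darknet is built with OpenMP support (e.g. OPENMP=1 in the Makefile, which adds -fopenmp); otherwise the loops simply run serially. A minimal sketch of the pattern, with leaky_activate written out as darknet defines it:

/* sketch, not from the commit: one branch on the activation type,
 * then a tight, vectorizable, OpenMP-splittable loop */
static inline float leaky_activate(float x) { return (x > 0) ? x : 0.1f * x; }

void activate_array_leaky(float *x, const int n)
{
    int i;
    #pragma omp parallel for
    for (i = 0; i < n; ++i) {
        x[i] = leaky_activate(x[i]);
    }
}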
src/blas.c
@@ -172,8 +172,13 @@ void scal_cpu(int N, float ALPHA, float *X, int INCX)
 void fill_cpu(int N, float ALPHA, float *X, int INCX)
 {
     int i;
-    for (i = 0; i < N; ++i) X[i*INCX] = ALPHA;
+    if (INCX == 1 && ALPHA == 0) {
+        memset(X, 0, N * sizeof(float));
+    }
+    else {
+        for (i = 0; i < N; ++i) X[i*INCX] = ALPHA;
+    }
 }
 
 void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
 {
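The memset shortcut is hit constantly, because forward_convolutional_layer zeroes l.output through fill_cpu on every forward pass (see the hunk below). It is valid only because IEEE-754 +0.0f is represented by all-zero bytes, so zero-filling the raw memory yields float zeros; for any other ALPHA the loop is still required. A quick check of that assumption:

#include <assert.h>
#include <string.h>

int main(void)
{
    float f = 1.0f;
    memset(&f, 0, sizeof(f));   /* all-zero bytes... */
    assert(f == 0.0f);          /* ...are +0.0f on IEEE-754 platforms (x86, ARM) */
    return 0;
}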
src/convolutional_layer.c
@@ -776,7 +776,12 @@ size_t binary_transpose_align_input(int k, int n, float *b, char **t_bit_input,
     size_t t_intput_size = new_ldb * bit_align;// n;
     size_t t_bit_input_size = t_intput_size / 8;// +1;
 
-    *t_bit_input = calloc(t_bit_input_size, sizeof(char));
+    static int last_t_bit_input_size = 0;
+    if (last_t_bit_input_size < t_bit_input_size) {
+        last_t_bit_input_size = t_bit_input_size;
+        *t_bit_input = realloc(*t_bit_input, last_t_bit_input_size * sizeof(char));
+    }
+    memset(*t_bit_input, 0, t_bit_input_size * sizeof(char));
     int src_size = k * bit_align;
 
     // b - [bit_align, k] - [l.bit_align, l.size*l.size*l.c] = src_size
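This realloc-and-grow replacement for a per-call calloc/free is the recurring pattern of the whole commit: scratch buffers become function-local statics that only ever grow, so repeated forward passes stop paying allocation cost. The trade-offs are that the memory is never freed and that static state makes the function non-reentrant, so it is not safe to call from multiple threads at once. A generic sketch of the pattern, with hypothetical names (error handling omitted, as in the source):

#include <stdlib.h>
#include <string.h>

/* hypothetical helper illustrating the grow-only scratch buffer */
static void *get_scratch(size_t size)
{
    static void *buf = NULL;
    static size_t cap = 0;
    if (cap < size) {
        cap = size;
        buf = realloc(buf, cap);   /* grows once, then is reused */
    }
    memset(buf, 0, size);          /* callers expect zeroed memory, as calloc gave */
    return buf;
}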
@@ -798,7 +803,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
 
     fill_cpu(l.outputs*l.batch, 0, l.output, 1);
 
-    if(l.xnor){
+    if (l.xnor && (!l.align_bit_weights || state.train)) {
         if (!l.align_bit_weights || state.train) {
             binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights);
             //printf("\n binarize_weights l.align_bit_weights = %p \n", l.align_bit_weights);
@@ -838,8 +843,26 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
 
             const int new_c = l.c / 32;
 
-            float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float));
-            uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));
+            static float *re_packed_input = NULL;
+            static int last_re_packed_input_size = 0;
+            int re_packed_input_size = l.c * l.w * l.h;
+            if (last_re_packed_input_size < re_packed_input_size) {
+                last_re_packed_input_size = re_packed_input_size;
+                re_packed_input = realloc(re_packed_input, last_re_packed_input_size * sizeof(float));
+            }
+            memset(re_packed_input, 0, re_packed_input_size * sizeof(float));
+
+            static uint32_t *bin_re_packed_input = NULL;
+            static int last_bin_re_packed_input_size = 0;
+            int in_re_packed_input_size = new_c * l.w * l.h + 1;
+            if (last_bin_re_packed_input_size < in_re_packed_input_size) {
+                last_bin_re_packed_input_size = in_re_packed_input_size;
+                bin_re_packed_input = realloc(bin_re_packed_input, last_bin_re_packed_input_size * sizeof(uint32_t));
+            }
+            memset(bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t));
+
+            //float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float));
+            //uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t));
 
             // float32x4 by channel (as in cuDNN)
             repack_input(state.input, re_packed_input, l.w, l.h, l.c);
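The repack_input call reorders the CHW input so that each group of 32 channels becomes the innermost dimension; that is what lets float_to_bit in the next hunk collapse 32 consecutive floats into one uint32_t per pixel. A sketch of the reordering (illustrative, not the library source; assumes c is a multiple of 32, which the l.c % 32 == 0 branch guarantees):

void repack_c32(const float *in, float *out, int w, int h, int c)
{
    const int items_per_channel = w * h;
    int chan, i, c_pack;
    for (chan = 0; chan < c; chan += 32) {
        for (i = 0; i < items_per_channel; ++i) {
            for (c_pack = 0; c_pack < 32; ++c_pack) {
                /* gather the same pixel from 32 consecutive channels */
                out[chan*items_per_channel + i*32 + c_pack] =
                    in[(chan + c_pack)*items_per_channel + i];
            }
        }
    }
}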
@@ -847,7 +870,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
 
             // 32 x floats -> 1 x uint32_t
             float_to_bit(re_packed_input, (char *)bin_re_packed_input, l.c * l.w * l.h);
 
-            free(re_packed_input);
+            //free(re_packed_input);
 
             // slow - convolution on the packed inputs and weights: float x 32 by channel (as in cuDNN)
             //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output,
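The float_to_bit step keeps only one bit per activation, its sign, which is what makes the XNOR convolution below possible. A sketch of the packing for a single 32-float group (a hypothetical helper; darknet's float_to_bit processes whole arrays):

#include <stdint.h>

uint32_t pack32_signs(const float *src)
{
    uint32_t bits = 0;
    int i;
    for (i = 0; i < 32; ++i) {
        if (src[i] > 0) bits |= (1u << i);   /* 1 for positive, 0 otherwise */
    }
    return bits;
}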
@@ -859,7 +882,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             im2col_cpu_custom((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);
             //im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b);
 
-            free(bin_re_packed_input);
+            //free(bin_re_packed_input);
 
             int new_k = l.size*l.size*l.c / 32;
 
@@ -876,7 +899,14 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             //size_t t_intput_size = new_ldb * l.bit_align;// n;
             //size_t t_bit_input_size = t_intput_size / 8;// +1;
 
-            char *t_bit_input = calloc(t_bit_input_size, sizeof(char));
+            //char *t_bit_input = calloc(t_bit_input_size, sizeof(char));
+            static char *t_bit_input = NULL;
+            static int last_t_bit_input_size = 0;
+            if (last_t_bit_input_size < t_bit_input_size) {
+                last_t_bit_input_size = t_bit_input_size;
+                t_bit_input = realloc(t_bit_input, last_t_bit_input_size * sizeof(char));
+            }
+            memset(t_bit_input, 0, t_bit_input_size * sizeof(char));
 
             transpose_uint32((uint32_t *)b, t_bit_input, new_k, n, n, new_ldb);
 
@@ -889,10 +919,11 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             //  t_bit_input, new_ldb / 32,
             //  c, n, l.mean_arr);
 
-            free(t_bit_input);
+            //free(t_bit_input);
 
         }
-        else { // else (l.c % 32 != 0)
+        else
+        { // else (l.c % 32 != 0)
 
             //--------------------------------------------------------
             //printf(" l.index = %d - old XNOR \n", l.index);
@@ -919,7 +950,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             //size_t ldb_align = 256; // 256 bit for AVX2
             int ldb_align = l.lda_align;
             size_t new_ldb = k + (ldb_align - k%ldb_align);
-            char *t_bit_input = NULL;
+            static char *t_bit_input = NULL;
             size_t t_intput_size = binary_transpose_align_input(k, n, b, &t_bit_input, ldb_align, l.bit_align);
             //char *t_bit_input = calloc(new_ldb * n, sizeof(char)); // for im2col_cpu_custom_transpose() only
             //float_to_bit(t_input, t_bit_input, new_ldb * n); // for im2col_cpu_custom_transpose() only
@@ -930,12 +961,18 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, bit_weights, k, t_bit_input, new_ldb, c, n, mean_arr);
 
             //free(t_input);
-            free(t_bit_input);
+            //free(t_bit_input);
             //}
         }
 
+        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
+
+        //activate_array(l.output, m*n*l.batch, l.activation);
+        activate_array_cpu_custom(l.output, m*n*l.batch, l.activation);
+        return;
+
     }
     else {
         //printf(" l.index = %d - FP32 \n", l.index);
src/gemm.c
@@ -1151,6 +1151,23 @@ static inline int popcnt256_custom(__m256i n) {
         + _mm256_extract_epi64(val, 3);
 }
 
+static inline void xnor_avx2_popcnt(__m256i a_bit256, __m256i b_bit256, __m256i *count_sum) {
+    __m256i c_bit256 = _mm256_set1_epi8(255);
+
+    __m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
+    c_bit256 = _mm256_andnot_si256(xor256, c_bit256); // can be optimized - we could NOT the weights once up front and skip this NOT here
+
+    *count_sum = _mm256_add_epi64(count256(c_bit256), *count_sum); // 1st part of Muła's popcount algorithm
+}
+
+// 2nd part of Muła's popcount algorithm
+static inline int get_count_mula(__m256i count_sum) {
+    return _mm256_extract_epi64(count_sum, 0)
+        + _mm256_extract_epi64(count_sum, 1)
+        + _mm256_extract_epi64(count_sum, 2)
+        + _mm256_extract_epi64(count_sum, 3);
+}
+
 // 5x faster than gemm()-float32
 // further optimizations: do mean-mult only for the last layer
 void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
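Behind xnor_avx2_popcnt is the standard binary-network identity: with +1/-1 values packed as 1/0 bits, each matching bit pair (XNOR = 1) contributes +1 to the dot product and each mismatch contributes -1, so dot = 2*popcount(xnor(a,b)) - K. That is exactly the (2 * count - K) * mean_val expression in the kernel below. A scalar reference version (a sketch using GCC/Clang's __builtin_popcountll; the AVX2 code computes the same thing 256 bits at a time with Muła's in-register popcount):

#include <stdint.h>

/* a, b: rows of K bits packed into 64-bit words */
int xnor_dot(const uint64_t *a, const uint64_t *b, int K)
{
    int count = 0, k;
    for (k = 0; k < K; k += 64) {
        uint64_t xnor = ~(a[k / 64] ^ b[k / 64]);
        if (K - k < 64)                        /* mask padding bits in the last word */
            xnor &= ~0ULL >> (64 - (K - k));
        count += __builtin_popcountll(xnor);
    }
    return 2 * count - K;                      /* matches (2 * count - K) in the GEMM */
}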
@@ -1168,45 +1185,101 @@ void gemm_nn_custom_bin_mean_transposed(int M, int N, int K, float ALPHA_UNUSED,
     }
 #endif
 
+    //#pragma omp parallel for
+    //for (i = 0; i < M; ++i)
     #pragma omp parallel for
-    for (i = 0; i < M; ++i)
+    for (i = 0; i < (M/2)*2; i += 2)
     { // l.n - filters [16 - 55 - 1024]
-        float mean_val = mean_arr[i];
+        float mean_val_0 = mean_arr[i + 0];
+        float mean_val_1 = mean_arr[i + 1];
         int j, k;
-        __m256i all_1 = _mm256_set1_epi8(255);
 
-        for (j = 0; j < N; ++j) { // out_h*out_w - one channel output size [169 - 173056]
-            int count = 0;
+        //for (j = 0; j < N; ++j)
+        for (j = 0; j < (N/2)*2; j += 2)
+        { // out_h*out_w - one channel output size [169 - 173056]
+            //int count = 0;
             const int bit_step = 256;
-            __m256i count_sum = _mm256_set1_epi8(0);
+            __m256i count_sum_0 = _mm256_set1_epi8(0);
+            __m256i count_sum_1 = _mm256_set1_epi8(0);
+            __m256i count_sum_2 = _mm256_set1_epi8(0);
+            __m256i count_sum_3 = _mm256_set1_epi8(0);
 
             for (k = 0; k < K; k += bit_step) { // l.size*l.size*l.c - one filter size [27 - 9216]
-                __m256i a_bit256 = _mm256_loadu_si256((__m256i *)(A + (i*lda + k) / 8));
-                __m256i b_bit256 = _mm256_loadu_si256((__m256i *)(B + (j*ldb + k) / 8));
-                __m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
-                __m256i c_bit256 = _mm256_andnot_si256(xor256, all_1); // can be optimized - we could NOT the weights once up front and skip this NOT here
-
-                count_sum = _mm256_add_epi64(count256(c_bit256), count_sum); // Muła's algorithm
+                __m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + 0)*lda + k) / 8));
+                __m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
+
+                __m256i a_bit256_1 = _mm256_loadu_si256((__m256i *)(A + ((i + 1)*lda + k) / 8));
+                __m256i b_bit256_1 = _mm256_loadu_si256((__m256i *)(B + ((j + 1)*ldb + k) / 8));
+
+                xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum_0);
+                xnor_avx2_popcnt(a_bit256_0, b_bit256_1, &count_sum_1);
+
+                xnor_avx2_popcnt(a_bit256_1, b_bit256_0, &count_sum_2);
+                xnor_avx2_popcnt(a_bit256_1, b_bit256_1, &count_sum_3);
 
                 //count += popcnt256(c_bit256);
 
                 //binary_int64_printf(c_bit64);
                 //printf(", count = %d \n\n", tmp_count);
             }
 
-            // count of 1 bits
-            //count = count_sum.m256i_i64[0] +
-            //    count_sum.m256i_i64[1] +
-            //    count_sum.m256i_i64[2] +
-            //    count_sum.m256i_i64[3];
-            count = _mm256_extract_epi64(count_sum, 0)
-                + _mm256_extract_epi64(count_sum, 1)
-                + _mm256_extract_epi64(count_sum, 2)
-                + _mm256_extract_epi64(count_sum, 3);
+            int count_0 = get_count_mula(count_sum_0);
+            int count_1 = get_count_mula(count_sum_1);
+            int count_2 = get_count_mula(count_sum_2);
+            int count_3 = get_count_mula(count_sum_3);
 
-            int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
-            count = count - f1; // remove extra bits (from empty space for align only)
-            C[i*ldc + j] = (2 * count - K) * mean_val;
+            const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+            count_0 = count_0 - f1; // remove extra bits (from empty space for align only)
+            count_1 = count_1 - f1;
+            count_2 = count_2 - f1;
+            count_3 = count_3 - f1;
+            C[i*ldc + (j + 0)] = (2 * count_0 - K) * mean_val_0;
+            C[i*ldc + (j + 1)] = (2 * count_1 - K) * mean_val_0;
+            C[(i + 1)*ldc + (j + 0)] = (2 * count_2 - K) * mean_val_1;
+            C[(i + 1)*ldc + (j + 1)] = (2 * count_3 - K) * mean_val_1;
         }
+
+        int i_d;
+        for (i_d = 0; i_d < 2; ++i_d)
+        {
+            float mean_val = mean_arr[i + i_d];
+            for (j = (N / 2) * 2; j < N; j += 1)
+            { // out_h*out_w - one channel output size [169 - 173056]
+                const int bit_step = 256;
+                __m256i count_sum = _mm256_set1_epi8(0);
+
+                for (k = 0; k < K; k += bit_step) { // l.size*l.size*l.c - one filter size [27 - 9216]
+                    __m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + i_d + 0)*lda + k) / 8));
+                    __m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
+                    xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum);
+                }
+                int count = get_count_mula(count_sum);
+                const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+                count = count - f1; // remove extra bits (from empty space for align only)
+                C[(i + i_d)*ldc + j] = (2 * count - K) * mean_val;
+            }
+        }
     }
+
+    for (i = (M / 2) * 2; i < M; i += 1)
+    {
+        float mean_val = mean_arr[i];
+        int j, k;
+        for (j = 0; j < N; j += 1)
+        { // out_h*out_w - one channel output size [169 - 173056]
+            const int bit_step = 256;
+            __m256i count_sum = _mm256_set1_epi8(0);
+
+            for (k = 0; k < K; k += bit_step) { // l.size*l.size*l.c - one filter size [27 - 9216]
+                __m256i a_bit256_0 = _mm256_loadu_si256((__m256i *)(A + ((i + 0)*lda + k) / 8));
+                __m256i b_bit256_0 = _mm256_loadu_si256((__m256i *)(B + ((j + 0)*ldb + k) / 8));
+                xnor_avx2_popcnt(a_bit256_0, b_bit256_0, &count_sum);
+            }
+            int count = get_count_mula(count_sum);
+            const int f1 = (K % bit_step == 0) ? 0 : (bit_step - (K % bit_step));
+            count = count - f1; // remove extra bits (from empty space for align only)
+            C[i*ldc + j] = (2 * count - K) * mean_val;
+        }
+    }
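The restructured loop is classic 2x2 register blocking: each pair of A rows and B rows is loaded once and fed into four accumulators, so every 256-bit load is reused twice and four outputs are produced per inner iteration; the leftover row and column loops handle odd M and N. The same idea in plain float C (a sketch, with B stored transposed as in the bit version):

void gemm_nt_2x2(int M, int N, int K, const float *A, int lda,
                 const float *B, int ldb, float *C, int ldc)
{
    int i, j, k;
    for (i = 0; i < (M/2)*2; i += 2) {
        for (j = 0; j < (N/2)*2; j += 2) {
            float c00 = 0, c01 = 0, c10 = 0, c11 = 0;
            for (k = 0; k < K; ++k) {
                float a0 = A[(i + 0)*lda + k], a1 = A[(i + 1)*lda + k];
                float b0 = B[(j + 0)*ldb + k], b1 = B[(j + 1)*ldb + k];
                c00 += a0 * b0;  c01 += a0 * b1;  /* each load feeds two products */
                c10 += a1 * b0;  c11 += a1 * b1;
            }
            C[(i + 0)*ldc + j + 0] = c00;  C[(i + 0)*ldc + j + 1] = c01;
            C[(i + 1)*ldc + j + 0] = c10;  C[(i + 1)*ldc + j + 1] = c11;
        }
    }
    /* remainder rows (M odd) and columns (N odd) would be handled with
     * plain 1x1 loops, exactly as the diff above does */
}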
src/network.c
@@ -201,7 +201,7 @@ void forward_network(network net, network_state state)
     for(i = 0; i < net.n; ++i){
         state.index = i;
         layer l = net.layers[i];
-        if(l.delta){
+        if(l.delta && state.train){
            scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
         }
         //double time = get_time_point();
src/shortcut_layer.c
@@ -58,8 +58,17 @@ void resize_shortcut_layer(layer *l, int w, int h)
 
 void forward_shortcut_layer(const layer l, network_state state)
 {
-    copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
-    shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
+    if (l.w == l.out_w && l.h == l.out_h && l.c == l.out_c) {
+        int size = l.batch * l.w * l.h * l.c;
+        int i;
+        #pragma omp parallel for
+        for(i = 0; i < size; ++i)
+            l.output[i] = state.input[i] + state.net.layers[l.index].output[i];
+    }
+    else {
+        copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
+        shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
+    }
     activate_array(l.output, l.outputs*l.batch, l.activation);
 }
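The new fast path exploits the fact that most shortcut (residual) connections join identically shaped tensors: when W, H and C all match, the general shape-adapting add inside shortcut_cpu degenerates to an element-wise sum, which the commit writes directly and parallelizes, skipping the copy_cpu pass entirely. For contrast, a simplified single-batch sketch of the general case (modelled on shortcut_cpu; illustrative, not the library source):

void shortcut_add(int w1, int h1, int c1, const float *add,
                  int w2, int h2, int c2, float *out)
{
    int stride = (w1 / w2 > 1) ? w1 / w2 : 1;   /* source larger: subsample it */
    int sample = (w2 / w1 > 1) ? w2 / w1 : 1;   /* source smaller: spread it */
    int minw = (w1 < w2) ? w1 : w2;
    int minh = (h1 < h2) ? h1 : h2;
    int minc = (c1 < c2) ? c1 : c2;
    int i, j, k;
    for (k = 0; k < minc; ++k)
        for (j = 0; j < minh; ++j)
            for (i = 0; i < minw; ++i) {
                int out_index = i*sample + w2*(j*sample + h2*k);
                int add_index = i*stride + w1*(j*stride + h1*k);
                out[out_index] += add[add_index];
                /* with equal shapes, stride == sample == 1: out[i] += add[i] */
            }
}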