From 4f72fcc015a7825cf0c3d3eee143c13853b1a2c3 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Fri, 10 May 2019 16:46:48 +0300
Subject: [PATCH] Added grouped convolution (depth-wise convolution)

---
 src/conv_lstm_layer.c        |  25 +--
 src/conv_lstm_layer.h        |   2 +-
 src/convolutional_kernels.cu | 150 +++++++------
 src/convolutional_layer.c    | 407 +++++++++++++++++++----------------
 src/convolutional_layer.h    |   2 +-
 src/crnn_layer.c             |   9 +-
 src/crnn_layer.h             |   2 +-
 src/detector.c               |  11 +-
 src/network.c                |   2 +-
 src/network_kernels.cu       |  18 +-
 src/parser.c                 |  23 +-
 src/yolo_layer.c             |   4 +
 12 files changed, 349 insertions(+), 306 deletions(-)

diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c
index 7d395ad0..1764cb90 100644
--- a/src/conv_lstm_layer.c
+++ b/src/conv_lstm_layer.c
@@ -32,7 +32,7 @@ static void increment_layer(layer *l, int steps)
 }
 
-layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor)
+layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int groups, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor)
 {
     fprintf(stderr, "CONV_LSTM Layer: %d x %d x %d image, %d filters\n", h, w, c, output_filters);
     /*
@@ -57,6 +57,7 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i
     l.h = h;
     l.w = w;
     l.c = c;
+    l.groups = groups;
     l.out_c = output_filters;
     l.inputs = h * w * c;
     l.xnor = xnor;
@@ -64,44 +65,44 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i
 
     // U
     l.uf = (layer*)malloc(sizeof(layer));
-    *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
+    *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
     l.uf->batch = batch;
     if (l.workspace_size < l.uf->workspace_size) l.workspace_size = l.uf->workspace_size;
 
     l.ui = (layer*)malloc(sizeof(layer));
-    *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
+    *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
     l.ui->batch = batch;
     if (l.workspace_size < l.ui->workspace_size) l.workspace_size = l.ui->workspace_size;
 
     l.ug = (layer*)malloc(sizeof(layer));
-    *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
+    *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
     l.ug->batch = batch;
     if (l.workspace_size < l.ug->workspace_size) l.workspace_size = l.ug->workspace_size;
 
     l.uo = (layer*)malloc(sizeof(layer));
-    *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
+    *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
     l.uo->batch = batch;
     if (l.workspace_size < l.uo->workspace_size) l.workspace_size = l.uo->workspace_size;
 
     // W
     l.wf = (layer*)malloc(sizeof(layer));
-    *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size,
stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.wf->batch = batch; if (l.workspace_size < l.wf->workspace_size) l.workspace_size = l.wf->workspace_size; l.wi = (layer*)malloc(sizeof(layer)); - *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.wi->batch = batch; if (l.workspace_size < l.wi->workspace_size) l.workspace_size = l.wi->workspace_size; l.wg = (layer*)malloc(sizeof(layer)); - *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.wg->batch = batch; if (l.workspace_size < l.wg->workspace_size) l.workspace_size = l.wg->workspace_size; l.wo = (layer*)malloc(sizeof(layer)); - *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.wo->batch = batch; if (l.workspace_size < l.wo->workspace_size) l.workspace_size = l.wo->workspace_size; @@ -109,21 +110,21 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i // V l.vf = (layer*)malloc(sizeof(layer)); if (l.peephole) { - *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.vf->batch = batch; if (l.workspace_size < l.vf->workspace_size) l.workspace_size = l.vf->workspace_size; } l.vi = (layer*)malloc(sizeof(layer)); if (l.peephole) { - *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.vi->batch = batch; if (l.workspace_size < l.vi->workspace_size) l.workspace_size = l.vi->workspace_size; } l.vo = (layer*)malloc(sizeof(layer)); if (l.peephole) { - *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.vo->batch = batch; if (l.workspace_size < l.vo->workspace_size) l.workspace_size = l.vo->workspace_size; } diff --git a/src/conv_lstm_layer.h b/src/conv_lstm_layer.h index 3bd9b4ef..56a57298 100644 --- a/src/conv_lstm_layer.h +++ b/src/conv_lstm_layer.h @@ -9,7 +9,7 @@ #ifdef __cplusplus extern "C" { #endif -layer make_conv_lstm_layer(int batch, int h, int w, int c, 
int output_filters, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor); +layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int groups, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor); void resize_conv_lstm_layer(layer *l, int w, int h); void free_state_conv_lstm(layer l); void randomize_state_conv_lstm(layer l); diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index cfa4fe7e..9185279f 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -166,20 +166,16 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) { //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); if(l.binary){ - binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu); + binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu); swap_binary(&l); } if(l.xnor){ if (!l.align_bit_weights_gpu || state.train) { - //binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu); + //binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu); - fast_binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu); + fast_binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu); } - //swap_binary(&l); - //binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu); - //state.input = l.binary_input_gpu; - //cudaDeviceSynchronize(); if (l.align_bit_weights_gpu && !state.train && l.c >= 32) { @@ -187,11 +183,15 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //cudaError_t status = cudaSuccess; //int input_size = l.c*l.h*l.w*l.batch; - int m = l.n; - int k = l.size*l.size*l.c; + int m = l.n / l.groups; + int k = l.size*l.size*l.c / l.groups; int n = l.out_w*l.out_h; //float * a = l.weights_gpu; + // int i, j; + // for(i = 0; i < l.batch; ++i){ + // for (j = 0; j < l.groups; ++j) { + int ldb_align = l.lda_align; size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; //size_t t_intput_size = new_ldb * n; @@ -551,22 +551,25 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) #else fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); - int i; - int m = l.n; - int k = l.size*l.size*l.c; + int i, j; + int m = l.n / l.groups; + int k = l.size*l.size*l.c / l.groups; int n = l.out_w*l.out_h; for(i = 0; i < l.batch; ++i){ - float *im = state.input + i*l.c*l.h*l.w; - float * a = l.weights_gpu; - float * b = state.workspace; - float * c = l.output_gpu; - if (l.size == 1) { - b = im; + for (j = 0; j < l.groups; ++j) { + //float *im = state.input + i*l.c*l.h*l.w; + float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; + float *a = l.weights_gpu + j*l.nweights / l.groups; + float *b = state.workspace; + float *c = l.output_gpu + (i*l.groups + j)*n*m; + if (l.size == 1) { + b = im; + } + else { + im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + } + gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n); } - else { - im2col_ongpu(im, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); - } - gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n); } if (l.batch_normalize) { @@ -782,32 +785,38 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state 
backward_batchnorm_layer_gpu(l, state); } - int m = l.n; - int n = l.size*l.size*l.c; + int m = l.n / l.groups; + int n = l.size*l.size*l.c / l.groups; int k = l.out_w*l.out_h; - int i; + int i, j; for(i = 0; i < l.batch; ++i){ - float * a = l.delta_gpu; - float * b = state.workspace; - float * c = l.weight_updates_gpu; + for (j = 0; j < l.groups; ++j) { + float * a = l.delta_gpu + (i*l.groups + j)*m*k; + float * b = state.workspace; + float * c = l.weight_updates_gpu + j*l.nweights / l.groups; - im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); - gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n); + float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; - if(state.delta){ - if(l.binary || l.xnor) swap_binary(&l); - float * a = l.weights_gpu; - float * b = l.delta_gpu; - float * c = state.workspace; + im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n); - gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k); + if (state.delta) { + if (l.binary || l.xnor) swap_binary(&l); + float * a = l.weights_gpu + j*l.nweights / l.groups; + float * b = l.delta_gpu + (i*l.groups + j)*m*k; + float * c = state.workspace; - col2im_ongpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta + i*l.c*l.h*l.w); - if(l.binary || l.xnor) { - swap_binary(&l); + gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k); + + float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w; + + col2im_ongpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, delta); + if (l.binary || l.xnor) { + swap_binary(&l); + } + if (l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w); } - if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w); } } #endif @@ -821,43 +830,43 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state } } -void pull_convolutional_layer(convolutional_layer layer) +void pull_convolutional_layer(convolutional_layer l) { - cuda_pull_array_async(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size); - cuda_pull_array_async(layer.biases_gpu, layer.biases, layer.n); - cuda_pull_array_async(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size); - cuda_pull_array_async(layer.bias_updates_gpu, layer.bias_updates, layer.n); - if (layer.batch_normalize){ - cuda_pull_array_async(layer.scales_gpu, layer.scales, layer.n); - cuda_pull_array_async(layer.rolling_mean_gpu, layer.rolling_mean, layer.n); - cuda_pull_array_async(layer.rolling_variance_gpu, layer.rolling_variance, layer.n); + cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights); + cuda_pull_array_async(l.biases_gpu, l.biases, l.n); + cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights); + cuda_pull_array_async(l.bias_updates_gpu, l.bias_updates, l.n); + if (l.batch_normalize){ + cuda_pull_array_async(l.scales_gpu, l.scales, l.n); + cuda_pull_array_async(l.rolling_mean_gpu, l.rolling_mean, l.n); + cuda_pull_array_async(l.rolling_variance_gpu, l.rolling_variance, l.n); } - if (layer.adam){ - cuda_pull_array_async(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size); - cuda_pull_array_async(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size); + if (l.adam){ + cuda_pull_array_async(l.m_gpu, l.m, l.nweights); + cuda_pull_array_async(l.v_gpu, l.v, 
l.nweights); } CHECK_CUDA(cudaPeekAtLastError()); cudaStreamSynchronize(get_cuda_stream()); } -void push_convolutional_layer(convolutional_layer layer) +void push_convolutional_layer(convolutional_layer l) { - cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size); + cuda_push_array(l.weights_gpu, l.weights, l.nweights); #ifdef CUDNN_HALF - assert((layer.c*layer.n*layer.size*layer.size) > 0); - cuda_convert_f32_to_f16(layer.weights_gpu, layer.c*layer.n*layer.size*layer.size, layer.weights_gpu16); + assert(l.nweights > 0); + cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16); #endif - cuda_push_array(layer.biases_gpu, layer.biases, layer.n); - cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size); - cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n); - if (layer.batch_normalize){ - cuda_push_array(layer.scales_gpu, layer.scales, layer.n); - cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n); - cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n); + cuda_push_array(l.biases_gpu, l.biases, l.n); + cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights); + cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n); + if (l.batch_normalize){ + cuda_push_array(l.scales_gpu, l.scales, l.n); + cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n); + cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n); } - if (layer.adam){ - cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size); - cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size); + if (l.adam){ + cuda_push_array(l.m_gpu, l.m, l.nweights); + cuda_push_array(l.v_gpu, l.v, l.nweights); } CHECK_CUDA(cudaPeekAtLastError()); } @@ -868,11 +877,10 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init //float momentum = a.momentum; //float decay = a.decay; //int batch = a.batch; - int size = l.size*l.size*l.c*l.n; // old if (l.adam) { //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t); - adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch, l.t); + adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t); adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); if (l.scales_gpu) { @@ -883,9 +891,9 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); //axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); //scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); - axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_ongpu(size, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_ongpu(size, momentum, l.weight_updates_gpu, 1); + axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); axpy_ongpu(l.n, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1); 
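
Note on the indexing used throughout the GPU paths above: each group j only ever touches its own slice of the buffers, which is what the offsets weights + j*l.nweights/l.groups, input + (i*l.groups + j)*(l.c/l.groups)*l.h*l.w and output + (i*l.groups + j)*n*m express. The following stand-alone sketch is not darknet code; it replaces the patch's im2col+GEMM path with a naive direct convolution (NCHW layout, stride 1, zero padding, toy sizes) purely to show the same channel/weight partitioning.

/* Sketch: grouped convolution partitioning as in the patch, with
 * nweights = (c/groups)*n*size*size, group j seeing input channels
 * [j*c/groups, (j+1)*c/groups) and producing filters [j*n/groups, (j+1)*n/groups). */
#include <stdio.h>
#include <stdlib.h>

static void grouped_conv_naive(const float *in, const float *w, float *out,
                               int c, int h, int wd, int n, int size, int groups)
{
    int pad = size / 2;
    int c_g = c / groups;      /* input channels per group  */
    int n_g = n / groups;      /* output filters per group  */
    int g, f, y, x, k, ky, kx;
    for (g = 0; g < groups; ++g) {
        for (f = 0; f < n_g; ++f) {
            int out_f = g*n_g + f;                 /* filter index in the whole layer */
            for (y = 0; y < h; ++y) for (x = 0; x < wd; ++x) {
                float sum = 0;
                for (k = 0; k < c_g; ++k) {
                    int in_c = g*c_g + k;          /* group g only reads its own channel slice */
                    for (ky = 0; ky < size; ++ky) for (kx = 0; kx < size; ++kx) {
                        int iy = y + ky - pad, ix = x + kx - pad;
                        if (iy < 0 || iy >= h || ix < 0 || ix >= wd) continue;
                        /* weight offset for group g starts at g*nweights/groups */
                        float wv = w[((out_f*c_g + k)*size + ky)*size + kx];
                        sum += wv * in[(in_c*h + iy)*wd + ix];
                    }
                }
                out[(out_f*h + y)*wd + x] = sum;
            }
        }
    }
}

int main(void)
{
    /* toy sizes: 4 input channels, 4 filters, groups=4 => depth-wise */
    int c = 4, h = 5, wd = 5, n = 4, size = 3, groups = 4;
    int nweights = (c/groups) * n * size * size;   /* matches l.nweights in the patch */
    int i;
    float *in  = calloc(c*h*wd, sizeof(float));
    float *w   = calloc(nweights, sizeof(float));
    float *out = calloc(n*h*wd, sizeof(float));
    for (i = 0; i < c*h*wd; ++i) in[i] = 1.0f;
    for (i = 0; i < nweights; ++i) w[i] = 0.1f;
    grouped_conv_naive(in, w, out, c, h, wd, n, size, groups);
    printf("nweights = %d, out[0] = %f\n", nweights, out[0]);
    free(in); free(w); free(out);
    return 0;
}

Setting groups equal to the number of input channels makes this a depth-wise convolution; since nweights = (c/groups)*n*size*size, both the weight count and the BFLOPs estimate used later in the patch (2*nweights*out_h*out_w) shrink by a factor of groups relative to a standard convolution.
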
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 16cb6b93..f9c0ee01 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -140,7 +140,7 @@ size_t get_workspace_size32(layer l){ if (workspace_size < re_packed_input_size) workspace_size = re_packed_input_size; return workspace_size; } - return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float); + return (size_t)l.out_h*l.out_w*l.size*l.size*(l.c / l.groups)*sizeof(float); } size_t get_workspace_size16(layer l) { @@ -231,9 +231,14 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) // 3. FP32 Master Copy of Weights // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH)); + CHECK_CUDNN(cudnnSetConvolutionGroupCount(l->convDesc, l->groups)); #if((CUDNN_MAJOR*10 + CUDNN_MINOR) >= 72) // cuDNN >= 7.2 CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); #endif +#else //if(CUDNN_MAJOR >= 7) + if (l->groups > 1) { + error("CUDNN < 7 doesn't support groups, please upgrade!"); + } #endif // INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported @@ -243,23 +248,23 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) // backward delta CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w)); CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w)); - CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size)); + CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size)); // forward CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w)); CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w)); - CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size)); + CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size)); //#ifdef CUDNN_HALF // backward delta CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w)); CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w)); - CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size)); + CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size)); // forward CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w)); CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w)); - CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size)); + CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size)); // batch norm CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w)); @@ -326,17 +331,21 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) #endif #endif -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index) +convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index) { int total_batch = batch*steps; int i; convolutional_layer l = { (LAYER_TYPE)0 }; l.type = CONVOLUTIONAL; + if (xnor) groups = 1; // disable groups for XNOR-net + if (groups < 1) groups = 1; + l.index = index; l.h = h; l.w = w; l.c = c; + l.groups = groups; l.n = n; l.binary = binary; l.xnor = xnor; @@ -348,17 +357,17 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.pad = padding; l.batch_normalize = batch_normalize; l.learning_rate_scale = 1; - l.nweights = l.c*l.n*l.size*l.size; + l.nweights = (c / groups) * n * size * size; - l.weights = (float*)calloc(c * n * size * size, sizeof(float)); - l.weight_updates = (float*)calloc(c * n * size * size, sizeof(float)); + l.weights = (float*)calloc(l.nweights, sizeof(float)); + l.weight_updates = (float*)calloc(l.nweights, sizeof(float)); l.biases = (float*)calloc(n, sizeof(float)); l.bias_updates = (float*)calloc(n, sizeof(float)); // float scale = 1./sqrt(size*size*c); - float scale = sqrt(2./(size*size*c)); - for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1); + float scale = sqrt(2./(size*size*c/groups)); + for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_uniform(-1, 1); // rand_normal(); int out_h = convolutional_out_height(l); int out_w = convolutional_out_width(l); l.out_h = out_h; @@ -375,12 +384,12 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.backward = backward_convolutional_layer; l.update = update_convolutional_layer; if(binary){ - l.binary_weights = (float*)calloc(c * n * size * size, sizeof(float)); - l.cweights = (char*)calloc(c * n * size * size, sizeof(char)); + l.binary_weights = (float*)calloc(l.nweights, sizeof(float)); + l.cweights = (char*)calloc(l.nweights, sizeof(char)); l.scales = (float*)calloc(n, sizeof(float)); } if(xnor){ - l.binary_weights = (float*)calloc(c * n * size * size, sizeof(float)); + l.binary_weights = (float*)calloc(l.nweights, sizeof(float)); l.binary_input = (float*)calloc(l.inputs * l.batch, sizeof(float)); int align = 32;// 8; @@ -420,8 +429,8 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, } if(adam){ l.adam = 1; - l.m = (float*)calloc(c * n * size * size, sizeof(float)); - l.v = (float*)calloc(c * n * size * size, sizeof(float)); + l.m = (float*)calloc(l.nweights, sizeof(float)); + l.v = (float*)calloc(l.nweights, sizeof(float)); l.bias_m = (float*)calloc(n, sizeof(float)); l.scale_m = (float*)calloc(n, sizeof(float)); l.bias_v = (float*)calloc(n, sizeof(float)); @@ -435,19 +444,19 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, if(gpu_index >= 0){ if (adam) { - l.m_gpu = cuda_make_array(l.m, c*n*size*size); - l.v_gpu = cuda_make_array(l.v, c*n*size*size); + l.m_gpu = cuda_make_array(l.m, l.nweights); + l.v_gpu = cuda_make_array(l.v, l.nweights); l.bias_m_gpu = 
cuda_make_array(l.bias_m, n); l.bias_v_gpu = cuda_make_array(l.bias_v, n); l.scale_m_gpu = cuda_make_array(l.scale_m, n); l.scale_v_gpu = cuda_make_array(l.scale_v, n); } - l.weights_gpu = cuda_make_array(l.weights, c*n*size*size); - l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size); + l.weights_gpu = cuda_make_array(l.weights, l.nweights); + l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights); #ifdef CUDNN_HALF - l.weights_gpu16 = cuda_make_array(NULL, c*n*size*size / 2 + 1); //cuda_make_array(l.weights, c*n*size*size / 2); - l.weight_updates_gpu16 = cuda_make_array(NULL, c*n*size*size / 2 + 1); //cuda_make_array(l.weight_updates, c*n*size*size / 2); + l.weights_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1); + l.weight_updates_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1); #endif l.biases_gpu = cuda_make_array(l.biases, n); @@ -457,10 +466,10 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.delta_gpu = cuda_make_array(l.delta, total_batch*out_h*out_w*n); if(binary){ - l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size); + l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights); } if(xnor){ - l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size); + l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights); l.mean_arr_gpu = cuda_make_array(0, l.n); l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch); } @@ -490,7 +499,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.workspace_size = get_convolutional_workspace_size(l); //fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c); - l.bflops = (2.0 * l.n * l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.; + l.bflops = (2.0 * l.nweights * l.out_h*l.out_w) / 1000000000.; if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB"); else if (l.xnor) fprintf(stderr, "convX "); else fprintf(stderr, "conv "); @@ -504,8 +513,8 @@ void denormalize_convolutional_layer(convolutional_layer l) int i, j; for(i = 0; i < l.n; ++i){ float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001); - for(j = 0; j < l.c*l.size*l.size; ++j){ - l.weights[i*l.c*l.size*l.size + j] *= scale; + for(j = 0; j < l.nweights; ++j){ + l.weights[i*l.nweights + j] *= scale; } l.biases[i] -= l.rolling_mean[i] * scale; l.scales[i] = 1; @@ -516,7 +525,7 @@ void denormalize_convolutional_layer(convolutional_layer l) void test_convolutional_layer() { - convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0, 0); + convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0, 0); l.batch_normalize = 1; float data[] = {1,1,1,1,1, 1,1,1,1,1, @@ -691,8 +700,8 @@ void bit_to_float(unsigned char *src, float *dst, size_t size, size_t filters, f void binary_align_weights(convolutional_layer *l) { - int m = l->n; - int k = l->size*l->size*l->c; + int m = l->n; // (l->n / l->groups) + int k = l->size*l->size*l->c; // ->size*l->size*(l->c / l->groups) size_t new_lda = k + (l->lda_align - k % l->lda_align); // (k / 8 + 1) * 8; l->new_lda = new_lda; @@ -823,13 +832,13 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) { int out_h = convolutional_out_height(l); int out_w = convolutional_out_width(l); - int i; + int i, j; fill_cpu(l.outputs*l.batch, 0, l.output, 1); if (l.xnor && (!l.align_bit_weights || state.train)) { if (!l.align_bit_weights || 
state.train) { - binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights); + binarize_weights(l.weights, l.n, l.nweights, l.binary_weights); //printf("\n binarize_weights l.align_bit_weights = %p \n", l.align_bit_weights); } swap_binary(&l); @@ -837,147 +846,150 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) state.input = l.binary_input; } - int m = l.n; - int k = l.size*l.size*l.c; + int m = l.n / l.groups; + int k = l.size*l.size*l.c / l.groups; int n = out_h*out_w; - float *a = l.weights; - float *b = state.workspace; - float *c = l.output; - static int u = 0; u++; for(i = 0; i < l.batch; ++i){ + for (j = 0; j < l.groups; ++j) { - //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); - //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n); - if (l.xnor && l.align_bit_weights && !state.train) - { - memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float)); + float *a = l.weights + j*l.nweights / l.groups; + float *b = state.workspace; + float *c = l.output + (i*l.groups + j)*n*m; - if(l.c % 32 == 0) + //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); + //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n); + if (l.xnor && l.align_bit_weights && !state.train) { - //printf(" l.index = %d - new XNOR \n", l.index); + memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float)); - int ldb_align = l.lda_align; - size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; - //size_t t_intput_size = new_ldb * l.bit_align;// n; - //size_t t_bit_input_size = t_intput_size / 8;// +1; - - int re_packed_input_size = l.c * l.w * l.h; - memset(state.workspace, 0, re_packed_input_size * sizeof(float)); - - const size_t new_c = l.c / 32; - size_t in_re_packed_input_size = new_c * l.w * l.h + 1; - memset(l.bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t)); - - //float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float)); - //uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t)); - - // float32x4 by channel (as in cuDNN) - repack_input(state.input, state.workspace, l.w, l.h, l.c); - - // 32 x floats -> 1 x uint32_t - float_to_bit(state.workspace, (unsigned char *)l.bin_re_packed_input, l.c * l.w * l.h); - - //free(re_packed_input); - - // slow - convolution the packed inputs and weights: float x 32 by channel (as in cuDNN) - //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output, - // l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); - - // // then exit from if() - - - im2col_cpu_custom((float *)l.bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); - //im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b); - - //free(bin_re_packed_input); - - int new_k = l.size*l.size*l.c / 32; - - // good for (l.c == 64) - //gemm_nn_bin_32bit_packed(m, n, new_k, 1, - // l.align_bit_weights, l.new_lda/32, - // b, n, - // c, n, l.mean_arr); - -// // then exit from if() - - transpose_uint32((uint32_t *)state.workspace, (uint32_t*)l.t_bit_input, new_k, n, n, new_ldb); - - // the main GEMM function - gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr); - - // // alternative GEMM - //gemm_nn_bin_transposed_32bit_packed(m, n, new_k, 1, - // l.align_bit_weights, l.new_lda/32, - // t_bit_input, new_ldb / 32, - // c, n, l.mean_arr); - - //free(t_bit_input); - - } - else - { // else (l.c % 32 != 0) - - //-------------------------------------------------------- - 
//printf(" l.index = %d - old XNOR \n", l.index); - - //im2col_cpu_custom_align(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b, l.bit_align); - im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align); - - //size_t output_size = l.outputs; - //float *count_output = calloc(output_size, sizeof(float)); - //size_t bit_output_size = output_size / 8 + 1; - //char *bit_output = calloc(bit_output_size, sizeof(char)); - - //size_t intput_size = n * k; // (out_h*out_w) X (l.size*l.size*l.c) : after im2col() - //size_t bit_input_size = intput_size / 8 + 1; - //char *bit_input = calloc(bit_input_size, sizeof(char)); - - //size_t weights_size = k * m; //l.size*l.size*l.c*l.n; - //size_t bit_weights_size = weights_size / 8 + 1; - - //char *bit_weights = calloc(bit_weights_size, sizeof(char)); - //float *mean_arr = calloc(l.n, sizeof(float)); - - // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits) + if (l.c % 32 == 0) { - //size_t ldb_align = 256; // 256 bit for AVX2 - int ldb_align = l.lda_align; - size_t new_ldb = k + (ldb_align - k%ldb_align); - size_t t_intput_size = binary_transpose_align_input(k, n, state.workspace, &l.t_bit_input, ldb_align, l.bit_align); + //printf(" l.index = %d - new XNOR \n", l.index); - // 5x times faster than gemm()-float32 + int ldb_align = l.lda_align; + size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; + //size_t t_intput_size = new_ldb * l.bit_align;// n; + //size_t t_bit_input_size = t_intput_size / 8;// +1; + + int re_packed_input_size = l.c * l.w * l.h; + memset(state.workspace, 0, re_packed_input_size * sizeof(float)); + + const size_t new_c = l.c / 32; + size_t in_re_packed_input_size = new_c * l.w * l.h + 1; + memset(l.bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t)); + + //float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float)); + //uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t)); + + // float32x4 by channel (as in cuDNN) + repack_input(state.input, state.workspace, l.w, l.h, l.c); + + // 32 x floats -> 1 x uint32_t + float_to_bit(state.workspace, (unsigned char *)l.bin_re_packed_input, l.c * l.w * l.h); + + //free(re_packed_input); + + // slow - convolution the packed inputs and weights: float x 32 by channel (as in cuDNN) + //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output, + // l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); + + // // then exit from if() + + + im2col_cpu_custom((float *)l.bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + //im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b); + + //free(bin_re_packed_input); + + int new_k = l.size*l.size*l.c / 32; + + // good for (l.c == 64) + //gemm_nn_bin_32bit_packed(m, n, new_k, 1, + // l.align_bit_weights, l.new_lda/32, + // b, n, + // c, n, l.mean_arr); + + // // then exit from if() + + transpose_uint32((uint32_t *)state.workspace, (uint32_t*)l.t_bit_input, new_k, n, n, new_ldb); + + // the main GEMM function gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr); - //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, bit_weights, k, t_bit_input, new_ldb, c, n, mean_arr); + // // alternative GEMM + //gemm_nn_bin_transposed_32bit_packed(m, n, new_k, 1, + // l.align_bit_weights, l.new_lda/32, + // t_bit_input, new_ldb 
/ 32, + // c, n, l.mean_arr); - //free(t_input); //free(t_bit_input); - //} + + } + else + { // else (l.c % 32 != 0) + + //-------------------------------------------------------- + //printf(" l.index = %d - old XNOR \n", l.index); + + //im2col_cpu_custom_align(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b, l.bit_align); + im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align); + + //size_t output_size = l.outputs; + //float *count_output = calloc(output_size, sizeof(float)); + //size_t bit_output_size = output_size / 8 + 1; + //char *bit_output = calloc(bit_output_size, sizeof(char)); + + //size_t intput_size = n * k; // (out_h*out_w) X (l.size*l.size*l.c) : after im2col() + //size_t bit_input_size = intput_size / 8 + 1; + //char *bit_input = calloc(bit_input_size, sizeof(char)); + + //size_t weights_size = k * m; //l.size*l.size*l.c*l.n; + //size_t bit_weights_size = weights_size / 8 + 1; + + //char *bit_weights = calloc(bit_weights_size, sizeof(char)); + //float *mean_arr = calloc(l.n, sizeof(float)); + + // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits) + { + //size_t ldb_align = 256; // 256 bit for AVX2 + int ldb_align = l.lda_align; + size_t new_ldb = k + (ldb_align - k%ldb_align); + size_t t_intput_size = binary_transpose_align_input(k, n, state.workspace, &l.t_bit_input, ldb_align, l.bit_align); + + // 5x times faster than gemm()-float32 + gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr); + + //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, bit_weights, k, t_bit_input, new_ldb, c, n, mean_arr); + + //free(t_input); + //free(t_bit_input); + //} + } + } + add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); + + //activate_array(l.output, m*n*l.batch, l.activation); + activate_array_cpu_custom(l.output, m*n*l.batch, l.activation); + return; + } + else { + //printf(" l.index = %d - FP32 \n", l.index); + im2col_cpu(state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w, + l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b); - add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); - - //activate_array(l.output, m*n*l.batch, l.activation); - activate_array_cpu_custom(l.output, m*n*l.batch, l.activation); - return; - + gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n); + // bit-count to float + } + c += n*m; + state.input += l.c*l.h*l.w; } - else { - //printf(" l.index = %d - FP32 \n", l.index); - im2col_cpu_custom(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b); - - gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n); - // bit-count to float - } - c += n*m; - state.input += l.c*l.h*l.w; } if(l.batch_normalize){ @@ -986,63 +998,72 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); //activate_array(l.output, m*n*l.batch, l.activation); - activate_array_cpu_custom(l.output, m*n*l.batch, l.activation); + activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation); if(l.binary || l.xnor) swap_binary(&l); } + void backward_convolutional_layer(convolutional_layer l, network_state state) { - int i; - int m = l.n; - int n = l.size*l.size*l.c; - int k = convolutional_out_height(l)* - convolutional_out_width(l); + int i, j; + int m = l.n / l.groups; + int n = l.size*l.size*l.c / l.groups; + int k = l.out_w*l.out_h; - gradient_array(l.output, m*k*l.batch, l.activation, l.delta); - 
backward_bias(l.bias_updates, l.delta, l.batch, l.n, k); + gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); - if(l.batch_normalize){ + if (l.batch_normalize) { backward_batchnorm_layer(l, state); } + else { + backward_bias(l.bias_updates, l.delta, l.batch, l.n, k); + } - for(i = 0; i < l.batch; ++i){ - float *a = l.delta + i*m*k; - float *b = state.workspace; - float *c = l.weight_updates; + for (i = 0; i < l.batch; ++i) { + for (j = 0; j < l.groups; ++j) { + float *a = l.delta + (i*l.groups + j)*m*k; + float *b = state.workspace; + float *c = l.weight_updates + j*l.nweights / l.groups; - float *im = state.input+i*l.c*l.h*l.w; + float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; - im2col_cpu(im, l.c, l.h, l.w, + im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b); - gemm(0,1,m,n,k,1,a,k,b,k,1,c,n); + gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n); - if(state.delta){ - a = l.weights; - b = l.delta + i*m*k; - c = state.workspace; + if (state.delta) { + a = l.weights + j*l.nweights / l.groups; + b = l.delta + (i*l.groups + j)*m*k; + c = state.workspace; - gemm(1,0,n,k,m,1,a,n,b,k,0,c,k); + gemm(1, 0, n, k, m, 1, a, n, b, k, 0, c, k); - col2im_cpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w); + col2im_cpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, + l.pad, state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w); + } } } } -void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay) +void update_convolutional_layer(convolutional_layer l, update_args a) { - int size = l.size*l.size*l.c*l.n; - axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1); + float learning_rate = a.learning_rate*l.learning_rate_scale; + float momentum = a.momentum; + float decay = a.decay; + int batch = a.batch; + + axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1); scal_cpu(l.n, momentum, l.bias_updates, 1); - if(l.scales){ - axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1); + if (l.scales) { + axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1); scal_cpu(l.n, momentum, l.scale_updates, 1); } - axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1); - axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1); - scal_cpu(size, momentum, l.weight_updates, 1); + axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1); + axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1); + scal_cpu(l.nweights, momentum, l.weight_updates, 1); } @@ -1050,14 +1071,14 @@ image get_convolutional_weight(convolutional_layer l, int i) { int h = l.size; int w = l.size; - int c = l.c; - return float_to_image(w,h,c,l.weights+i*h*w*c); + int c = l.c / l.groups; + return float_to_image(w, h, c, l.weights + i*h*w*c); } void rgbgr_weights(convolutional_layer l) { int i; - for(i = 0; i < l.n; ++i){ + for (i = 0; i < l.n; ++i) { image im = get_convolutional_weight(l, i); if (im.c == 3) { rgbgr_image(im); @@ -1068,7 +1089,7 @@ void rgbgr_weights(convolutional_layer l) void rescale_weights(convolutional_layer l, float scale, float trans) { int i; - for(i = 0; i < l.n; ++i){ + for (i = 0; i < l.n; ++i) { image im = get_convolutional_weight(l, i); if (im.c == 3) { scale_image(im, scale); @@ -1080,12 +1101,18 @@ void rescale_weights(convolutional_layer l, float scale, float trans) image *get_weights(convolutional_layer l) { - image* weights = (image*)calloc(l.n, 
sizeof(image)); + image *weights = calloc(l.n, sizeof(image)); int i; - for(i = 0; i < l.n; ++i){ + for (i = 0; i < l.n; ++i) { weights[i] = copy_image(get_convolutional_weight(l, i)); - //normalize_image(weights[i]); + normalize_image(weights[i]); + /* + char buff[256]; + sprintf(buff, "filter%d", i); + save_image(weights[i], buff); + */ } + //error("hey"); return weights; } @@ -1102,4 +1129,4 @@ image *visualize_convolutional_layer(convolutional_layer l, char *window, image //save_image(dc, buff); free_image(dc); return single_weights; -} +} \ No newline at end of file diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index da7b8feb..dc00dabf 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -30,7 +30,7 @@ void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16); #endif size_t get_convolutional_workspace_size(layer l); -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index); +convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index); void denormalize_convolutional_layer(convolutional_layer l); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); void forward_convolutional_layer(const convolutional_layer layer, network_state state); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 80a0c7f8..f466508e 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps) #endif } -layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int xnor) +layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int xnor) { fprintf(stderr, "CRNN Layer: %d x %d x %d image, %d filters\n", h,w,c,output_filters); batch = batch / steps; @@ -40,6 +40,7 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.h = h; l.w = w; l.c = c; + l.groups = groups; l.out_c = output_filters; l.inputs = h * w * c; l.hidden = h * w * hidden_filters; @@ -48,17 +49,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.state = (float*)calloc(l.hidden * l.batch * (l.steps + 1), sizeof(float)); l.input_layer = (layer*)malloc(sizeof(layer)); - *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.input_layer->batch = batch; if (l.workspace_size < l.input_layer->workspace_size) l.workspace_size = l.input_layer->workspace_size; l.self_layer = (layer*)malloc(sizeof(layer)); - *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, pad, 
activation, batch_normalize, 0, xnor, 0, 0, 0); l.self_layer->batch = batch; if (l.workspace_size < l.self_layer->workspace_size) l.workspace_size = l.self_layer->workspace_size; l.output_layer = (layer*)malloc(sizeof(layer)); - *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.output_layer->batch = batch; if (l.workspace_size < l.output_layer->workspace_size) l.workspace_size = l.output_layer->workspace_size; diff --git a/src/crnn_layer.h b/src/crnn_layer.h index 58944182..55feb599 100644 --- a/src/crnn_layer.h +++ b/src/crnn_layer.h @@ -9,7 +9,7 @@ #ifdef __cplusplus extern "C" { #endif -layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int xnor); +layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int xnor); void resize_crnn_layer(layer *l, int w, int h); void free_state_crnn(layer l); diff --git a/src/detector.c b/src/detector.c index a34354f1..4fc74585 100644 --- a/src/detector.c +++ b/src/detector.c @@ -42,19 +42,18 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i cuda_set_device(gpus[0]); printf(" Prepare additional network for mAP calculation...\n"); net_map = parse_network_cfg_custom(cfgfile, 1, 1); + const int net_classes = net_map.layers[net_map.n - 1].classes; int k; // free memory unnecessary arrays - for (k = 0; k < net_map.n; ++k) { - free_layer(net_map.layers[k]); - } + for (k = 0; k < net_map.n - 1; ++k) free_layer(net_map.layers[k]); char *name_list = option_find_str(options, "names", "data/names.list"); int names_size = 0; char **names = get_labels_custom(name_list, &names_size); - if (net_map.layers[net_map.n - 1].classes != names_size) { + if (net_classes != names_size) { printf(" Error: in the file %s number of names %d that isn't equal to classes=%d in the file %s \n", - name_list, names_size, net_map.layers[net_map.n - 1].classes, cfgfile); - if (net_map.layers[net_map.n - 1].classes > names_size) getchar(); + name_list, names_size, net_classes, cfgfile); + if (net_classes > names_size) getchar(); } } diff --git a/src/network.c b/src/network.c index b25aa639..32f2b9e2 100644 --- a/src/network.c +++ b/src/network.c @@ -997,7 +997,7 @@ void fuse_conv_batchnorm(network net) { l->biases[f] = l->biases[f] - (double)l->scales[f] * l->rolling_mean[f] / (sqrt((double)l->rolling_variance[f]) + .000001f); - const size_t filter_size = l->size*l->size*l->c; + const size_t filter_size = l->size*l->size*l->c / l->groups; int i; for (i = 0; i < filter_size; ++i) { int w_index = f*filter_size + i; diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 6e50df0b..40f71eb0 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -237,7 +237,7 @@ void pull_updates(layer l) { if(l.type == CONVOLUTIONAL){ cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n); - cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.n*l.size*l.size*l.c); + cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights); if(l.scale_updates) cuda_pull_array(l.scale_updates_gpu, 
l.scale_updates, l.n); } else if(l.type == CONNECTED){ cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs); @@ -249,7 +249,7 @@ void push_updates(layer l) { if(l.type == CONVOLUTIONAL){ cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n); - cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.n*l.size*l.size*l.c); + cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights); if(l.scale_updates) cuda_push_array(l.scale_updates_gpu, l.scale_updates, l.n); } else if(l.type == CONNECTED){ cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs); @@ -271,7 +271,7 @@ void merge_weights(layer l, layer base) { if (l.type == CONVOLUTIONAL) { axpy_cpu(l.n, 1, l.biases, 1, base.biases, 1); - axpy_cpu(l.n*l.size*l.size*l.c, 1, l.weights, 1, base.weights, 1); + axpy_cpu(l.nweights, 1, l.weights, 1, base.weights, 1); if (l.scales) { axpy_cpu(l.n, 1, l.scales, 1, base.scales, 1); } @@ -285,7 +285,7 @@ void scale_weights(layer l, float s) { if (l.type == CONVOLUTIONAL) { scal_cpu(l.n, s, l.biases, 1); - scal_cpu(l.n*l.size*l.size*l.c, s, l.weights, 1); + scal_cpu(l.nweights, s, l.weights, 1); if (l.scales) { scal_cpu(l.n, s, l.scales, 1); } @@ -300,7 +300,7 @@ void pull_weights(layer l) { if(l.type == CONVOLUTIONAL){ cuda_pull_array(l.biases_gpu, l.biases, l.n); - cuda_pull_array(l.weights_gpu, l.weights, l.n*l.size*l.size*l.c); + cuda_pull_array(l.weights_gpu, l.weights, l.nweights); if(l.scales) cuda_pull_array(l.scales_gpu, l.scales, l.n); } else if(l.type == CONNECTED){ cuda_pull_array(l.biases_gpu, l.biases, l.outputs); @@ -312,7 +312,7 @@ void push_weights(layer l) { if(l.type == CONVOLUTIONAL){ cuda_push_array(l.biases_gpu, l.biases, l.n); - cuda_push_array(l.weights_gpu, l.weights, l.n*l.size*l.size*l.c); + cuda_push_array(l.weights_gpu, l.weights, l.nweights); if(l.scales) cuda_push_array(l.scales_gpu, l.scales, l.n); } else if(l.type == CONNECTED){ cuda_push_array(l.biases_gpu, l.biases, l.outputs); @@ -324,7 +324,7 @@ void distribute_weights(layer l, layer base) { if(l.type == CONVOLUTIONAL){ cuda_push_array(l.biases_gpu, base.biases, l.n); - cuda_push_array(l.weights_gpu, base.weights, l.n*l.size*l.size*l.c); + cuda_push_array(l.weights_gpu, base.weights, l.nweights); if(base.scales) cuda_push_array(l.scales_gpu, base.scales, l.n); } else if(l.type == CONNECTED){ cuda_push_array(l.biases_gpu, base.biases, l.outputs); @@ -337,7 +337,7 @@ void merge_updates(layer l, layer base) { if (l.type == CONVOLUTIONAL) { axpy_cpu(l.n, 1, l.bias_updates, 1, base.bias_updates, 1); - axpy_cpu(l.n*l.size*l.size*l.c, 1, l.weight_updates, 1, base.weight_updates, 1); + axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weight_updates, 1); if (l.scale_updates) { axpy_cpu(l.n, 1, l.scale_updates, 1, base.scale_updates, 1); } @@ -351,7 +351,7 @@ void distribute_updates(layer l, layer base) { if(l.type == CONVOLUTIONAL){ cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.n); - cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.n*l.size*l.size*l.c); + cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.nweights); if(base.scale_updates) cuda_push_array(l.scale_updates_gpu, base.scale_updates, l.n); } else if(l.type == CONNECTED){ cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.outputs); diff --git a/src/parser.c b/src/parser.c index 9ca3d5e2..cca3641b 100644 --- a/src/parser.c +++ b/src/parser.c @@ -150,6 +150,7 @@ local_layer parse_local(list *options, size_params params) convolutional_layer parse_convolutional(list *options, size_params params) { 
int n = option_find_int(options, "filters",1); + int groups = option_find_int_quiet(options, "groups", 1); int size = option_find_int(options, "size",1); int stride = option_find_int(options, "stride",1); int pad = option_find_int_quiet(options, "pad",0); @@ -170,7 +171,7 @@ convolutional_layer parse_convolutional(list *options, size_params params) int xnor = option_find_int_quiet(options, "xnor", 0); int use_bin_output = option_find_int_quiet(options, "bin_output", 0); - convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index); + convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index); layer.flipped = option_find_int_quiet(options, "flipped", 0); layer.dot = option_find_float_quiet(options, "dot", 0); @@ -193,12 +194,13 @@ layer parse_crnn(list *options, size_params params) int output_filters = option_find_int(options, "output",1); int hidden_filters = option_find_int(options, "hidden",1); + int groups = option_find_int_quiet(options, "groups", 1); char *activation_s = option_find_str(options, "activation", "logistic"); ACTIVATION activation = get_activation(activation_s); int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); int xnor = option_find_int_quiet(options, "xnor", 0); - layer l = make_crnn_layer(params.batch, params.w, params.h, params.c, hidden_filters, output_filters, params.time_steps, size, stride, padding, activation, batch_normalize, xnor); + layer l = make_crnn_layer(params.batch, params.w, params.h, params.c, hidden_filters, output_filters, groups, params.time_steps, size, stride, padding, activation, batch_normalize, xnor); l.shortcut = option_find_int_quiet(options, "shortcut", 0); @@ -251,13 +253,14 @@ layer parse_conv_lstm(list *options, size_params params) if (pad) padding = size / 2; int output_filters = option_find_int(options, "output", 1); + int groups = option_find_int_quiet(options, "groups", 1); char *activation_s = option_find_str(options, "activation", "LINEAR"); ACTIVATION activation = get_activation(activation_s); int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); int xnor = option_find_int_quiet(options, "xnor", 0); int peephole = option_find_int_quiet(options, "peephole", 1); - layer l = make_conv_lstm_layer(params.batch, params.w, params.h, params.c, output_filters, params.time_steps, size, stride, padding, activation, batch_normalize, peephole, xnor); + layer l = make_conv_lstm_layer(params.batch, params.w, params.h, params.c, output_filters, groups, params.time_steps, size, stride, padding, activation, batch_normalize, peephole, xnor); l.shortcut = option_find_int_quiet(options, "shortcut", 0); @@ -989,8 +992,8 @@ void save_convolutional_weights_binary(layer l, FILE *fp) pull_convolutional_layer(l); } #endif - binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights); - int size = l.c*l.size*l.size; + int size = (l.c/l.groups)*l.size*l.size; + binarize_weights(l.weights, l.n, size, l.binary_weights); int i, j, k; fwrite(l.biases, sizeof(float), l.n, fp); if (l.batch_normalize){ @@ -1025,7 +1028,7 @@ void save_convolutional_weights(layer l, FILE *fp) pull_convolutional_layer(l); } #endif - int num = l.n*l.c*l.size*l.size; + int num = l.nweights; fwrite(l.biases, sizeof(float), l.n, fp); if (l.batch_normalize){ fwrite(l.scales, sizeof(float), 
l.n, fp); @@ -1209,7 +1212,7 @@ void load_convolutional_weights_binary(layer l, FILE *fp) fread(l.rolling_mean, sizeof(float), l.n, fp); fread(l.rolling_variance, sizeof(float), l.n, fp); } - int size = l.c*l.size*l.size; + int size = (l.c / l.groups)*l.size*l.size; int i, j, k; for(i = 0; i < l.n; ++i){ float mean = 0; @@ -1237,7 +1240,7 @@ void load_convolutional_weights(layer l, FILE *fp) //load_convolutional_weights_binary(l, fp); //return; } - int num = l.n*l.c*l.size*l.size; + int num = l.nweights; fread(l.biases, sizeof(float), l.n, fp); //fread(l.weights, sizeof(float), num, fp); // as in connected layer if (l.batch_normalize && (!l.dontloadscales)){ @@ -1267,9 +1270,9 @@ void load_convolutional_weights(layer l, FILE *fp) //} //if(l.c == 3) scal_cpu(num, 1./256, l.weights, 1); if (l.flipped) { - transpose_matrix(l.weights, l.c*l.size*l.size, l.n); + transpose_matrix(l.weights, (l.c/l.groups)*l.size*l.size, l.n); } - //if (l.binary) binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.weights); + //if (l.binary) binarize_weights(l.weights, l.n, (l.c/l.groups)*l.size*l.size, l.weights); #ifdef GPU if(gpu_index >= 0){ push_convolutional_layer(l); diff --git a/src/yolo_layer.c b/src/yolo_layer.c index d303b5aa..ae48ef7a 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -240,6 +240,7 @@ void forward_yolo_layer(const layer l, network_state state) int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; if (class_id >= l.classes) { printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1); + printf(" truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d \n", truth.x, truth.y, truth.w, truth.h, class_id); getchar(); continue; // if label contains class_id more than number of classes in the cfg-file } @@ -271,6 +272,9 @@ void forward_yolo_layer(const layer l, network_state state) } for(t = 0; t < l.max_boxes; ++t){ box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1); + if (truth.x < 0 || truth.y < 0 || truth.x > 1 || truth.y > 1 || truth.w < 0 || truth.h < 0) { + printf(" Wrong label: truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f \n", truth.x, truth.y, truth.w, truth.h); + } int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
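
Usage note (illustrative, not part of the patch): since parse_convolutional() now reads the new key via option_find_int_quiet(options, "groups", 1), a depth-wise layer can be declared in a .cfg file by setting groups equal to the layer's input-channel count. The concrete values below are only an example.

[convolutional]
# depth-wise: 128 groups over 128 input channels, so nweights = (c/groups)*n*size*size = 1*128*3*3
batch_normalize=1
filters=128
size=3
stride=1
pad=1
groups=128
activation=leaky

If groups is omitted it defaults to 1 (a standard convolution); make_convolutional_layer() additionally clamps groups < 1 to 1 and forces groups = 1 when xnor=1, so XNOR layers remain ungrouped.
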