Added grouped convolution (depth-wise convolution)

AlexeyAB committed 2019-05-10 16:46:48 +03:00
parent a7e5976c1b, commit 4f72fcc015
12 changed files with 349 additions and 306 deletions
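The change threads the layer's groups field through the GPU convolution code: each group sees l.c / l.groups input channels and produces l.n / l.groups filters, and every buffer size that used to be recomputed as c*n*size*size now comes from l.nweights. For orientation, a minimal standalone sketch of that bookkeeping (illustrative C, not code from this commit; it assumes nweights is set to (c / groups) * n * size * size when the layer is created):

    #include <stdio.h>

    int main(void)
    {
        int c = 64, n = 128, size = 3, groups = 4;      /* example layer shape */
        int nweights = (c / groups) * n * size * size;  /* total weights, cf. l.nweights */
        int weights_per_group = nweights / groups;      /* slice indexed by j*l.nweights/l.groups */
        int channels_per_group = c / groups;            /* input channels seen by one group */
        int filters_per_group = n / groups;             /* output filters produced by one group */
        printf("%d %d %d %d\n", nweights, weights_per_group, channels_per_group, filters_per_group);
        return 0;
    }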


@@ -166,20 +166,16 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 {
     //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
     if(l.binary){
-        binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
+        binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu);
         swap_binary(&l);
     }

     if(l.xnor){
         if (!l.align_bit_weights_gpu || state.train) {
-            //binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
-            fast_binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu);
+            //binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu);
+            fast_binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu);
         }
-        //swap_binary(&l);
-        //binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
-        //state.input = l.binary_input_gpu;
-        //cudaDeviceSynchronize();

         if (l.align_bit_weights_gpu && !state.train && l.c >= 32)
         {
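Note on the hunk above: each of the l.n filters now spans only l.c / l.groups input channels, so the per-filter weight count passed to binarize_weights_gpu and fast_binarize_weights_gpu shrinks by a factor of groups. With illustrative numbers (not from the commit): c = 64, groups = 4, size = 3 gives (64 / 4) * 3 * 3 = 144 weights per filter instead of 64 * 3 * 3 = 576.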
@@ -187,11 +183,15 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
             //cudaError_t status = cudaSuccess;
             //int input_size = l.c*l.h*l.w*l.batch;

-            int m = l.n;
-            int k = l.size*l.size*l.c;
+            int m = l.n / l.groups;
+            int k = l.size*l.size*l.c / l.groups;
             int n = l.out_w*l.out_h;
             //float * a = l.weights_gpu;

+            // int i, j;
+            // for(i = 0; i < l.batch; ++i){
+            //    for (j = 0; j < l.groups; ++j) {

             int ldb_align = l.lda_align;
             size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
             //size_t t_intput_size = new_ldb * n;
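In GEMM terms the bit-packed XNOR path now works per group: m = l.n / l.groups filters, k = l.size*l.size*l.c / l.groups weights per filter, and n = l.out_w*l.out_h output positions, i.e. an (m x k) by (k x n) product per group. The commented-out i/j loop sketches the per-batch, per-group iteration that the plain non-CUDNN path below performs explicitly.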
@@ -551,22 +551,25 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 #else
     fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);

-    int i;
-    int m = l.n;
-    int k = l.size*l.size*l.c;
+    int i, j;
+    int m = l.n / l.groups;
+    int k = l.size*l.size*l.c / l.groups;
     int n = l.out_w*l.out_h;
     for(i = 0; i < l.batch; ++i){
-        float *im = state.input + i*l.c*l.h*l.w;
-        float * a = l.weights_gpu;
-        float * b = state.workspace;
-        float * c = l.output_gpu;
-        if (l.size == 1) {
-            b = im;
-        }
-        else {
-            im2col_ongpu(im, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
-        }
-        gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
+        for (j = 0; j < l.groups; ++j) {
+            //float *im = state.input + i*l.c*l.h*l.w;
+            float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+            float *a = l.weights_gpu + j*l.nweights / l.groups;
+            float *b = state.workspace;
+            float *c = l.output_gpu + (i*l.groups + j)*n*m;
+            if (l.size == 1) {
+                b = im;
+            }
+            else {
+                im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
+            }
+            gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n);
+        }
     }

     if (l.batch_normalize) {
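To spell out the pointer arithmetic in the new nested loop: for batch item i and group j, the input slice starts (i*l.groups + j) * (l.c / l.groups) * l.h * l.w floats into state.input, group j's filters start j * l.nweights / l.groups floats into l.weights_gpu, and the output slice starts (i*l.groups + j) * m * n floats into l.output_gpu. A standalone restatement of those offsets (illustrative helpers, not part of the commit):

    #include <stddef.h>

    /* Start of the input slice for batch item i, group j. */
    size_t group_input_offset(int i, int j, int groups, int c, int h, int w)
    {
        return ((size_t)i * groups + j) * (c / groups) * h * w;
    }

    /* Start of the output slice for batch item i, group j. */
    size_t group_output_offset(int i, int j, int groups, int n_filters, int out_h, int out_w)
    {
        return ((size_t)i * groups + j) * (n_filters / groups) * out_h * out_w;
    }

    /* Start of group j's filters inside the packed weight array. */
    size_t group_weight_offset(int j, int groups, int nweights)
    {
        return (size_t)j * (nweights / groups);
    }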
@@ -782,32 +785,38 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
         backward_batchnorm_layer_gpu(l, state);
     }

-    int m = l.n;
-    int n = l.size*l.size*l.c;
+    int m = l.n / l.groups;
+    int n = l.size*l.size*l.c / l.groups;
     int k = l.out_w*l.out_h;

-    int i;
+    int i, j;
     for(i = 0; i < l.batch; ++i){
-        float * a = l.delta_gpu;
-        float * b = state.workspace;
-        float * c = l.weight_updates_gpu;
-
-        im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
-        gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);
-
-        if(state.delta){
-            if(l.binary || l.xnor) swap_binary(&l);
-            float * a = l.weights_gpu;
-            float * b = l.delta_gpu;
-            float * c = state.workspace;
-
-            gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k);
-
-            col2im_ongpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta + i*l.c*l.h*l.w);
-            if(l.binary || l.xnor) {
-                swap_binary(&l);
-            }
-            if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w);
-        }
+        for (j = 0; j < l.groups; ++j) {
+            float * a = l.delta_gpu + (i*l.groups + j)*m*k;
+            float * b = state.workspace;
+            float * c = l.weight_updates_gpu + j*l.nweights / l.groups;
+
+            float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+
+            im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
+            gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n);
+
+            if (state.delta) {
+                if (l.binary || l.xnor) swap_binary(&l);
+                float * a = l.weights_gpu + j*l.nweights / l.groups;
+                float * b = l.delta_gpu + (i*l.groups + j)*m*k;
+                float * c = state.workspace;
+
+                gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k);
+
+                float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+
+                col2im_ongpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, delta);
+                if (l.binary || l.xnor) {
+                    swap_binary(&l);
+                }
+                if (l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w);
+            }
+        }
     }
 #endif
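Shape check for the two per-group GEMMs in the backward hunk: the (0,1) call accumulates weight_updates[m x n] += delta[m x k] * columns[n x k]^T, and the (1,0) call forms columns[n x k] = weights[m x n]^T * delta[m x k], with m = l.n / l.groups, n = l.size*l.size*l.c / l.groups and k = l.out_w*l.out_h; col2im_ongpu then scatters those columns back into the group's slice of state.delta.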
@@ -821,43 +830,43 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
     }
 }

-void pull_convolutional_layer(convolutional_layer layer)
+void pull_convolutional_layer(convolutional_layer l)
 {
-    cuda_pull_array_async(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array_async(layer.biases_gpu, layer.biases, layer.n);
-    cuda_pull_array_async(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array_async(layer.bias_updates_gpu, layer.bias_updates, layer.n);
-    if (layer.batch_normalize){
-        cuda_pull_array_async(layer.scales_gpu, layer.scales, layer.n);
-        cuda_pull_array_async(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
-        cuda_pull_array_async(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
+    cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights);
+    cuda_pull_array_async(l.biases_gpu, l.biases, l.n);
+    cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    cuda_pull_array_async(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_pull_array_async(l.scales_gpu, l.scales, l.n);
+        cuda_pull_array_async(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_pull_array_async(l.rolling_variance_gpu, l.rolling_variance, l.n);
     }
-    if (layer.adam){
-        cuda_pull_array_async(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
-        cuda_pull_array_async(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
+    if (l.adam){
+        cuda_pull_array_async(l.m_gpu, l.m, l.nweights);
+        cuda_pull_array_async(l.v_gpu, l.v, l.nweights);
     }
     CHECK_CUDA(cudaPeekAtLastError());
     cudaStreamSynchronize(get_cuda_stream());
 }

-void push_convolutional_layer(convolutional_layer layer)
+void push_convolutional_layer(convolutional_layer l)
 {
-    cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
+    cuda_push_array(l.weights_gpu, l.weights, l.nweights);
 #ifdef CUDNN_HALF
-    assert((layer.c*layer.n*layer.size*layer.size) > 0);
-    cuda_convert_f32_to_f16(layer.weights_gpu, layer.c*layer.n*layer.size*layer.size, layer.weights_gpu16);
+    assert(l.nweights > 0);
+    cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16);
 #endif
-    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
-    if (layer.batch_normalize){
-        cuda_push_array(layer.scales_gpu, layer.scales, layer.n);
-        cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
-        cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
+    cuda_push_array(l.biases_gpu, l.biases, l.n);
+    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_push_array(l.scales_gpu, l.scales, l.n);
+        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
     }
-    if (layer.adam){
-        cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
-        cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
+    if (l.adam){
+        cuda_push_array(l.m_gpu, l.m, l.nweights);
+        cuda_push_array(l.v_gpu, l.v, l.nweights);
     }
     CHECK_CUDA(cudaPeekAtLastError());
 }
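pull_convolutional_layer and push_convolutional_layer previously sized every weight transfer as layer.c*layer.n*layer.size*layer.size, which matches the allocated buffer only when groups == 1; using l.nweights keeps the copies (and the FP16 conversion under CUDNN_HALF) correct for grouped layers. A quick sanity check of that relationship (illustrative only, assuming nweights = (c / groups) * n * size * size as set at layer creation):

    /* For groups == 1 the two formulas agree; for groups > 1 the old product over-counts by a factor of groups. */
    int nweights(int c, int n, int size, int groups) { return (c / groups) * n * size * size; }
    /* nweights(64, 128, 3, 1) == 64*128*3*3 == 73728; nweights(64, 128, 3, 4) == 18432. */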
@@ -868,11 +877,10 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init
     //float momentum = a.momentum;
     //float decay = a.decay;
     //int batch = a.batch;

-    int size = l.size*l.size*l.c*l.n; // old

     if (l.adam) {
         //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
-        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch, l.t);
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t);

         adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t);
         if (l.scales_gpu) {
@@ -883,9 +891,9 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init
         //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
         //axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
         //scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);
-        axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
-        axpy_ongpu(size, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
-        scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
+        axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);

         axpy_ongpu(l.n, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
         scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1);
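The two update hunks drop the locally recomputed size = l.size*l.size*l.c*l.n in favor of l.nweights, so both the Adam branch and the plain SGD branch walk exactly the allocated weights. For reference, the three BLAS-style calls above implement SGD with weight decay and momentum:

    weight_updates -= decay * batch * weights           (axpy_ongpu with alpha = -decay*batch)
    weights        += (learning_rate / batch) * weight_updates
    weight_updates *= momentum                          (scal_ongpu)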