From f0582446f26ada84144eeea2721a0aafd7950b73 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sat, 15 Jun 2019 02:32:57 +0300 Subject: [PATCH] Fixed Depth-wise (grouped) convolution for CPU and GPU (CUDNN=0/1) --- src/avgpool_layer.c | 2 +- src/convolutional_kernels.cu | 10 +++++-- src/convolutional_layer.c | 56 +++++++++++++++++++++++------------- src/maxpool_layer.c | 2 +- src/parser.c | 2 +- 5 files changed, 46 insertions(+), 26 deletions(-) diff --git a/src/avgpool_layer.c b/src/avgpool_layer.c index 20838bbd..3ae022e8 100644 --- a/src/avgpool_layer.c +++ b/src/avgpool_layer.c @@ -4,7 +4,7 @@ avgpool_layer make_avgpool_layer(int batch, int w, int h, int c) { - fprintf(stderr, "avg %4d x%4d x%4d -> %4d\n", w, h, c, c); + fprintf(stderr, "avg %4d x%4d x%4d -> %4d\n", w, h, c, c); avgpool_layer l = { (LAYER_TYPE)0 }; l.type = AVGPOOL; l.batch = batch; diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index dd6625f1..ffc9583d 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -578,7 +578,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) state.workspace); // output } - gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n); + //gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n); + gemm_ongpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n); } } @@ -817,7 +818,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state l.stride, l.stride, // stride (h, w) l.dilation, l.dilation, // dilation (h, w) state.workspace); // output - gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n); + //gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n); + gemm_ongpu(0, 1, m, n, k, 1, a, k, b, k, 1, c, n); if (state.delta) { if (l.binary || l.xnor) swap_binary(&l); @@ -825,7 +827,9 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state float * b = l.delta_gpu + (i*l.groups + j)*m*k; float * c = state.workspace; - gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k); + //gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k); + gemm_ongpu(1, 0, n, k, m, 1, a, n, b, k, 0, c, k); + float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w; diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index de77a585..c7f51a66 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -547,7 +547,16 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB"); else if (l.xnor) fprintf(stderr, "convX "); else fprintf(stderr, "conv "); - fprintf(stderr, "%5d %2d x%2d /%2d(%d)%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); + + if(groups > 1) fprintf(stderr, "%5d/%3d ", n, groups); + else fprintf(stderr, "%5d ", n); + + if(dilation > 1) fprintf(stderr, "%2d x%2d/%2d(%1d)", size, size, stride, dilation); + else fprintf(stderr, "%2d x%2d/%2d ", size, size, stride); + + fprintf(stderr, "%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); + + //fprintf(stderr, "%5d/%2d %2d x%2d /%2d(%d)%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, groups, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); return l; } @@ -897,12 +906,13 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) static int u = 0; u++; - for(i = 0; i < l.batch; ++i){ - for (j = 0; j < l.groups; ++j) { - - float *a = l.weights + j*l.nweights / l.groups; + for(i = 0; i < l.batch; ++i) + { + for (j = 0; j < l.groups; ++j) + { + float *a = l.weights +j*l.nweights / l.groups; float *b = state.workspace; - float *c = l.output + (i*l.groups + j)*n*m; + float *c = l.output +(i*l.groups + j)*n*m; //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n); @@ -1025,23 +1035,29 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) } else { //printf(" l.index = %d - FP32 \n", l.index); - float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; - //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b); + float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w; + if (l.size == 1) { + b = im; + } + else { + //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b); - im2col_cpu_ext(im, // input - l.c / l.groups, // input channels - l.h, l.w, // input size (h, w) - l.size, l.size, // kernel size (h, w) - l.pad, l.pad, // padding (h, w) - l.stride, l.stride, // stride (h, w) - l.dilation, l.dilation, // dilation (h, w) - b); // output + im2col_cpu_ext(im, // input + l.c / l.groups, // input channels + l.h, l.w, // input size (h, w) + l.size, l.size, // kernel size (h, w) + l.pad, l.pad, // padding (h, w) + l.stride, l.stride, // stride (h, w) + l.dilation, l.dilation, // dilation (h, w) + b); // output + + } gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n); // bit-count to float } - c += n*m; - state.input += l.c*l.h*l.w; + //c += n*m; + //state.input += l.c*l.h*l.w; } } @@ -1079,7 +1095,7 @@ void backward_convolutional_layer(convolutional_layer l, network_state state) float *b = state.workspace; float *c = l.weight_updates + j*l.nweights / l.groups; - float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; + float *im = state.input + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w; //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b); im2col_cpu_ext( @@ -1112,7 +1128,7 @@ void backward_convolutional_layer(convolutional_layer l, network_state state) l.pad, l.pad, // padding (h, w) l.stride, l.stride, // stride (h, w) l.dilation, l.dilation, // dilation (h, w) - state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w); // output (delta) + state.delta + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w); // output (delta) } } } diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index 2e217459..68f38199 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -78,7 +78,7 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s #endif // GPU l.bflops = (l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.; - fprintf(stderr, "max %d x %d / %d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); + fprintf(stderr, "max %d x %d/%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); return l; } diff --git a/src/parser.c b/src/parser.c index 2c095132..f3c02198 100644 --- a/src/parser.c +++ b/src/parser.c @@ -834,7 +834,7 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps) n = n->next; int count = 0; free_section(s); - fprintf(stderr, "layer filters size input output\n"); + fprintf(stderr, " layer filters size/strd(dil) input output\n"); while(n){ params.index = count; fprintf(stderr, "%4d ", count);