From f0582446f26ada84144eeea2721a0aafd7950b73 Mon Sep 17 00:00:00 2001
From: AlexeyAB <alexeyab84@gmail.com>
Date: Sat, 15 Jun 2019 02:32:57 +0300
Subject: [PATCH] Fixed Depth-wise (grouped) convolution for CPU and GPU
 (CUDNN=0/1)

---
 src/avgpool_layer.c          |  2 +-
 src/convolutional_kernels.cu | 10 +++++--
 src/convolutional_layer.c    | 56 +++++++++++++++++++++++-------------
 src/maxpool_layer.c          |  2 +-
 src/parser.c                 |  2 +-
 5 files changed, 46 insertions(+), 26 deletions(-)

diff --git a/src/avgpool_layer.c b/src/avgpool_layer.c
index 20838bbd..3ae022e8 100644
--- a/src/avgpool_layer.c
+++ b/src/avgpool_layer.c
@@ -4,7 +4,7 @@
 
 avgpool_layer make_avgpool_layer(int batch, int w, int h, int c)
 {
-    fprintf(stderr, "avg                     %4d x%4d x%4d   ->  %4d\n",  w, h, c, c);
+    fprintf(stderr, "avg                         %4d x%4d x%4d  ->   %4d\n",  w, h, c, c);
     avgpool_layer l = { (LAYER_TYPE)0 };
     l.type = AVGPOOL;
     l.batch = batch;
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index dd6625f1..ffc9583d 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -578,7 +578,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
                     state.workspace);       // output
 
             }
-            gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n);
+            //gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n);
+            gemm_ongpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
         }
     }
 
@@ -817,7 +818,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
                 l.stride, l.stride,     // stride (h, w)
                 l.dilation, l.dilation, // dilation (h, w)
                 state.workspace);       // output
-            gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n);
+            //gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n);
+            gemm_ongpu(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);
 
             if (state.delta) {
                 if (l.binary || l.xnor) swap_binary(&l);
@@ -825,7 +827,9 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
                 float * b = l.delta_gpu + (i*l.groups + j)*m*k;
                 float * c = state.workspace;
 
-                gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k);
+                //gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k);
+                gemm_ongpu(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);
+
 
                 float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
 
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index de77a585..c7f51a66 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -547,7 +547,16 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w,
     if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB");
     else if (l.xnor) fprintf(stderr, "convX ");
     else fprintf(stderr, "conv  ");
-    fprintf(stderr, "%5d %2d x%2d /%2d(%d)%4d x%4d x%4d   ->  %4d x%4d x%4d %5.3f BF\n", n, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+
+    if(groups > 1) fprintf(stderr, "%5d/%3d ", n, groups);
+    else           fprintf(stderr, "%5d     ", n);
+
+    if(dilation > 1) fprintf(stderr, "%2d x%2d/%2d(%1d)", size, size, stride, dilation);
+    else             fprintf(stderr, "%2d x%2d/%2d   ", size, size, stride);
+
+    fprintf(stderr, "%4d x%4d x%4d  -> %4d x%4d x%4d %5.3f BF\n", w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+
+    //fprintf(stderr, "%5d/%2d %2d x%2d /%2d(%d)%4d x%4d x%4d  -> %4d x%4d x%4d %5.3f BF\n", n, groups, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
 
     return l;
 }
@@ -897,12 +906,13 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
     static int u = 0;
     u++;
 
-    for(i = 0; i < l.batch; ++i){
-        for (j = 0; j < l.groups; ++j) {
-
-            float *a = l.weights + j*l.nweights / l.groups;
+    for(i = 0; i < l.batch; ++i)
+    {
+        for (j = 0; j < l.groups; ++j)
+        {
+            float *a = l.weights +j*l.nweights / l.groups;
             float *b = state.workspace;
-            float *c = l.output + (i*l.groups + j)*n*m;
+            float *c = l.output +(i*l.groups + j)*n*m;
 
             //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
             //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n);
@@ -1025,23 +1035,29 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             }
             else {
                 //printf(" l.index = %d - FP32 \n", l.index);
-                float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
-                //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
+                float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;
+                if (l.size == 1 && l.stride == 1 && l.pad == 0) {
+                    b = im;   // 1x1 kernel, stride 1, no padding: im2col would be an identity copy, so feed the input to gemm directly
+                }
+                else {
+                    //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
 
-                im2col_cpu_ext(im,   // input
-                    l.c / l.groups,     // input channels
-                    l.h, l.w,           // input size (h, w)
-                    l.size, l.size,     // kernel size (h, w)
-                    l.pad, l.pad,       // padding (h, w)
-                    l.stride, l.stride, // stride (h, w)
-                    l.dilation, l.dilation, // dilation (h, w)
-                    b);                 // output
+                    im2col_cpu_ext(im,   // input
+                        l.c / l.groups,     // input channels
+                        l.h, l.w,           // input size (h, w)
+                        l.size, l.size,     // kernel size (h, w)
+                        l.pad, l.pad,       // padding (h, w)
+                        l.stride, l.stride, // stride (h, w)
+                        l.dilation, l.dilation, // dilation (h, w)
+                        b);                 // output
+
+                }
 
                 gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
                 // bit-count to float
             }
-            c += n*m;
-            state.input += l.c*l.h*l.w;
+            //c += n*m;
+            //state.input += l.c*l.h*l.w;
         }
     }
 
@@ -1079,7 +1095,7 @@ void backward_convolutional_layer(convolutional_layer l, network_state state)
             float *b = state.workspace;
             float *c = l.weight_updates + j*l.nweights / l.groups;
 
-            float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+            float *im = state.input + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w;
 
             //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
             im2col_cpu_ext(
@@ -1112,7 +1128,7 @@ void backward_convolutional_layer(convolutional_layer l, network_state state)
                     l.pad, l.pad,           // padding (h, w)
                     l.stride, l.stride,     // stride (h, w)
                     l.dilation, l.dilation, // dilation (h, w)
-                    state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w); // output (delta)
+                    state.delta + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w); // output (delta)
             }
         }
     }
diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c
index 2e217459..68f38199 100644
--- a/src/maxpool_layer.c
+++ b/src/maxpool_layer.c
@@ -78,7 +78,7 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s
 
     #endif  // GPU
 	l.bflops = (l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.;
-    fprintf(stderr, "max          %d x %d / %d  %4d x%4d x%4d   ->  %4d x%4d x%4d %5.3f BF\n", size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+    fprintf(stderr, "max              %d x %d/%2d   %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
     return l;
 }
 
diff --git a/src/parser.c b/src/parser.c
index 2c095132..f3c02198 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -834,7 +834,7 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
     n = n->next;
     int count = 0;
     free_section(s);
-    fprintf(stderr, "layer     filters    size              input                output\n");
+    fprintf(stderr, "   layer   filters  size/strd(dil)      input                output\n");
     while(n){
         params.index = count;
         fprintf(stderr, "%4d ", count);