From d4402d29c2845a9aef97f9394dc619986ffbdf0c Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sun, 30 Jun 2019 13:53:11 +0300 Subject: [PATCH] Added maxpool_depth= and out_channels= params to [maxpool] --- include/darknet.h | 2 + src/maxpool_layer.c | 53 ++++++++++++++++++++++++--- src/maxpool_layer.h | 2 +- src/maxpool_layer_kernels.cu | 71 +++++++++++++++++++++++++++++++++++- src/parser.c | 4 +- 5 files changed, 123 insertions(+), 9 deletions(-) diff --git a/include/darknet.h b/include/darknet.h index eefe4df4..1584a362 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -208,6 +208,8 @@ struct layer { int side; int stride; int dilation; + int maxpool_depth; + int out_channels; int reverse; int flatten; int spatial; diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index 00cb0473..4d2ee49f 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -45,7 +45,7 @@ void cudnn_maxpool_setup(layer *l) } -maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride, int padding) +maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride, int padding, int maxpool_depth, int out_channels) { maxpool_layer l = { (LAYER_TYPE)0 }; l.type = MAXPOOL; @@ -54,9 +54,18 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s l.w = w; l.c = c; l.pad = padding; - l.out_w = (w + padding - size) / stride + 1; - l.out_h = (h + padding - size) / stride + 1; - l.out_c = c; + l.maxpool_depth = maxpool_depth; + l.out_channels = out_channels; + if (maxpool_depth) { + l.out_c = out_channels; + l.out_w = l.w; + l.out_h = l.h; + } + else { + l.out_w = (w + padding - size) / stride + 1; + l.out_h = (h + padding - size) / stride + 1; + l.out_c = c; + } l.outputs = l.out_h * l.out_w * l.out_c; l.inputs = h*w*c; l.size = size; @@ -90,7 +99,7 @@ void resize_maxpool_layer(maxpool_layer *l, int w, int h) l->out_w = (w + l->pad - l->size) / l->stride + 1; l->out_h = (h + l->pad - l->size) / l->stride + 1; - 
l->outputs = l->out_w * l->out_h * l->c; + l->outputs = l->out_w * l->out_h * l->out_c; int output_size = l->outputs * l->batch; l->indexes = (int*)realloc(l->indexes, output_size * sizeof(int)); @@ -111,6 +120,37 @@ void resize_maxpool_layer(maxpool_layer *l, int w, int h) void forward_maxpool_layer(const maxpool_layer l, network_state state) { + if (l.maxpool_depth) + { + int b, i, j, k, g; + for (b = 0; b < l.batch; ++b) { + #pragma omp parallel for + for (i = 0; i < l.h; ++i) { + for (j = 0; j < l.w; ++j) { + for (g = 0; g < l.out_c; ++g) + { + int out_index = j + l.w*(i + l.h*(g + l.out_c*b)); + float max = -FLT_MAX; + int max_i = -1; + + for (k = g; k < l.c; k += l.out_c) + { + int in_index = j + l.w*(i + l.h*(k + l.c*b)); + float val = state.input[in_index]; + + max_i = (val > max) ? in_index : max_i; + max = (val > max) ? val : max; + } + l.output[out_index] = max; + l.indexes[out_index] = max_i; + } + } + } + } + return; + } + + if (!state.train) { forward_maxpool_layer_avx(state.input, l.output, l.indexes, l.size, l.w, l.h, l.out_w, l.out_h, l.c, l.pad, l.stride, l.batch); return; @@ -156,7 +196,8 @@ void backward_maxpool_layer(const maxpool_layer l, network_state state) int i; int h = l.out_h; int w = l.out_w; - int c = l.c; + int c = l.out_c; + #pragma omp parallel for for(i = 0; i < h*w*c*l.batch; ++i){ int index = l.indexes[i]; state.delta[index] += l.delta[i]; diff --git a/src/maxpool_layer.h b/src/maxpool_layer.h index 0a90c376..0c1f6148 100644 --- a/src/maxpool_layer.h +++ b/src/maxpool_layer.h @@ -12,7 +12,7 @@ typedef layer maxpool_layer; extern "C" { #endif image get_maxpool_image(maxpool_layer l); -maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride, int padding); +maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride, int padding, int maxpool_depth, int out_channels); void resize_maxpool_layer(maxpool_layer *l, int w, int h); void forward_maxpool_layer(const maxpool_layer l, 
// Depth-wise max pooling, forward pass.
//
// For each spatial position (j, i) in batch item b and each output group g,
// takes the max over the input channels k = g, g + out_c, g + 2*out_c, ... < c.
// Spatial dimensions are unchanged; output volume is w * h * out_c * batch.
//
// Launch contract (from the host wrapper in this patch): one thread per
// (j, i, b) triple, i.e. n = w * h * batch; each thread iterates over all
// out_c groups serially. The //-commented lines show the alternative
// decomposition with g folded into the thread index, kept for reference.
//
// indexes[] receives the flat input index of each max (argmax), consumed by
// the backward kernel below.
__global__ void forward_maxpool_depth_layer_kernel(int n, int w, int h, int c, int out_c, int batch,
                                                   float *input, float *output, int *indexes)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;   // tail guard: grid rarely divides n exactly

    int j = id % w;
    id = id / w;
    int i = id % h;
    id = id / h;
    //int g = id % out_c;
    //id = id / out_c;
    int b = id % batch;

    for (int g = 0; g < out_c; ++g)
    {
        int out_index = j + w*(i + h*(g + out_c*b));
        float max = -FLT_MAX;
        int max_i = -1;

        // Stride through the channel dimension by out_c so each group g
        // scans a disjoint residue class of channels.
        for (int k = g; k < c; k += out_c)
        {
            int in_index = j + w*(i + h*(k + c*b));
            float val = input[in_index];

            // Branchless max tracking avoids warp divergence in the scan.
            max_i = (val > max) ? in_index : max_i;
            max   = (val > max) ? val : max;
        }
        output[out_index]  = max;     // pooled value
        indexes[out_index] = max_i;   // argmax input index for backward pass
    }
}

// Depth-wise max pooling, backward pass.
//
// One thread per output element (n = w * h * out_c * batch): scatters each
// output gradient delta[id] onto the input element whose flat index the
// forward kernel recorded in indexes[id].
//
// w, h, c, and batch are unused but kept for signature symmetry with the
// forward kernel. The forward kernel's argmax indices are distinct across
// output elements (each group scans a disjoint channel stride), so the
// unguarded += does not race within this kernel.
__global__ void backward_maxpool_depth_layer_kernel(int n, int w, int h, int c, int batch,
                                                    float *delta, float *prev_delta, int *indexes)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if (id >= n) return;

    int index = indexes[id];
    prev_delta[index] += delta[id];
}
backward_maxpool_layer_kernel<<>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, layer.delta_gpu, state.delta, layer.indexes_gpu); diff --git a/src/parser.c b/src/parser.c index 5f6658e6..30510132 100644 --- a/src/parser.c +++ b/src/parser.c @@ -534,6 +534,8 @@ maxpool_layer parse_maxpool(list *options, size_params params) int stride = option_find_int(options, "stride",1); int size = option_find_int(options, "size",stride); int padding = option_find_int_quiet(options, "padding", size-1); + int maxpool_depth = option_find_int_quiet(options, "maxpool_depth", 0); + int out_channels = option_find_int_quiet(options, "out_channels", 1); int batch,h,w,c; h = params.h; @@ -542,7 +544,7 @@ maxpool_layer parse_maxpool(list *options, size_params params) batch=params.batch; if(!(h && w && c)) error("Layer before maxpool layer must output image."); - maxpool_layer layer = make_maxpool_layer(batch,h,w,c,size,stride,padding); + maxpool_layer layer = make_maxpool_layer(batch, h, w, c, size, stride, padding, maxpool_depth, out_channels); return layer; }