From 4f72fcc015a7825cf0c3d3eee143c13853b1a2c3 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Fri, 10 May 2019 16:46:48 +0300
Subject: [PATCH] Added grouped convolution (depth-wise convolution)

---
 src/conv_lstm_layer.c        |  25 +--
 src/conv_lstm_layer.h        |   2 +-
 src/convolutional_kernels.cu | 150 +++++++------
 src/convolutional_layer.c    | 407 +++++++++++++++++++----------------
 src/convolutional_layer.h    |   2 +-
 src/crnn_layer.c             |   9 +-
 src/crnn_layer.h             |   2 +-
 src/detector.c               |  11 +-
 src/network.c                |   2 +-
 src/network_kernels.cu       |  18 +-
 src/parser.c                 |  23 +-
 src/yolo_layer.c             |   4 +
 12 files changed, 349 insertions(+), 306 deletions(-)

diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c
index 7d395ad0..1764cb90 100644
--- a/src/conv_lstm_layer.c
+++ b/src/conv_lstm_layer.c
@@ -32,7 +32,7 @@ static void increment_layer(layer *l, int steps)
 }
 
-layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor)
+layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int groups, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor)
 {
     fprintf(stderr, "CONV_LSTM Layer: %d x %d x %d image, %d filters\n", h, w, c, output_filters);
     /*
@@ -57,6 +57,7 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i
     l.h = h;
     l.w = w;
     l.c = c;
+    l.groups = groups;
     l.out_c = output_filters;
     l.inputs = h * w * c;
     l.xnor = xnor;
@@ -64,44 +65,44 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i
 
     // U
     l.uf = (layer*)malloc(sizeof(layer));
-    *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
+    *(l.uf) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
     l.uf->batch = batch;
     if (l.workspace_size < l.uf->workspace_size) l.workspace_size = l.uf->workspace_size;
 
     l.ui = (layer*)malloc(sizeof(layer));
-    *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
+    *(l.ui) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
     l.ui->batch = batch;
     if (l.workspace_size < l.ui->workspace_size) l.workspace_size = l.ui->workspace_size;
 
     l.ug = (layer*)malloc(sizeof(layer));
-    *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
+    *(l.ug) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
     l.ug->batch = batch;
     if (l.workspace_size < l.ug->workspace_size) l.workspace_size = l.ug->workspace_size;
 
     l.uo = (layer*)malloc(sizeof(layer));
-    *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
+    *(l.uo) = make_convolutional_layer(batch, steps, h, w, c, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0);
     l.uo->batch = batch;
     if (l.workspace_size < l.uo->workspace_size) l.workspace_size = l.uo->workspace_size;
 
     // W
     l.wf = (layer*)malloc(sizeof(layer));
-    *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size,
stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.wf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.wf->batch = batch; if (l.workspace_size < l.wf->workspace_size) l.workspace_size = l.wf->workspace_size; l.wi = (layer*)malloc(sizeof(layer)); - *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.wi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.wi->batch = batch; if (l.workspace_size < l.wi->workspace_size) l.workspace_size = l.wi->workspace_size; l.wg = (layer*)malloc(sizeof(layer)); - *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.wg) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.wg->batch = batch; if (l.workspace_size < l.wg->workspace_size) l.workspace_size = l.wg->workspace_size; l.wo = (layer*)malloc(sizeof(layer)); - *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.wo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.wo->batch = batch; if (l.workspace_size < l.wo->workspace_size) l.workspace_size = l.wo->workspace_size; @@ -109,21 +110,21 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i // V l.vf = (layer*)malloc(sizeof(layer)); if (l.peephole) { - *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.vf) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.vf->batch = batch; if (l.workspace_size < l.vf->workspace_size) l.workspace_size = l.vf->workspace_size; } l.vi = (layer*)malloc(sizeof(layer)); if (l.peephole) { - *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.vi) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.vi->batch = batch; if (l.workspace_size < l.vi->workspace_size) l.workspace_size = l.vi->workspace_size; } l.vo = (layer*)malloc(sizeof(layer)); if (l.peephole) { - *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.vo) = make_convolutional_layer(batch, steps, h, w, output_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.vo->batch = batch; if (l.workspace_size < l.vo->workspace_size) l.workspace_size = l.vo->workspace_size; } diff --git a/src/conv_lstm_layer.h b/src/conv_lstm_layer.h index 3bd9b4ef..56a57298 100644 --- a/src/conv_lstm_layer.h +++ b/src/conv_lstm_layer.h @@ -9,7 +9,7 @@ #ifdef __cplusplus extern "C" { #endif -layer make_conv_lstm_layer(int batch, int h, int w, int c, 
int output_filters, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor); +layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, int groups, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int peephole, int xnor); void resize_conv_lstm_layer(layer *l, int w, int h); void free_state_conv_lstm(layer l); void randomize_state_conv_lstm(layer l); diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index cfa4fe7e..9185279f 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -166,20 +166,16 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) { //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); if(l.binary){ - binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu); + binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu); swap_binary(&l); } if(l.xnor){ if (!l.align_bit_weights_gpu || state.train) { - //binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu); + //binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu); - fast_binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu); + fast_binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu); } - //swap_binary(&l); - //binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu); - //state.input = l.binary_input_gpu; - //cudaDeviceSynchronize(); if (l.align_bit_weights_gpu && !state.train && l.c >= 32) { @@ -187,11 +183,15 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //cudaError_t status = cudaSuccess; //int input_size = l.c*l.h*l.w*l.batch; - int m = l.n; - int k = l.size*l.size*l.c; + int m = l.n / l.groups; + int k = l.size*l.size*l.c / l.groups; int n = l.out_w*l.out_h; //float * a = l.weights_gpu; + // int i, j; + // for(i = 0; i < l.batch; ++i){ + // for (j = 0; j < l.groups; ++j) { + int ldb_align = l.lda_align; size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; //size_t t_intput_size = new_ldb * n; @@ -551,22 +551,25 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) #else fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); - int i; - int m = l.n; - int k = l.size*l.size*l.c; + int i, j; + int m = l.n / l.groups; + int k = l.size*l.size*l.c / l.groups; int n = l.out_w*l.out_h; for(i = 0; i < l.batch; ++i){ - float *im = state.input + i*l.c*l.h*l.w; - float * a = l.weights_gpu; - float * b = state.workspace; - float * c = l.output_gpu; - if (l.size == 1) { - b = im; + for (j = 0; j < l.groups; ++j) { + //float *im = state.input + i*l.c*l.h*l.w; + float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; + float *a = l.weights_gpu + j*l.nweights / l.groups; + float *b = state.workspace; + float *c = l.output_gpu + (i*l.groups + j)*n*m; + if (l.size == 1) { + b = im; + } + else { + im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + } + gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n); } - else { - im2col_ongpu(im, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); - } - gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n); } if (l.batch_normalize) { @@ -782,32 +785,38 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state 
backward_batchnorm_layer_gpu(l, state); } - int m = l.n; - int n = l.size*l.size*l.c; + int m = l.n / l.groups; + int n = l.size*l.size*l.c / l.groups; int k = l.out_w*l.out_h; - int i; + int i, j; for(i = 0; i < l.batch; ++i){ - float * a = l.delta_gpu; - float * b = state.workspace; - float * c = l.weight_updates_gpu; + for (j = 0; j < l.groups; ++j) { + float * a = l.delta_gpu + (i*l.groups + j)*m*k; + float * b = state.workspace; + float * c = l.weight_updates_gpu + j*l.nweights / l.groups; - im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); - gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n); + float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; - if(state.delta){ - if(l.binary || l.xnor) swap_binary(&l); - float * a = l.weights_gpu; - float * b = l.delta_gpu; - float * c = state.workspace; + im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n); - gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k); + if (state.delta) { + if (l.binary || l.xnor) swap_binary(&l); + float * a = l.weights_gpu + j*l.nweights / l.groups; + float * b = l.delta_gpu + (i*l.groups + j)*m*k; + float * c = state.workspace; - col2im_ongpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta + i*l.c*l.h*l.w); - if(l.binary || l.xnor) { - swap_binary(&l); + gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k); + + float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w; + + col2im_ongpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, delta); + if (l.binary || l.xnor) { + swap_binary(&l); + } + if (l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w); } - if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w); } } #endif @@ -821,43 +830,43 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state } } -void pull_convolutional_layer(convolutional_layer layer) +void pull_convolutional_layer(convolutional_layer l) { - cuda_pull_array_async(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size); - cuda_pull_array_async(layer.biases_gpu, layer.biases, layer.n); - cuda_pull_array_async(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size); - cuda_pull_array_async(layer.bias_updates_gpu, layer.bias_updates, layer.n); - if (layer.batch_normalize){ - cuda_pull_array_async(layer.scales_gpu, layer.scales, layer.n); - cuda_pull_array_async(layer.rolling_mean_gpu, layer.rolling_mean, layer.n); - cuda_pull_array_async(layer.rolling_variance_gpu, layer.rolling_variance, layer.n); + cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights); + cuda_pull_array_async(l.biases_gpu, l.biases, l.n); + cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights); + cuda_pull_array_async(l.bias_updates_gpu, l.bias_updates, l.n); + if (l.batch_normalize){ + cuda_pull_array_async(l.scales_gpu, l.scales, l.n); + cuda_pull_array_async(l.rolling_mean_gpu, l.rolling_mean, l.n); + cuda_pull_array_async(l.rolling_variance_gpu, l.rolling_variance, l.n); } - if (layer.adam){ - cuda_pull_array_async(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size); - cuda_pull_array_async(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size); + if (l.adam){ + cuda_pull_array_async(l.m_gpu, l.m, l.nweights); + cuda_pull_array_async(l.v_gpu, l.v, 
l.nweights); } CHECK_CUDA(cudaPeekAtLastError()); cudaStreamSynchronize(get_cuda_stream()); } -void push_convolutional_layer(convolutional_layer layer) +void push_convolutional_layer(convolutional_layer l) { - cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size); + cuda_push_array(l.weights_gpu, l.weights, l.nweights); #ifdef CUDNN_HALF - assert((layer.c*layer.n*layer.size*layer.size) > 0); - cuda_convert_f32_to_f16(layer.weights_gpu, layer.c*layer.n*layer.size*layer.size, layer.weights_gpu16); + assert(l.nweights > 0); + cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16); #endif - cuda_push_array(layer.biases_gpu, layer.biases, layer.n); - cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size); - cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n); - if (layer.batch_normalize){ - cuda_push_array(layer.scales_gpu, layer.scales, layer.n); - cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n); - cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n); + cuda_push_array(l.biases_gpu, l.biases, l.n); + cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights); + cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n); + if (l.batch_normalize){ + cuda_push_array(l.scales_gpu, l.scales, l.n); + cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n); + cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n); } - if (layer.adam){ - cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size); - cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size); + if (l.adam){ + cuda_push_array(l.m_gpu, l.m, l.nweights); + cuda_push_array(l.v_gpu, l.v, l.nweights); } CHECK_CUDA(cudaPeekAtLastError()); } @@ -868,11 +877,10 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init //float momentum = a.momentum; //float decay = a.decay; //int batch = a.batch; - int size = l.size*l.size*l.c*l.n; // old if (l.adam) { //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t); - adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch, l.t); + adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t); adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); if (l.scales_gpu) { @@ -883,9 +891,9 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); //axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); //scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); - axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_ongpu(size, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_ongpu(size, momentum, l.weight_updates_gpu, 1); + axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1); axpy_ongpu(l.n, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1); 
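
Note on the indexing used throughout the GPU paths above: each group j only ever touches its own slice of the buffers, which is what the offsets weights + j*l.nweights/l.groups, input + (i*l.groups + j)*(l.c/l.groups)*l.h*l.w and output + (i*l.groups + j)*n*m express. The following stand-alone sketch is not darknet code; it replaces the patch's im2col+GEMM path with a naive direct convolution (NCHW layout, stride 1, zero padding, toy sizes) purely to show the same channel/weight partitioning.

/* Sketch: grouped convolution partitioning as in the patch, with
 * nweights = (c/groups)*n*size*size, group j seeing input channels
 * [j*c/groups, (j+1)*c/groups) and producing filters [j*n/groups, (j+1)*n/groups). */
#include <stdio.h>
#include <stdlib.h>

static void grouped_conv_naive(const float *in, const float *w, float *out,
                               int c, int h, int wd, int n, int size, int groups)
{
    int pad = size / 2;
    int c_g = c / groups;      /* input channels per group  */
    int n_g = n / groups;      /* output filters per group  */
    int g, f, y, x, k, ky, kx;
    for (g = 0; g < groups; ++g) {
        for (f = 0; f < n_g; ++f) {
            int out_f = g*n_g + f;                 /* filter index in the whole layer */
            for (y = 0; y < h; ++y) for (x = 0; x < wd; ++x) {
                float sum = 0;
                for (k = 0; k < c_g; ++k) {
                    int in_c = g*c_g + k;          /* group g only reads its own channel slice */
                    for (ky = 0; ky < size; ++ky) for (kx = 0; kx < size; ++kx) {
                        int iy = y + ky - pad, ix = x + kx - pad;
                        if (iy < 0 || iy >= h || ix < 0 || ix >= wd) continue;
                        /* weight offset for group g starts at g*nweights/groups */
                        float wv = w[((out_f*c_g + k)*size + ky)*size + kx];
                        sum += wv * in[(in_c*h + iy)*wd + ix];
                    }
                }
                out[(out_f*h + y)*wd + x] = sum;
            }
        }
    }
}

int main(void)
{
    /* toy sizes: 4 input channels, 4 filters, groups=4 => depth-wise */
    int c = 4, h = 5, wd = 5, n = 4, size = 3, groups = 4;
    int nweights = (c/groups) * n * size * size;   /* matches l.nweights in the patch */
    int i;
    float *in  = calloc(c*h*wd, sizeof(float));
    float *w   = calloc(nweights, sizeof(float));
    float *out = calloc(n*h*wd, sizeof(float));
    for (i = 0; i < c*h*wd; ++i) in[i] = 1.0f;
    for (i = 0; i < nweights; ++i) w[i] = 0.1f;
    grouped_conv_naive(in, w, out, c, h, wd, n, size, groups);
    printf("nweights = %d, out[0] = %f\n", nweights, out[0]);
    free(in); free(w); free(out);
    return 0;
}

Setting groups equal to the number of input channels makes this a depth-wise convolution; since nweights = (c/groups)*n*size*size, both the weight count and the BFLOPs estimate used later in the patch (2*nweights*out_h*out_w) shrink by a factor of groups relative to a standard convolution.
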
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 16cb6b93..f9c0ee01 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -140,7 +140,7 @@ size_t get_workspace_size32(layer l){ if (workspace_size < re_packed_input_size) workspace_size = re_packed_input_size; return workspace_size; } - return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float); + return (size_t)l.out_h*l.out_w*l.size*l.size*(l.c / l.groups)*sizeof(float); } size_t get_workspace_size16(layer l) { @@ -231,9 +231,14 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) // 3. FP32 Master Copy of Weights // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH)); + CHECK_CUDNN(cudnnSetConvolutionGroupCount(l->convDesc, l->groups)); #if((CUDNN_MAJOR*10 + CUDNN_MINOR) >= 72) // cuDNN >= 7.2 CHECK_CUDNN(cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION)); #endif +#else //if(CUDNN_MAJOR >= 7) + if (l->groups > 1) { + error("CUDNN < 7 doesn't support groups, please upgrade!"); + } #endif // INT8_CONFIG, INT8_EXT_CONFIG, INT8x4_CONFIG and INT8x4_EXT_CONFIG are only supported @@ -243,23 +248,23 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) // backward delta CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w)); CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w)); - CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size)); + CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size)); // forward CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->c, l->h, l->w)); CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w)); - CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size)); + CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size)); //#ifdef CUDNN_HALF // backward delta CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w)); CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w)); - CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size)); + CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size)); // forward CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->srcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w)); CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->dstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w)); - CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size)); + CHECK_CUDNN(cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c / l->groups, l->size, l->size)); // batch norm CHECK_CUDNN(cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, 
CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w)); @@ -326,17 +331,21 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) #endif #endif -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index) +convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index) { int total_batch = batch*steps; int i; convolutional_layer l = { (LAYER_TYPE)0 }; l.type = CONVOLUTIONAL; + if (xnor) groups = 1; // disable groups for XNOR-net + if (groups < 1) groups = 1; + l.index = index; l.h = h; l.w = w; l.c = c; + l.groups = groups; l.n = n; l.binary = binary; l.xnor = xnor; @@ -348,17 +357,17 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.pad = padding; l.batch_normalize = batch_normalize; l.learning_rate_scale = 1; - l.nweights = l.c*l.n*l.size*l.size; + l.nweights = (c / groups) * n * size * size; - l.weights = (float*)calloc(c * n * size * size, sizeof(float)); - l.weight_updates = (float*)calloc(c * n * size * size, sizeof(float)); + l.weights = (float*)calloc(l.nweights, sizeof(float)); + l.weight_updates = (float*)calloc(l.nweights, sizeof(float)); l.biases = (float*)calloc(n, sizeof(float)); l.bias_updates = (float*)calloc(n, sizeof(float)); // float scale = 1./sqrt(size*size*c); - float scale = sqrt(2./(size*size*c)); - for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1); + float scale = sqrt(2./(size*size*c/groups)); + for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_uniform(-1, 1); // rand_normal(); int out_h = convolutional_out_height(l); int out_w = convolutional_out_width(l); l.out_h = out_h; @@ -375,12 +384,12 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.backward = backward_convolutional_layer; l.update = update_convolutional_layer; if(binary){ - l.binary_weights = (float*)calloc(c * n * size * size, sizeof(float)); - l.cweights = (char*)calloc(c * n * size * size, sizeof(char)); + l.binary_weights = (float*)calloc(l.nweights, sizeof(float)); + l.cweights = (char*)calloc(l.nweights, sizeof(char)); l.scales = (float*)calloc(n, sizeof(float)); } if(xnor){ - l.binary_weights = (float*)calloc(c * n * size * size, sizeof(float)); + l.binary_weights = (float*)calloc(l.nweights, sizeof(float)); l.binary_input = (float*)calloc(l.inputs * l.batch, sizeof(float)); int align = 32;// 8; @@ -420,8 +429,8 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, } if(adam){ l.adam = 1; - l.m = (float*)calloc(c * n * size * size, sizeof(float)); - l.v = (float*)calloc(c * n * size * size, sizeof(float)); + l.m = (float*)calloc(l.nweights, sizeof(float)); + l.v = (float*)calloc(l.nweights, sizeof(float)); l.bias_m = (float*)calloc(n, sizeof(float)); l.scale_m = (float*)calloc(n, sizeof(float)); l.bias_v = (float*)calloc(n, sizeof(float)); @@ -435,19 +444,19 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, if(gpu_index >= 0){ if (adam) { - l.m_gpu = cuda_make_array(l.m, c*n*size*size); - l.v_gpu = cuda_make_array(l.v, c*n*size*size); + l.m_gpu = cuda_make_array(l.m, l.nweights); + l.v_gpu = cuda_make_array(l.v, l.nweights); l.bias_m_gpu = 
cuda_make_array(l.bias_m, n); l.bias_v_gpu = cuda_make_array(l.bias_v, n); l.scale_m_gpu = cuda_make_array(l.scale_m, n); l.scale_v_gpu = cuda_make_array(l.scale_v, n); } - l.weights_gpu = cuda_make_array(l.weights, c*n*size*size); - l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size); + l.weights_gpu = cuda_make_array(l.weights, l.nweights); + l.weight_updates_gpu = cuda_make_array(l.weight_updates, l.nweights); #ifdef CUDNN_HALF - l.weights_gpu16 = cuda_make_array(NULL, c*n*size*size / 2 + 1); //cuda_make_array(l.weights, c*n*size*size / 2); - l.weight_updates_gpu16 = cuda_make_array(NULL, c*n*size*size / 2 + 1); //cuda_make_array(l.weight_updates, c*n*size*size / 2); + l.weights_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1); + l.weight_updates_gpu16 = cuda_make_array(NULL, l.nweights / 2 + 1); #endif l.biases_gpu = cuda_make_array(l.biases, n); @@ -457,10 +466,10 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.delta_gpu = cuda_make_array(l.delta, total_batch*out_h*out_w*n); if(binary){ - l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size); + l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights); } if(xnor){ - l.binary_weights_gpu = cuda_make_array(l.weights, c*n*size*size); + l.binary_weights_gpu = cuda_make_array(l.weights, l.nweights); l.mean_arr_gpu = cuda_make_array(0, l.n); l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch); } @@ -490,7 +499,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, l.workspace_size = get_convolutional_workspace_size(l); //fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c); - l.bflops = (2.0 * l.n * l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.; + l.bflops = (2.0 * l.nweights * l.out_h*l.out_w) / 1000000000.; if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB"); else if (l.xnor) fprintf(stderr, "convX "); else fprintf(stderr, "conv "); @@ -504,8 +513,8 @@ void denormalize_convolutional_layer(convolutional_layer l) int i, j; for(i = 0; i < l.n; ++i){ float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .00001); - for(j = 0; j < l.c*l.size*l.size; ++j){ - l.weights[i*l.c*l.size*l.size + j] *= scale; + for(j = 0; j < l.nweights; ++j){ + l.weights[i*l.nweights + j] *= scale; } l.biases[i] -= l.rolling_mean[i] * scale; l.scales[i] = 1; @@ -516,7 +525,7 @@ void denormalize_convolutional_layer(convolutional_layer l) void test_convolutional_layer() { - convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0, 0); + convolutional_layer l = make_convolutional_layer(1, 1, 5, 5, 3, 2, 1, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0, 0); l.batch_normalize = 1; float data[] = {1,1,1,1,1, 1,1,1,1,1, @@ -691,8 +700,8 @@ void bit_to_float(unsigned char *src, float *dst, size_t size, size_t filters, f void binary_align_weights(convolutional_layer *l) { - int m = l->n; - int k = l->size*l->size*l->c; + int m = l->n; // (l->n / l->groups) + int k = l->size*l->size*l->c; // ->size*l->size*(l->c / l->groups) size_t new_lda = k + (l->lda_align - k % l->lda_align); // (k / 8 + 1) * 8; l->new_lda = new_lda; @@ -823,13 +832,13 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) { int out_h = convolutional_out_height(l); int out_w = convolutional_out_width(l); - int i; + int i, j; fill_cpu(l.outputs*l.batch, 0, l.output, 1); if (l.xnor && (!l.align_bit_weights || state.train)) { if (!l.align_bit_weights || 
state.train) { - binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights); + binarize_weights(l.weights, l.n, l.nweights, l.binary_weights); //printf("\n binarize_weights l.align_bit_weights = %p \n", l.align_bit_weights); } swap_binary(&l); @@ -837,147 +846,150 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) state.input = l.binary_input; } - int m = l.n; - int k = l.size*l.size*l.c; + int m = l.n / l.groups; + int k = l.size*l.size*l.c / l.groups; int n = out_h*out_w; - float *a = l.weights; - float *b = state.workspace; - float *c = l.output; - static int u = 0; u++; for(i = 0; i < l.batch; ++i){ + for (j = 0; j < l.groups; ++j) { - //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); - //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n); - if (l.xnor && l.align_bit_weights && !state.train) - { - memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float)); + float *a = l.weights + j*l.nweights / l.groups; + float *b = state.workspace; + float *c = l.output + (i*l.groups + j)*n*m; - if(l.c % 32 == 0) + //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); + //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n); + if (l.xnor && l.align_bit_weights && !state.train) { - //printf(" l.index = %d - new XNOR \n", l.index); + memset(b, 0, l.bit_align*l.size*l.size*l.c * sizeof(float)); - int ldb_align = l.lda_align; - size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; - //size_t t_intput_size = new_ldb * l.bit_align;// n; - //size_t t_bit_input_size = t_intput_size / 8;// +1; - - int re_packed_input_size = l.c * l.w * l.h; - memset(state.workspace, 0, re_packed_input_size * sizeof(float)); - - const size_t new_c = l.c / 32; - size_t in_re_packed_input_size = new_c * l.w * l.h + 1; - memset(l.bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t)); - - //float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float)); - //uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t)); - - // float32x4 by channel (as in cuDNN) - repack_input(state.input, state.workspace, l.w, l.h, l.c); - - // 32 x floats -> 1 x uint32_t - float_to_bit(state.workspace, (unsigned char *)l.bin_re_packed_input, l.c * l.w * l.h); - - //free(re_packed_input); - - // slow - convolution the packed inputs and weights: float x 32 by channel (as in cuDNN) - //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output, - // l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); - - // // then exit from if() - - - im2col_cpu_custom((float *)l.bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); - //im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b); - - //free(bin_re_packed_input); - - int new_k = l.size*l.size*l.c / 32; - - // good for (l.c == 64) - //gemm_nn_bin_32bit_packed(m, n, new_k, 1, - // l.align_bit_weights, l.new_lda/32, - // b, n, - // c, n, l.mean_arr); - -// // then exit from if() - - transpose_uint32((uint32_t *)state.workspace, (uint32_t*)l.t_bit_input, new_k, n, n, new_ldb); - - // the main GEMM function - gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr); - - // // alternative GEMM - //gemm_nn_bin_transposed_32bit_packed(m, n, new_k, 1, - // l.align_bit_weights, l.new_lda/32, - // t_bit_input, new_ldb / 32, - // c, n, l.mean_arr); - - //free(t_bit_input); - - } - else - { // else (l.c % 32 != 0) - - //-------------------------------------------------------- - 
//printf(" l.index = %d - old XNOR \n", l.index); - - //im2col_cpu_custom_align(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b, l.bit_align); - im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align); - - //size_t output_size = l.outputs; - //float *count_output = calloc(output_size, sizeof(float)); - //size_t bit_output_size = output_size / 8 + 1; - //char *bit_output = calloc(bit_output_size, sizeof(char)); - - //size_t intput_size = n * k; // (out_h*out_w) X (l.size*l.size*l.c) : after im2col() - //size_t bit_input_size = intput_size / 8 + 1; - //char *bit_input = calloc(bit_input_size, sizeof(char)); - - //size_t weights_size = k * m; //l.size*l.size*l.c*l.n; - //size_t bit_weights_size = weights_size / 8 + 1; - - //char *bit_weights = calloc(bit_weights_size, sizeof(char)); - //float *mean_arr = calloc(l.n, sizeof(float)); - - // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits) + if (l.c % 32 == 0) { - //size_t ldb_align = 256; // 256 bit for AVX2 - int ldb_align = l.lda_align; - size_t new_ldb = k + (ldb_align - k%ldb_align); - size_t t_intput_size = binary_transpose_align_input(k, n, state.workspace, &l.t_bit_input, ldb_align, l.bit_align); + //printf(" l.index = %d - new XNOR \n", l.index); - // 5x times faster than gemm()-float32 + int ldb_align = l.lda_align; + size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; + //size_t t_intput_size = new_ldb * l.bit_align;// n; + //size_t t_bit_input_size = t_intput_size / 8;// +1; + + int re_packed_input_size = l.c * l.w * l.h; + memset(state.workspace, 0, re_packed_input_size * sizeof(float)); + + const size_t new_c = l.c / 32; + size_t in_re_packed_input_size = new_c * l.w * l.h + 1; + memset(l.bin_re_packed_input, 0, in_re_packed_input_size * sizeof(uint32_t)); + + //float *re_packed_input = calloc(l.c * l.w * l.h, sizeof(float)); + //uint32_t *bin_re_packed_input = calloc(new_c * l.w * l.h + 1, sizeof(uint32_t)); + + // float32x4 by channel (as in cuDNN) + repack_input(state.input, state.workspace, l.w, l.h, l.c); + + // 32 x floats -> 1 x uint32_t + float_to_bit(state.workspace, (unsigned char *)l.bin_re_packed_input, l.c * l.w * l.h); + + //free(re_packed_input); + + // slow - convolution the packed inputs and weights: float x 32 by channel (as in cuDNN) + //convolution_repacked((uint32_t *)bin_re_packed_input, (uint32_t *)l.align_bit_weights, l.output, + // l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); + + // // then exit from if() + + + im2col_cpu_custom((float *)l.bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + //im2col_cpu((float *)bin_re_packed_input, new_c, l.h, l.w, l.size, l.stride, l.pad, b); + + //free(bin_re_packed_input); + + int new_k = l.size*l.size*l.c / 32; + + // good for (l.c == 64) + //gemm_nn_bin_32bit_packed(m, n, new_k, 1, + // l.align_bit_weights, l.new_lda/32, + // b, n, + // c, n, l.mean_arr); + + // // then exit from if() + + transpose_uint32((uint32_t *)state.workspace, (uint32_t*)l.t_bit_input, new_k, n, n, new_ldb); + + // the main GEMM function gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr); - //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, bit_weights, k, t_bit_input, new_ldb, c, n, mean_arr); + // // alternative GEMM + //gemm_nn_bin_transposed_32bit_packed(m, n, new_k, 1, + // l.align_bit_weights, l.new_lda/32, + // t_bit_input, new_ldb 
/ 32, + // c, n, l.mean_arr); - //free(t_input); //free(t_bit_input); - //} + + } + else + { // else (l.c % 32 != 0) + + //-------------------------------------------------------- + //printf(" l.index = %d - old XNOR \n", l.index); + + //im2col_cpu_custom_align(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b, l.bit_align); + im2col_cpu_custom_bin(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align); + + //size_t output_size = l.outputs; + //float *count_output = calloc(output_size, sizeof(float)); + //size_t bit_output_size = output_size / 8 + 1; + //char *bit_output = calloc(bit_output_size, sizeof(char)); + + //size_t intput_size = n * k; // (out_h*out_w) X (l.size*l.size*l.c) : after im2col() + //size_t bit_input_size = intput_size / 8 + 1; + //char *bit_input = calloc(bit_input_size, sizeof(char)); + + //size_t weights_size = k * m; //l.size*l.size*l.c*l.n; + //size_t bit_weights_size = weights_size / 8 + 1; + + //char *bit_weights = calloc(bit_weights_size, sizeof(char)); + //float *mean_arr = calloc(l.n, sizeof(float)); + + // transpose B from NxK to KxN (x-axis (ldb = l.size*l.size*l.c) - should be multiple of 8 bits) + { + //size_t ldb_align = 256; // 256 bit for AVX2 + int ldb_align = l.lda_align; + size_t new_ldb = k + (ldb_align - k%ldb_align); + size_t t_intput_size = binary_transpose_align_input(k, n, state.workspace, &l.t_bit_input, ldb_align, l.bit_align); + + // 5x times faster than gemm()-float32 + gemm_nn_custom_bin_mean_transposed(m, n, k, 1, (unsigned char*)l.align_bit_weights, new_ldb, (unsigned char*)l.t_bit_input, new_ldb, c, n, l.mean_arr); + + //gemm_nn_custom_bin_mean_transposed(m, n, k, 1, bit_weights, k, t_bit_input, new_ldb, c, n, mean_arr); + + //free(t_input); + //free(t_bit_input); + //} + } + } + add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); + + //activate_array(l.output, m*n*l.batch, l.activation); + activate_array_cpu_custom(l.output, m*n*l.batch, l.activation); + return; + } + else { + //printf(" l.index = %d - FP32 \n", l.index); + im2col_cpu(state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w, + l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b); - add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); - - //activate_array(l.output, m*n*l.batch, l.activation); - activate_array_cpu_custom(l.output, m*n*l.batch, l.activation); - return; - + gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n); + // bit-count to float + } + c += n*m; + state.input += l.c*l.h*l.w; } - else { - //printf(" l.index = %d - FP32 \n", l.index); - im2col_cpu_custom(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b); - - gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n); - // bit-count to float - } - c += n*m; - state.input += l.c*l.h*l.w; } if(l.batch_normalize){ @@ -986,63 +998,72 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); //activate_array(l.output, m*n*l.batch, l.activation); - activate_array_cpu_custom(l.output, m*n*l.batch, l.activation); + activate_array_cpu_custom(l.output, l.outputs*l.batch, l.activation); if(l.binary || l.xnor) swap_binary(&l); } + void backward_convolutional_layer(convolutional_layer l, network_state state) { - int i; - int m = l.n; - int n = l.size*l.size*l.c; - int k = convolutional_out_height(l)* - convolutional_out_width(l); + int i, j; + int m = l.n / l.groups; + int n = l.size*l.size*l.c / l.groups; + int k = l.out_w*l.out_h; - gradient_array(l.output, m*k*l.batch, l.activation, l.delta); - 
backward_bias(l.bias_updates, l.delta, l.batch, l.n, k); + gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); - if(l.batch_normalize){ + if (l.batch_normalize) { backward_batchnorm_layer(l, state); } + else { + backward_bias(l.bias_updates, l.delta, l.batch, l.n, k); + } - for(i = 0; i < l.batch; ++i){ - float *a = l.delta + i*m*k; - float *b = state.workspace; - float *c = l.weight_updates; + for (i = 0; i < l.batch; ++i) { + for (j = 0; j < l.groups; ++j) { + float *a = l.delta + (i*l.groups + j)*m*k; + float *b = state.workspace; + float *c = l.weight_updates + j*l.nweights / l.groups; - float *im = state.input+i*l.c*l.h*l.w; + float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w; - im2col_cpu(im, l.c, l.h, l.w, + im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b); - gemm(0,1,m,n,k,1,a,k,b,k,1,c,n); + gemm(0, 1, m, n, k, 1, a, k, b, k, 1, c, n); - if(state.delta){ - a = l.weights; - b = l.delta + i*m*k; - c = state.workspace; + if (state.delta) { + a = l.weights + j*l.nweights / l.groups; + b = l.delta + (i*l.groups + j)*m*k; + c = state.workspace; - gemm(1,0,n,k,m,1,a,n,b,k,0,c,k); + gemm(1, 0, n, k, m, 1, a, n, b, k, 0, c, k); - col2im_cpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w); + col2im_cpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, + l.pad, state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w); + } } } } -void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay) +void update_convolutional_layer(convolutional_layer l, update_args a) { - int size = l.size*l.size*l.c*l.n; - axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1); + float learning_rate = a.learning_rate*l.learning_rate_scale; + float momentum = a.momentum; + float decay = a.decay; + int batch = a.batch; + + axpy_cpu(l.n, learning_rate / batch, l.bias_updates, 1, l.biases, 1); scal_cpu(l.n, momentum, l.bias_updates, 1); - if(l.scales){ - axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1); + if (l.scales) { + axpy_cpu(l.n, learning_rate / batch, l.scale_updates, 1, l.scales, 1); scal_cpu(l.n, momentum, l.scale_updates, 1); } - axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1); - axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1); - scal_cpu(size, momentum, l.weight_updates, 1); + axpy_cpu(l.nweights, -decay*batch, l.weights, 1, l.weight_updates, 1); + axpy_cpu(l.nweights, learning_rate / batch, l.weight_updates, 1, l.weights, 1); + scal_cpu(l.nweights, momentum, l.weight_updates, 1); } @@ -1050,14 +1071,14 @@ image get_convolutional_weight(convolutional_layer l, int i) { int h = l.size; int w = l.size; - int c = l.c; - return float_to_image(w,h,c,l.weights+i*h*w*c); + int c = l.c / l.groups; + return float_to_image(w, h, c, l.weights + i*h*w*c); } void rgbgr_weights(convolutional_layer l) { int i; - for(i = 0; i < l.n; ++i){ + for (i = 0; i < l.n; ++i) { image im = get_convolutional_weight(l, i); if (im.c == 3) { rgbgr_image(im); @@ -1068,7 +1089,7 @@ void rgbgr_weights(convolutional_layer l) void rescale_weights(convolutional_layer l, float scale, float trans) { int i; - for(i = 0; i < l.n; ++i){ + for (i = 0; i < l.n; ++i) { image im = get_convolutional_weight(l, i); if (im.c == 3) { scale_image(im, scale); @@ -1080,12 +1101,18 @@ void rescale_weights(convolutional_layer l, float scale, float trans) image *get_weights(convolutional_layer l) { - image* weights = (image*)calloc(l.n, 
sizeof(image)); + image *weights = calloc(l.n, sizeof(image)); int i; - for(i = 0; i < l.n; ++i){ + for (i = 0; i < l.n; ++i) { weights[i] = copy_image(get_convolutional_weight(l, i)); - //normalize_image(weights[i]); + normalize_image(weights[i]); + /* + char buff[256]; + sprintf(buff, "filter%d", i); + save_image(weights[i], buff); + */ } + //error("hey"); return weights; } @@ -1102,4 +1129,4 @@ image *visualize_convolutional_layer(convolutional_layer l, char *window, image //save_image(dc, buff); free_image(dc); return single_weights; -} +} \ No newline at end of file diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index da7b8feb..dc00dabf 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -30,7 +30,7 @@ void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16); #endif size_t get_convolutional_workspace_size(layer l); -convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index); +convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, int c, int n, int groups, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index); void denormalize_convolutional_layer(convolutional_layer l); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); void forward_convolutional_layer(const convolutional_layer layer, network_state state); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 80a0c7f8..f466508e 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps) #endif } -layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int xnor) +layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int xnor) { fprintf(stderr, "CRNN Layer: %d x %d x %d image, %d filters\n", h,w,c,output_filters); batch = batch / steps; @@ -40,6 +40,7 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.h = h; l.w = w; l.c = c; + l.groups = groups; l.out_c = output_filters; l.inputs = h * w * c; l.hidden = h * w * hidden_filters; @@ -48,17 +49,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.state = (float*)calloc(l.hidden * l.batch * (l.steps + 1), sizeof(float)); l.input_layer = (layer*)malloc(sizeof(layer)); - *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.input_layer) = make_convolutional_layer(batch, steps, h, w, c, hidden_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.input_layer->batch = batch; if (l.workspace_size < l.input_layer->workspace_size) l.workspace_size = l.input_layer->workspace_size; l.self_layer = (layer*)malloc(sizeof(layer)); - *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.self_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, hidden_filters, groups, size, stride, pad, 
activation, batch_normalize, 0, xnor, 0, 0, 0); l.self_layer->batch = batch; if (l.workspace_size < l.self_layer->workspace_size) l.workspace_size = l.self_layer->workspace_size; l.output_layer = (layer*)malloc(sizeof(layer)); - *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); + *(l.output_layer) = make_convolutional_layer(batch, steps, h, w, hidden_filters, output_filters, groups, size, stride, pad, activation, batch_normalize, 0, xnor, 0, 0, 0); l.output_layer->batch = batch; if (l.workspace_size < l.output_layer->workspace_size) l.workspace_size = l.output_layer->workspace_size; diff --git a/src/crnn_layer.h b/src/crnn_layer.h index 58944182..55feb599 100644 --- a/src/crnn_layer.h +++ b/src/crnn_layer.h @@ -9,7 +9,7 @@ #ifdef __cplusplus extern "C" { #endif -layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int xnor); +layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int groups, int steps, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int xnor); void resize_crnn_layer(layer *l, int w, int h); void free_state_crnn(layer l); diff --git a/src/detector.c b/src/detector.c index a34354f1..4fc74585 100644 --- a/src/detector.c +++ b/src/detector.c @@ -42,19 +42,18 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i cuda_set_device(gpus[0]); printf(" Prepare additional network for mAP calculation...\n"); net_map = parse_network_cfg_custom(cfgfile, 1, 1); + const int net_classes = net_map.layers[net_map.n - 1].classes; int k; // free memory unnecessary arrays - for (k = 0; k < net_map.n; ++k) { - free_layer(net_map.layers[k]); - } + for (k = 0; k < net_map.n - 1; ++k) free_layer(net_map.layers[k]); char *name_list = option_find_str(options, "names", "data/names.list"); int names_size = 0; char **names = get_labels_custom(name_list, &names_size); - if (net_map.layers[net_map.n - 1].classes != names_size) { + if (net_classes != names_size) { printf(" Error: in the file %s number of names %d that isn't equal to classes=%d in the file %s \n", - name_list, names_size, net_map.layers[net_map.n - 1].classes, cfgfile); - if (net_map.layers[net_map.n - 1].classes > names_size) getchar(); + name_list, names_size, net_classes, cfgfile); + if (net_classes > names_size) getchar(); } } diff --git a/src/network.c b/src/network.c index b25aa639..32f2b9e2 100644 --- a/src/network.c +++ b/src/network.c @@ -997,7 +997,7 @@ void fuse_conv_batchnorm(network net) { l->biases[f] = l->biases[f] - (double)l->scales[f] * l->rolling_mean[f] / (sqrt((double)l->rolling_variance[f]) + .000001f); - const size_t filter_size = l->size*l->size*l->c; + const size_t filter_size = l->size*l->size*l->c / l->groups; int i; for (i = 0; i < filter_size; ++i) { int w_index = f*filter_size + i; diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 6e50df0b..40f71eb0 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -237,7 +237,7 @@ void pull_updates(layer l) { if(l.type == CONVOLUTIONAL){ cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n); - cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.n*l.size*l.size*l.c); + cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.nweights); if(l.scale_updates) cuda_pull_array(l.scale_updates_gpu, 
l.scale_updates, l.n); } else if(l.type == CONNECTED){ cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.outputs); @@ -249,7 +249,7 @@ void push_updates(layer l) { if(l.type == CONVOLUTIONAL){ cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n); - cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.n*l.size*l.size*l.c); + cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights); if(l.scale_updates) cuda_push_array(l.scale_updates_gpu, l.scale_updates, l.n); } else if(l.type == CONNECTED){ cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.outputs); @@ -271,7 +271,7 @@ void merge_weights(layer l, layer base) { if (l.type == CONVOLUTIONAL) { axpy_cpu(l.n, 1, l.biases, 1, base.biases, 1); - axpy_cpu(l.n*l.size*l.size*l.c, 1, l.weights, 1, base.weights, 1); + axpy_cpu(l.nweights, 1, l.weights, 1, base.weights, 1); if (l.scales) { axpy_cpu(l.n, 1, l.scales, 1, base.scales, 1); } @@ -285,7 +285,7 @@ void scale_weights(layer l, float s) { if (l.type == CONVOLUTIONAL) { scal_cpu(l.n, s, l.biases, 1); - scal_cpu(l.n*l.size*l.size*l.c, s, l.weights, 1); + scal_cpu(l.nweights, s, l.weights, 1); if (l.scales) { scal_cpu(l.n, s, l.scales, 1); } @@ -300,7 +300,7 @@ void pull_weights(layer l) { if(l.type == CONVOLUTIONAL){ cuda_pull_array(l.biases_gpu, l.biases, l.n); - cuda_pull_array(l.weights_gpu, l.weights, l.n*l.size*l.size*l.c); + cuda_pull_array(l.weights_gpu, l.weights, l.nweights); if(l.scales) cuda_pull_array(l.scales_gpu, l.scales, l.n); } else if(l.type == CONNECTED){ cuda_pull_array(l.biases_gpu, l.biases, l.outputs); @@ -312,7 +312,7 @@ void push_weights(layer l) { if(l.type == CONVOLUTIONAL){ cuda_push_array(l.biases_gpu, l.biases, l.n); - cuda_push_array(l.weights_gpu, l.weights, l.n*l.size*l.size*l.c); + cuda_push_array(l.weights_gpu, l.weights, l.nweights); if(l.scales) cuda_push_array(l.scales_gpu, l.scales, l.n); } else if(l.type == CONNECTED){ cuda_push_array(l.biases_gpu, l.biases, l.outputs); @@ -324,7 +324,7 @@ void distribute_weights(layer l, layer base) { if(l.type == CONVOLUTIONAL){ cuda_push_array(l.biases_gpu, base.biases, l.n); - cuda_push_array(l.weights_gpu, base.weights, l.n*l.size*l.size*l.c); + cuda_push_array(l.weights_gpu, base.weights, l.nweights); if(base.scales) cuda_push_array(l.scales_gpu, base.scales, l.n); } else if(l.type == CONNECTED){ cuda_push_array(l.biases_gpu, base.biases, l.outputs); @@ -337,7 +337,7 @@ void merge_updates(layer l, layer base) { if (l.type == CONVOLUTIONAL) { axpy_cpu(l.n, 1, l.bias_updates, 1, base.bias_updates, 1); - axpy_cpu(l.n*l.size*l.size*l.c, 1, l.weight_updates, 1, base.weight_updates, 1); + axpy_cpu(l.nweights, 1, l.weight_updates, 1, base.weight_updates, 1); if (l.scale_updates) { axpy_cpu(l.n, 1, l.scale_updates, 1, base.scale_updates, 1); } @@ -351,7 +351,7 @@ void distribute_updates(layer l, layer base) { if(l.type == CONVOLUTIONAL){ cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.n); - cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.n*l.size*l.size*l.c); + cuda_push_array(l.weight_updates_gpu, base.weight_updates, l.nweights); if(base.scale_updates) cuda_push_array(l.scale_updates_gpu, base.scale_updates, l.n); } else if(l.type == CONNECTED){ cuda_push_array(l.bias_updates_gpu, base.bias_updates, l.outputs); diff --git a/src/parser.c b/src/parser.c index 9ca3d5e2..cca3641b 100644 --- a/src/parser.c +++ b/src/parser.c @@ -150,6 +150,7 @@ local_layer parse_local(list *options, size_params params) convolutional_layer parse_convolutional(list *options, size_params params) { 
int n = option_find_int(options, "filters",1); + int groups = option_find_int_quiet(options, "groups", 1); int size = option_find_int(options, "size",1); int stride = option_find_int(options, "stride",1); int pad = option_find_int_quiet(options, "pad",0); @@ -170,7 +171,7 @@ convolutional_layer parse_convolutional(list *options, size_params params) int xnor = option_find_int_quiet(options, "xnor", 0); int use_bin_output = option_find_int_quiet(options, "bin_output", 0); - convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index); + convolutional_layer layer = make_convolutional_layer(batch,1,h,w,c,n,groups,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index); layer.flipped = option_find_int_quiet(options, "flipped", 0); layer.dot = option_find_float_quiet(options, "dot", 0); @@ -193,12 +194,13 @@ layer parse_crnn(list *options, size_params params) int output_filters = option_find_int(options, "output",1); int hidden_filters = option_find_int(options, "hidden",1); + int groups = option_find_int_quiet(options, "groups", 1); char *activation_s = option_find_str(options, "activation", "logistic"); ACTIVATION activation = get_activation(activation_s); int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); int xnor = option_find_int_quiet(options, "xnor", 0); - layer l = make_crnn_layer(params.batch, params.w, params.h, params.c, hidden_filters, output_filters, params.time_steps, size, stride, padding, activation, batch_normalize, xnor); + layer l = make_crnn_layer(params.batch, params.w, params.h, params.c, hidden_filters, output_filters, groups, params.time_steps, size, stride, padding, activation, batch_normalize, xnor); l.shortcut = option_find_int_quiet(options, "shortcut", 0); @@ -251,13 +253,14 @@ layer parse_conv_lstm(list *options, size_params params) if (pad) padding = size / 2; int output_filters = option_find_int(options, "output", 1); + int groups = option_find_int_quiet(options, "groups", 1); char *activation_s = option_find_str(options, "activation", "LINEAR"); ACTIVATION activation = get_activation(activation_s); int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); int xnor = option_find_int_quiet(options, "xnor", 0); int peephole = option_find_int_quiet(options, "peephole", 1); - layer l = make_conv_lstm_layer(params.batch, params.w, params.h, params.c, output_filters, params.time_steps, size, stride, padding, activation, batch_normalize, peephole, xnor); + layer l = make_conv_lstm_layer(params.batch, params.w, params.h, params.c, output_filters, groups, params.time_steps, size, stride, padding, activation, batch_normalize, peephole, xnor); l.shortcut = option_find_int_quiet(options, "shortcut", 0); @@ -989,8 +992,8 @@ void save_convolutional_weights_binary(layer l, FILE *fp) pull_convolutional_layer(l); } #endif - binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights); - int size = l.c*l.size*l.size; + int size = (l.c/l.groups)*l.size*l.size; + binarize_weights(l.weights, l.n, size, l.binary_weights); int i, j, k; fwrite(l.biases, sizeof(float), l.n, fp); if (l.batch_normalize){ @@ -1025,7 +1028,7 @@ void save_convolutional_weights(layer l, FILE *fp) pull_convolutional_layer(l); } #endif - int num = l.n*l.c*l.size*l.size; + int num = l.nweights; fwrite(l.biases, sizeof(float), l.n, fp); if (l.batch_normalize){ fwrite(l.scales, sizeof(float), 
l.n, fp); @@ -1209,7 +1212,7 @@ void load_convolutional_weights_binary(layer l, FILE *fp) fread(l.rolling_mean, sizeof(float), l.n, fp); fread(l.rolling_variance, sizeof(float), l.n, fp); } - int size = l.c*l.size*l.size; + int size = (l.c / l.groups)*l.size*l.size; int i, j, k; for(i = 0; i < l.n; ++i){ float mean = 0; @@ -1237,7 +1240,7 @@ void load_convolutional_weights(layer l, FILE *fp) //load_convolutional_weights_binary(l, fp); //return; } - int num = l.n*l.c*l.size*l.size; + int num = l.nweights; fread(l.biases, sizeof(float), l.n, fp); //fread(l.weights, sizeof(float), num, fp); // as in connected layer if (l.batch_normalize && (!l.dontloadscales)){ @@ -1267,9 +1270,9 @@ void load_convolutional_weights(layer l, FILE *fp) //} //if(l.c == 3) scal_cpu(num, 1./256, l.weights, 1); if (l.flipped) { - transpose_matrix(l.weights, l.c*l.size*l.size, l.n); + transpose_matrix(l.weights, (l.c/l.groups)*l.size*l.size, l.n); } - //if (l.binary) binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.weights); + //if (l.binary) binarize_weights(l.weights, l.n, (l.c/l.groups)*l.size*l.size, l.weights); #ifdef GPU if(gpu_index >= 0){ push_convolutional_layer(l); diff --git a/src/yolo_layer.c b/src/yolo_layer.c index d303b5aa..ae48ef7a 100644 --- a/src/yolo_layer.c +++ b/src/yolo_layer.c @@ -240,6 +240,7 @@ void forward_yolo_layer(const layer l, network_state state) int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; if (class_id >= l.classes) { printf(" Warning: in txt-labels class_id=%d >= classes=%d in cfg-file. In txt-labels class_id should be [from 0 to %d] \n", class_id, l.classes, l.classes - 1); + printf(" truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f, class_id = %d \n", truth.x, truth.y, truth.w, truth.h, class_id); getchar(); continue; // if label contains class_id more than number of classes in the cfg-file } @@ -271,6 +272,9 @@ void forward_yolo_layer(const layer l, network_state state) } for(t = 0; t < l.max_boxes; ++t){ box truth = float_to_box_stride(state.truth + t*(4 + 1) + b*l.truths, 1); + if (truth.x < 0 || truth.y < 0 || truth.x > 1 || truth.y > 1 || truth.w < 0 || truth.h < 0) { + printf(" Wrong label: truth.x = %f, truth.y = %f, truth.w = %f, truth.h = %f \n", truth.x, truth.y, truth.w, truth.h); + } int class_id = state.truth[t*(4 + 1) + b*l.truths + 4]; if (class_id >= l.classes) continue; // if label contains class_id more than number of classes in the cfg-file
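
Usage note (illustrative, not part of the patch): since parse_convolutional() now reads the new key via option_find_int_quiet(options, "groups", 1), a depth-wise layer can be declared in a .cfg file by setting groups equal to the layer's input-channel count. The concrete values below are only an example.

[convolutional]
# depth-wise: 128 groups over 128 input channels, so nweights = (c/groups)*n*size*size = 1*128*3*3
batch_normalize=1
filters=128
size=3
stride=1
pad=1
groups=128
activation=leaky

If groups is omitted it defaults to 1 (a standard convolution); make_convolutional_layer() additionally clamps groups < 1 to 1 and forces groups = 1 when xnor=1, so XNOR layers remain ungrouped.
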