From 7c2f30232132e9fc90155288689016db93d17990 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 7 Dec 2018 22:40:10 +0300 Subject: [PATCH] Fixed nan issue for training with CUDNN_HALF=1 by using Tensor Cores --- build/darknet/x64/darknet_many_images.cmd | 2 +- src/convolutional_kernels.cu | 458 ++++++++++++---------- src/convolutional_layer.c | 66 ++-- src/convolutional_layer.h | 2 +- src/crnn_layer.c | 6 +- src/detector.c | 15 +- src/layer.h | 12 +- src/network.h | 1 + src/parser.c | 5 +- 9 files changed, 318 insertions(+), 249 deletions(-) diff --git a/build/darknet/x64/darknet_many_images.cmd b/build/darknet/x64/darknet_many_images.cmd index 5f4658e4..a763d022 100644 --- a/build/darknet/x64/darknet_many_images.cmd +++ b/build/darknet/x64/darknet_many_images.cmd @@ -1,4 +1,4 @@ -darknet.exe detector test data/voc.data cfg/yolov2-voc.cfg yolo-voc.weights -dont_show < data/train.txt > result.txt +darknet.exe detector test data/voc.data cfg/yolov2-voc.cfg yolo-voc.weights -ext_output -dont_show < data/train.txt > result.txt pause \ No newline at end of file diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 6824083e..04d3820f 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -138,7 +138,9 @@ void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, f __global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16) { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) output_f16[idx] = __float2half(input_f32[idx]); + //if (idx < size) output_f16[idx] = __float2half(input_f32[idx]); + if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); + // __float2half_ru, __float2half_rd, __float2half_rz, __float2half_rn //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]); } @@ -290,113 +292,128 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT float alpha = 1, beta = 0; -#ifdef CUDNN_HALF - // Note: For improved performance it is advised to use beta[0] = 0.0. - // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH; - // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF - // 2. 
or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED - // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops - - const size_t input16_size = l.batch*l.c*l.w*l.h; - const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w; - - if (*state.net.max_input16_size < input16_size) { - //printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size); - *state.net.max_input16_size = input16_size; - if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); - *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); - } - float *input16 = *state.net.input16_gpu; - - if (*state.net.max_output16_size < output16_size) { - *state.net.max_output16_size = output16_size; - if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); - *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); - } - float *output16 = *state.net.output16_gpu; - - cuda_convert_f32_to_f16(state.input, input16_size, input16); - - //fill_ongpu(output16_size / 2, 0, (float *)output16, 1); - cudnnConvolutionForward(cudnn_handle(), - &alpha, - l.srcTensorDesc, - input16, - l.weightDesc, - l.weights_gpu16, - l.convDesc, - l.fw_algo, - state.workspace, - l.workspace_size, - &beta, - l.dstTensorDesc, - output16); - - - if (l.batch_normalize) +//#ifdef CUDNN_HALF + //if (state.use_mixed_precision) { + int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + if(state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > state.net.burn_in)) { - if (state.train) // Training - { - copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1); - //cudaMemcpyAsync(l.x_gpu, output16, l.outputs*l.batch*sizeof(half), cudaMemcpyDefault, get_cuda_stream()); - float one = 1; - float zero = 0; - // Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth - // compared to FP32, it’s just that the statistics and value adjustment should be done in FP32. - cudnnBatchNormalizationForwardTraining(cudnn_handle(), - CUDNN_BATCHNORM_SPATIAL, - &one, - &zero, - l.normDstTensorDescF16, - l.x_gpu, // input - l.normDstTensorDescF16, - output16, // output - l.normTensorDesc, - l.scales_gpu, - l.biases_gpu, - .01, - l.rolling_mean_gpu, // output (should be FP32) - l.rolling_variance_gpu, // output (should be FP32) - .00001, - l.mean_gpu, // output (should be FP32) - l.variance_gpu); // output (should be FP32) + //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index); - cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); - //forward_batchnorm_layer_gpu(l, state); + // Note: For improved performance it is advised to use beta[0] = 0.0. + // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH; + // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF + // 2. 
or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED + // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops + + const size_t input16_size = l.batch*l.c*l.w*l.h; + const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w; + + if (*state.net.max_input16_size < input16_size) { + //printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size); + *state.net.max_input16_size = input16_size; + if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); + *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); } - else // Detection + float *input16 = *state.net.input16_gpu; + + if (*state.net.max_output16_size < output16_size) { + *state.net.max_output16_size = output16_size; + if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); + *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); + } + float *output16 = *state.net.output16_gpu; + + cuda_convert_f32_to_f16(state.input, input16_size, input16); + + //fill_ongpu(output16_size / 2, 0, (float *)output16, 1); + cudnnConvolutionForward(cudnn_handle(), + &alpha, + l.srcTensorDesc16, + input16, + l.weightDesc16, + l.weights_gpu16, + l.convDesc, + l.fw_algo16, + state.workspace, + l.workspace_size, + &beta, + l.dstTensorDesc16, + output16); + + + if (l.batch_normalize) + { + if (state.train) // Training + { + copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1); + //cudaMemcpyAsync(l.x_gpu, output16, l.outputs*l.batch*sizeof(half), cudaMemcpyDefault, get_cuda_stream()); + float one = 1; + float zero = 0; + // Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth + // compared to FP32, it’s just that the statistics and value adjustment should be done in FP32. 
+ cudnnBatchNormalizationForwardTraining(cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + l.normDstTensorDescF16, + l.x_gpu, // input + l.normDstTensorDescF16, + output16, // output + l.normTensorDesc, + l.scales_gpu, + l.biases_gpu, + .01, + l.rolling_mean_gpu, // output (should be FP32) + l.rolling_variance_gpu, // output (should be FP32) + .00001, + l.mean_gpu, // output (should be FP32) + l.variance_gpu); // output (should be FP32) + + cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); + //forward_batchnorm_layer_gpu(l, state); + } + else // Detection + { + cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); + normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w); + scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h); + } + } + else // BIAS only { cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); - normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w); - scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); - add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h); + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); } } - else // BIAS only - { - cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); - add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); + else { + + //#else + + cudnnConvolutionForward(cudnn_handle(), + &alpha, //&one, + l.srcTensorDesc, + state.input, + l.weightDesc, + l.weights_gpu, + l.convDesc, + l.fw_algo, + state.workspace, + l.workspace_size, + &beta, //&one, + l.dstTensorDesc, + l.output_gpu); + + //cudaDeviceSynchronize(); + if (l.batch_normalize) { + forward_batchnorm_layer_gpu(l, state); + } + else { + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); + } + //#endif // CUDNN_HALF } -#else - - cudnnConvolutionForward(cudnn_handle(), - &alpha, //&one, - l.srcTensorDesc, - state.input, - l.weightDesc, - l.weights_gpu, - l.convDesc, - l.fw_algo, - state.workspace, - l.workspace_size, - &beta, //&one, - l.dstTensorDesc, - l.output_gpu); - - //cudaDeviceSynchronize(); -#endif // CUDNN_HALF - #else fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); @@ -418,16 +435,17 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) } gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n); } -#endif -#ifndef CUDNN_HALF if (l.batch_normalize) { forward_batchnorm_layer_gpu(l, state); } else { add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); } -#endif // no CUDNN_HALF +#endif + +//#ifndef CUDNN_HALF +//#endif // no CUDNN_HALF if (l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); //if(l.dot > 0) dot_error_gpu(l); @@ -441,13 +459,13 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); -#ifndef CUDNN_HALF - if(l.batch_normalize){ - backward_batchnorm_layer_gpu(l, state); - } else { - //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); - } -#endif // no CUDNN_HALF +//#ifndef CUDNN_HALF + //if(l.batch_normalize){ + // backward_batchnorm_layer_gpu(l, state); + //} else { + // //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); + //} +//#endif // no 
CUDNN_HALF float *original_input = state.input; if(l.xnor) state.input = l.binary_input_gpu; @@ -455,117 +473,126 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state float one = 1; float alpha = 1, beta = 0; -#ifdef CUDNN_HALF - - const size_t input16_size = l.batch*l.c*l.w*l.h; - const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h; - - if (*state.net.max_input16_size < input16_size) { - *state.net.max_input16_size = input16_size; - if(*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); - *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); - } - float *input16 = *state.net.input16_gpu; - - if (*state.net.max_output16_size < delta16_size) { - *state.net.max_output16_size = delta16_size; - if(*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); - *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); - } - float *delta16 = *state.net.output16_gpu; - - cuda_convert_f32_to_f16(state.input, input16_size, input16); - cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16); - - if (l.batch_normalize) { - //if (!state.train) { - // l.mean_gpu = l.rolling_mean_gpu; - // l.variance_gpu = l.rolling_variance_gpu; - //} - float one = 1; - float zero = 0; - cudnnBatchNormalizationBackward(cudnn_handle(), - CUDNN_BATCHNORM_SPATIAL, - &one, - &zero, - &one, - &one, - l.normDstTensorDescF16, - l.x_gpu, // input - l.normDstTensorDescF16, - delta16, // input - l.normDstTensorDescF16, - l.x_norm_gpu, // output - l.normTensorDesc, - l.scales_gpu, // output (should be FP32) - l.scale_updates_gpu, // output (should be FP32) - l.bias_updates_gpu, // output (should be FP32) - .00001, - l.mean_gpu, // input (should be FP32) - l.variance_gpu); // input (should be FP32) - copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1); - //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream()); - } - else +//#ifdef CUDNN_HALF + int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > state.net.burn_in)) { - //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); - } - // convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16 - // get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16) + const size_t input16_size = l.batch*l.c*l.w*l.h; + const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h; - // calculate conv weight updates - // Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum - // so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m - cuda_convert_f32_to_f16(l.weight_updates_gpu, l.c*l.n*l.size*l.size, l.weight_updates_gpu16); + if (*state.net.max_input16_size < input16_size) { + *state.net.max_input16_size = input16_size; + if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); + *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); + } + float *input16 = *state.net.input16_gpu; - cudnnConvolutionBackwardFilter(cudnn_handle(), - &one, - l.srcTensorDesc, - input16, //state.input, - l.ddstTensorDesc, - delta16, //l.delta_gpu, - l.convDesc, - l.bf_algo, - state.workspace, - l.workspace_size, - &one, - l.dweightDesc, - l.weight_updates_gpu16); // l.weight_updates_gpu); + if (*state.net.max_output16_size < delta16_size) { + 
*state.net.max_output16_size = delta16_size; + if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); + *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); + } + float *delta16 = *state.net.output16_gpu; - cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.c*l.n*l.size*l.size, l.weight_updates_gpu); + cuda_convert_f32_to_f16(state.input, input16_size, input16); + cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16); - if (state.delta) { - if (l.binary || l.xnor) swap_binary(&l); + if (l.batch_normalize) { + //if (!state.train) { + // l.mean_gpu = l.rolling_mean_gpu; + // l.variance_gpu = l.rolling_variance_gpu; + //} + float one = 1; + float zero = 0; + cudnnBatchNormalizationBackward(cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + &one, + &one, + l.normDstTensorDescF16, + l.x_gpu, // input + l.normDstTensorDescF16, + delta16, // input + l.normDstTensorDescF16, + l.x_norm_gpu, // output + l.normTensorDesc, + l.scales_gpu, // output (should be FP32) + l.scale_updates_gpu, // output (should be FP32) + l.bias_updates_gpu, // output (should be FP32) + .00001, + l.mean_gpu, // input (should be FP32) + l.variance_gpu); // input (should be FP32) + copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1); + //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream()); + } + else + { + //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); + } - // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData - // calculate delta for the next layer - // convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16 - // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16) - cudnnConvolutionBackwardData(cudnn_handle(), - &alpha, - l.weightDesc, - l.weights_gpu16, //l.weights_gpu, - l.ddstTensorDesc, + // convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16 + // get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16) + + // calculate conv weight updates + // Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum + // so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m + cuda_convert_f32_to_f16(l.weight_updates_gpu, l.c*l.n*l.size*l.size, l.weight_updates_gpu16); + + cudnnConvolutionBackwardFilter(cudnn_handle(), + &one, + l.srcTensorDesc16, + input16, //state.input, + l.ddstTensorDesc16, delta16, //l.delta_gpu, l.convDesc, - l.bd_algo, + l.bf_algo16, state.workspace, l.workspace_size, - &beta, - l.dsrcTensorDesc, - input16); // state.delta); + &one, + l.dweightDesc, + l.weight_updates_gpu16); // l.weight_updates_gpu); - cuda_convert_f16_to_f32(input16, input16_size, state.delta); + cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.c*l.n*l.size*l.size, l.weight_updates_gpu); - if (l.binary || l.xnor) swap_binary(&l); - if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + if (state.delta) { + if (l.binary || l.xnor) swap_binary(&l); + + // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData + // calculate delta for the next layer + // convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16 + // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16) + cudnnConvolutionBackwardData(cudnn_handle(), + &alpha, + l.weightDesc16, + l.weights_gpu16, //l.weights_gpu, + 
l.ddstTensorDesc16, + delta16, //l.delta_gpu, + l.convDesc, + l.bd_algo16, + state.workspace, + l.workspace_size, + &beta, + l.dsrcTensorDesc16, + input16); // state.delta); + + cuda_convert_f16_to_f32(input16, input16_size, state.delta); + + if (l.binary || l.xnor) swap_binary(&l); + if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + } } -#else // CUDNN_HALF + else { + //#else // CUDNN_HALF - // calculate conv weight updates - // if used: beta=1 then loss decreases faster - cudnnConvolutionBackwardFilter(cudnn_handle(), + if(l.batch_normalize){ + backward_batchnorm_layer_gpu(l, state); + } + + // calculate conv weight updates + // if used: beta=1 then loss decreases faster + cudnnConvolutionBackwardFilter(cudnn_handle(), &one, l.srcTensorDesc, state.input, @@ -579,11 +606,11 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state l.dweightDesc, l.weight_updates_gpu); - if(state.delta){ - if(l.binary || l.xnor) swap_binary(&l); - // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData - // calculate delta for the next layer - cudnnConvolutionBackwardData(cudnn_handle(), + if (state.delta) { + if (l.binary || l.xnor) swap_binary(&l); + // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData + // calculate delta for the next layer + cudnnConvolutionBackwardData(cudnn_handle(), &one, l.weightDesc, l.weights_gpu, @@ -596,13 +623,18 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state &one, l.dsrcTensorDesc, state.delta); - if(l.binary || l.xnor) swap_binary(&l); - if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + if (l.binary || l.xnor) swap_binary(&l); + if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + } } -#endif // CUDNN_HALF +//#endif // CUDNN_HALF #else // CUDNN + if (l.batch_normalize) { + backward_batchnorm_layer_gpu(l, state); + } + int m = l.n; int n = l.size*l.size*l.c; int k = l.out_w*l.out_h; diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 8e83758c..9fc33c0c 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -141,14 +141,12 @@ size_t get_workspace_size(layer l){ void cudnn_convolutional_setup(layer *l, int cudnn_preference) { -#ifdef CUDNN_HALF +// CUDNN_HALF // TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0): // Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100 // PSEUDO_HALF_CONFIG is required for Tensor Cores - our case! 
- const cudnnDataType_t data_type = CUDNN_DATA_HALF; -#else + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; -#endif #if(CUDNN_MAJOR >= 7) // Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH @@ -179,11 +177,25 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); +#ifdef CUDNN_HALF + // backward delta + cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w); + cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w); + cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); + + // forward + cudnnSetTensor4dDescriptor(l->srcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w); + cudnnSetTensor4dDescriptor(l->dstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w); + cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); + + // batch norm + cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w); +#endif + // batch norm cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); - cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w); #if(CUDNN_MAJOR >= 6) cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0 #else @@ -225,32 +237,32 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) 0, &l->bf_algo); - if (data_type == CUDNN_DATA_HALF) + //if (data_type == CUDNN_DATA_HALF) { // HALF-16 if(data_type == CUDNN_DATA_HALF) - l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; - l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; // FLOAT-32 if(data_type == CUDNN_DATA_FLOAT) - //l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; - //l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED; - //l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED; + //l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; + //l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED; + //l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED; int fw = 0, bd = 0, bf = 0; - if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1; + if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1; //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n"); - if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2; + if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2; //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n"); - if 
(l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1; + if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1; //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n"); - if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2; + if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2; //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n"); - if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1; + if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1; //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n"); - if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2; + if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2; //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n"); //if (fw == 2 && bd == 2 && bf == 2) printf("TF "); @@ -260,12 +272,13 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) #endif #endif -convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output) +convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index) { int i; convolutional_layer l = {0}; l.type = CONVOLUTIONAL; + l.index = index; l.h = h; l.w = w; l.c = c; @@ -392,15 +405,24 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); } #ifdef CUDNN - cudnnCreateTensorDescriptor(&l.normDstTensorDesc); - cudnnCreateTensorDescriptor(&l.normDstTensorDescF16); cudnnCreateTensorDescriptor(&l.normTensorDesc); + + cudnnCreateTensorDescriptor(&l.normDstTensorDesc); cudnnCreateTensorDescriptor(&l.srcTensorDesc); cudnnCreateTensorDescriptor(&l.dstTensorDesc); cudnnCreateFilterDescriptor(&l.weightDesc); cudnnCreateTensorDescriptor(&l.dsrcTensorDesc); cudnnCreateTensorDescriptor(&l.ddstTensorDesc); cudnnCreateFilterDescriptor(&l.dweightDesc); + + cudnnCreateTensorDescriptor(&l.normDstTensorDescF16); + cudnnCreateTensorDescriptor(&l.srcTensorDesc16); + cudnnCreateTensorDescriptor(&l.dstTensorDesc16); + cudnnCreateFilterDescriptor(&l.weightDesc16); + cudnnCreateTensorDescriptor(&l.dsrcTensorDesc16); + cudnnCreateTensorDescriptor(&l.ddstTensorDesc16); + cudnnCreateFilterDescriptor(&l.dweightDesc16); + cudnnCreateConvolutionDescriptor(&l.convDesc); cudnn_convolutional_setup(&l, cudnn_fastest); #endif @@ -436,7 +458,7 @@ void denormalize_convolutional_layer(convolutional_layer l) void test_convolutional_layer() { - convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0); + convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0, 0); l.batch_normalize = 1; float data[] = {1,1,1,1,1, 1,1,1,1,1, diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 20e9d68e..0bd9849e 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -25,7 +25,7 @@ void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16); #endif #endif -convolutional_layer 
make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output); +convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index); void denormalize_convolutional_layer(convolutional_layer l); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); void forward_convolutional_layer(const convolutional_layer layer, network_state state); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index f78e0c9d..db384cfb 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -48,17 +48,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.input_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0); + *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0, 0); l.input_layer->batch = batch; l.self_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0); + *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0, 0); l.self_layer->batch = batch; l.output_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0); + *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0, 0); l.output_layer->batch = batch; l.output = l.output_layer->output; diff --git a/src/detector.c b/src/detector.c index ba4d809b..ee25685c 100644 --- a/src/detector.c +++ b/src/detector.c @@ -91,8 +91,9 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i int init_w = net.w; int init_h = net.h; - int iter_save; + int iter_save, iter_save_last; iter_save = get_current_batch(net); + iter_save_last = get_current_batch(net); load_args args = {0}; args.w = net.w; @@ -210,7 +211,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i //if (i % 1000 == 0 || (i < 1000 && i % 100 == 0)) { //if (i % 100 == 0) { - if(i >= (iter_save + 100)) { + if(i >= (iter_save + 1000)) { iter_save = i; #ifdef GPU if (ngpus != 1) sync_nets(nets, ngpus, 0); @@ -219,6 +220,16 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); save_weights(net, buff); } + + if (i >= (iter_save_last + 100)) { + iter_save_last = i; +#ifdef GPU + if (ngpus != 1) sync_nets(nets, ngpus, 0); +#endif + char buff[256]; + sprintf(buff, "%s/%s_last.weights", backup_directory, base, i); + save_weights(net, buff); + } free_data(train); } #ifdef GPU diff --git a/src/layer.h b/src/layer.h index 2cfa0871..a4ebbfbc 100644 --- a/src/layer.h +++ b/src/layer.h @@ -299,14 +299,16 @@ struct layer{ float * norms_gpu; #ifdef CUDNN cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc; + cudnnTensorDescriptor_t srcTensorDesc16, dstTensorDesc16; 
cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc; + cudnnTensorDescriptor_t dsrcTensorDesc16, ddstTensorDesc16; cudnnTensorDescriptor_t normTensorDesc, normDstTensorDesc, normDstTensorDescF16; - cudnnFilterDescriptor_t weightDesc; - cudnnFilterDescriptor_t dweightDesc; + cudnnFilterDescriptor_t weightDesc, weightDesc16; + cudnnFilterDescriptor_t dweightDesc, dweightDesc16; cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t fw_algo; - cudnnConvolutionBwdDataAlgo_t bd_algo; - cudnnConvolutionBwdFilterAlgo_t bf_algo; + cudnnConvolutionFwdAlgo_t fw_algo, fw_algo16; + cudnnConvolutionBwdDataAlgo_t bd_algo, bd_algo16; + cudnnConvolutionBwdFilterAlgo_t bf_algo, bf_algo16; cudnnPoolingDescriptor_t poolingDesc; #endif #endif diff --git a/src/network.h b/src/network.h index 18dc5953..f630fb96 100644 --- a/src/network.h +++ b/src/network.h @@ -42,6 +42,7 @@ typedef struct network{ int *steps; int num_steps; int burn_in; + int cudnn_half; int adam; float B1; diff --git a/src/parser.c b/src/parser.c index c82c4a2f..c1647666 100644 --- a/src/parser.c +++ b/src/parser.c @@ -165,7 +165,7 @@ convolutional_layer parse_convolutional(list *options, size_params params) int xnor = option_find_int_quiet(options, "xnor", 0); int use_bin_output = option_find_int_quiet(options, "bin_output", 0); - convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output); + convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index); layer.flipped = option_find_int_quiet(options, "flipped", 0); layer.dot = option_find_float_quiet(options, "dot", 0); @@ -655,7 +655,8 @@ void parse_net_options(list *options, network *net) net->policy = get_policy(policy_s); net->burn_in = option_find_int_quiet(options, "burn_in", 0); #ifdef CUDNN_HALF - net->burn_in = 0; + //net->burn_in = 0; + net->cudnn_half = 1; #endif if(net->policy == STEP){ net->step = option_find_int(options, "step", 1);
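
For readers following the fix outside the diff: the heart of the change is a runtime gate that keeps every convolutional layer on the FP32 cuDNN path while burn_in iterations are still running, and only afterwards switches eligible layers (not the first layer, not XNOR layers) to the Tensor-Core FP16 path. The condition below is copied from the patched forward/backward passes; the small structs and the main() harness are illustrative stand-ins for darknet's layer / network_state types (which reach these fields through pointers such as *state.net.seen), so treat it as a sketch of the logic, not drop-in darknet code.

    #include <stdio.h>
    #include <stddef.h>

    /* Illustrative stand-ins for the darknet fields the gate reads. */
    typedef struct { int xnor; } layer_t;

    typedef struct {
        size_t seen;            /* images processed so far (*state.net.seen) */
        int batch, subdivisions;
        int burn_in;            /* [net] burn_in from the cfg file           */
        int cudnn_half;         /* set in parse_net_options when CUDNN_HALF  */
    } net_t;

    typedef struct {
        int index;              /* layer index; layer 0 always stays FP32    */
        int train;              /* 1 while training, 0 at detection time     */
        net_t net;
    } state_t;

    /* Same condition the patch adds to the forward and backward conv passes:
       use the Tensor-Core FP16 path only after burn_in, and never for the
       first layer or for XNOR layers. */
    static int use_fp16_path(const layer_t *l, const state_t *s)
    {
        int iteration_num = (int)(s->net.seen / (size_t)(s->net.batch * s->net.subdivisions));
        return s->index != 0
            && s->net.cudnn_half
            && !l->xnor
            && (!s->train || iteration_num > s->net.burn_in);
    }

    int main(void)
    {
        layer_t l;  l.xnor = 0;
        state_t s;
        s.index = 5;  s.train = 1;
        s.net.batch = 64;  s.net.subdivisions = 8;
        s.net.burn_in = 1000;  s.net.cudnn_half = 1;

        s.net.seen = (size_t)64 * 8 * 500;     /* iteration 500: still inside burn_in  */
        printf("iteration  500: fp16 = %d\n", use_fp16_path(&l, &s));   /* prints 0 */

        s.net.seen = (size_t)64 * 8 * 2000;    /* iteration 2000: Tensor Cores enabled */
        printf("iteration 2000: fp16 = %d\n", use_fp16_path(&l, &s));   /* prints 1 */
        return 0;
    }

The backward pass applies the same gate, and its FP32 branch is where backward_batchnorm_layer_gpu is called again, so early training (the burn-in phase) now runs entirely in FP32 instead of the previous always-on CUDNN_HALF path.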
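
The other half-precision change worth calling out is the conversion kernel: the patch switches cuda_f32_to_f16 from __float2half to the explicit round-to-nearest intrinsic __float2half_rn (the diff comment also lists __float2half_ru, _rd and _rz as alternatives). The standalone program below reproduces that kernel together with its f16-to-f32 counterpart so the conversion can be exercised in isolation; the BLOCK constant and the main() driver are assumptions for this sketch, not part of the patch.

    /* Build: nvcc -o f16_convert f16_convert.cu */
    #include <cuda_fp16.h>
    #include <cuda_runtime.h>
    #include <stdio.h>

    #define BLOCK 512   /* assumption for this sketch; darknet defines its own BLOCK */

    /* Mirrors the patched cuda_f32_to_f16 kernel: the explicit round-to-nearest
       intrinsic __float2half_rn replaces the plain __float2half call. */
    __global__ void cuda_f32_to_f16(float *input_f32, size_t size, half *output_f16)
    {
        size_t idx = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
        if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]);
    }

    /* Counterpart used after the FP16 convolution to hand FP32 back to the
       rest of the network (cuda_f16_to_f32 in the patch). */
    __global__ void cuda_f16_to_f32(half *input_f16, size_t size, float *output_f32)
    {
        size_t idx = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
        if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
    }

    int main(void)
    {
        const int n = 4;
        float h_in[4] = { 1.0005f, -3.14159f, 65504.0f, 1e-8f };
        float h_out[4];
        float *d_in, *d_out;
        half  *d_half;

        cudaMalloc((void **)&d_in,   n * sizeof(float));
        cudaMalloc((void **)&d_out,  n * sizeof(float));
        cudaMalloc((void **)&d_half, n * sizeof(half));
        cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

        int grid = (n + BLOCK - 1) / BLOCK;
        cuda_f32_to_f16<<<grid, BLOCK>>>(d_in, (size_t)n, d_half);
        cuda_f16_to_f32<<<grid, BLOCK>>>(d_half, (size_t)n, d_out);
        cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);

        for (int i = 0; i < n; i++)
            printf("%.8g -> fp16 -> %.8g\n", h_in[i], h_out[i]);  /* shows the precision loss */

        cudaFree(d_in); cudaFree(d_out); cudaFree(d_half);
        return 0;
    }

Round-to-nearest-even (_rn) is the IEEE-754 default rounding mode; _ru, _rd and _rz round up, down and toward zero respectively.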