Fixed NaN issue for training with CUDNN_HALF=1 by using Tensor Cores
@@ -1,4 +1,4 @@
-darknet.exe detector test data/voc.data cfg/yolov2-voc.cfg yolo-voc.weights -dont_show < data/train.txt > result.txt
+darknet.exe detector test data/voc.data cfg/yolov2-voc.cfg yolo-voc.weights -ext_output -dont_show < data/train.txt > result.txt
 
 
 pause
@@ -138,7 +138,9 @@ void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, f
 __global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16)
 {
     int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
+    //if (idx < size) output_f16[idx] = __float2half(input_f32[idx]);
+    if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]);
+    // __float2half_ru, __float2half_rd, __float2half_rz, __float2half_rn
     //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]);
 }
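The conversion intrinsic is swapped for __float2half_rn, which converts with explicitly specified round-to-nearest-even behavior; the commented list names the other rounding variants (_ru up, _rd down, _rz toward zero). For context, a minimal host-side launcher for this kernel could look as follows (a sketch; the wrapper name and block size are assumptions, not part of the commit):

    // Sketch: cover all `size` elements with a 1-D grid (wrapper is hypothetical).
    void cuda_f32_to_f16_launch(float *input_f32, size_t size, half *output_f16)
    {
        const int block = 256;                               // threads per block (assumed)
        const int grid = (int)((size + block - 1) / block);  // round up
        cuda_f32_to_f16<<<grid, block>>>(input_f32, size, output_f16);
    }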
@@ -290,7 +292,13 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
     float one = 1;    // alpha[0], beta[0] is float for HALF and FLOAT
+    float alpha = 1, beta = 0;
 
-#ifdef CUDNN_HALF
+//#ifdef CUDNN_HALF
+    //if (state.use_mixed_precision) {
+    int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
+    if(state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > state.net.burn_in))
+    {
+        //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index);
 
     // Note: For improved performance it is advised to use beta[0] = 0.0.
     // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH;
     // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF
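This is the heart of the fix: the compile-time #ifdef CUDNN_HALF fork becomes a runtime branch. The fp16 path is skipped for the first layer (state.index == 0), for XNOR layers, and during training until burn-in is over, since early iterations produce values large enough to overflow the half-precision range and yield NaN. iteration_num counts completed iterations: with cfg values batch=64, subdivisions=8 (illustrative), net.batch*net.subdivisions is 64 images per iteration, so *net.seen == 64000 gives iteration_num == 1000. The same gate as a standalone predicate (the use_fp16_path helper is hypothetical; the condition is verbatim from the diff):

    // Hypothetical helper: nonzero when the Tensor-Core fp16 path is safe to take.
    static int use_fp16_path(convolutional_layer l, network_state state)
    {
        int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
        return state.index != 0 && state.net.cudnn_half && !l.xnor
            && (!state.train || iteration_num > state.net.burn_in);
    }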
@@ -320,16 +328,16 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
         //fill_ongpu(output16_size / 2, 0, (float *)output16, 1);
         cudnnConvolutionForward(cudnn_handle(),
             &alpha,
-            l.srcTensorDesc,
+            l.srcTensorDesc16,
             input16,
-            l.weightDesc,
+            l.weightDesc16,
             l.weights_gpu16,
             l.convDesc,
-            l.fw_algo,
+            l.fw_algo16,
             state.workspace,
             l.workspace_size,
             &beta,
-            l.dstTensorDesc,
+            l.dstTensorDesc16,
             output16);
 
 
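The forward call now runs entirely on the dedicated fp16 descriptors and algorithm (the *16 fields) instead of reusing the fp32 ones. Pieced together from this diff, the fp16 forward path has a convert / convolve / convert shape (a sketch assembled from names in the commit, not a verbatim excerpt):

    cuda_convert_f32_to_f16(state.input, input16_size, input16);     // fp32 -> fp16
    cudnnConvolutionForward(cudnn_handle(), &alpha,
                            l.srcTensorDesc16, input16,
                            l.weightDesc16, l.weights_gpu16,
                            l.convDesc, l.fw_algo16,
                            state.workspace, l.workspace_size,
                            &beta, l.dstTensorDesc16, output16);
    cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);  // back to fp32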
@@ -377,8 +385,10 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
         cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu);
         add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
         }
+    }
+    else {
 
-#else
+//#else
 
     cudnnConvolutionForward(cudnn_handle(),
-            &one,
+            &alpha, //&one,
@@ -395,7 +405,14 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
             l.output_gpu);
 
         //cudaDeviceSynchronize();
-#endif // CUDNN_HALF
+        if (l.batch_normalize) {
+            forward_batchnorm_layer_gpu(l, state);
+        }
+        else {
+            add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
+        }
+//#endif // CUDNN_HALF
+    }
 
 
 #else
@@ -418,16 +435,17 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
         }
         gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
     }
-#endif
 
-#ifndef CUDNN_HALF
     if (l.batch_normalize) {
         forward_batchnorm_layer_gpu(l, state);
     }
     else {
         add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
     }
-#endif // no CUDNN_HALF
+#endif
 
+//#ifndef CUDNN_HALF
+//#endif // no CUDNN_HALF
 
     if (l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
     //if(l.dot > 0) dot_error_gpu(l);
@@ -441,13 +459,13 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
 
     backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
 
-#ifndef CUDNN_HALF
-    if(l.batch_normalize){
-        backward_batchnorm_layer_gpu(l, state);
-    } else {
-        //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
-    }
-#endif // no CUDNN_HALF
+//#ifndef CUDNN_HALF
+    //if(l.batch_normalize){
+    //    backward_batchnorm_layer_gpu(l, state);
+    //} else {
+    //    //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
+    //}
+//#endif // no CUDNN_HALF
     float *original_input = state.input;
 
     if(l.xnor) state.input = l.binary_input_gpu;
@@ -455,7 +473,10 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
     float one = 1;
+    float alpha = 1, beta = 0;
 
-#ifdef CUDNN_HALF
+//#ifdef CUDNN_HALF
+    int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
+    if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > state.net.burn_in))
+    {
 
     const size_t input16_size = l.batch*l.c*l.w*l.h;
     const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h;
@@ -521,12 +542,12 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
 
         cudnnConvolutionBackwardFilter(cudnn_handle(),
             &one,
-            l.srcTensorDesc,
+            l.srcTensorDesc16,
             input16, //state.input,
-            l.ddstTensorDesc,
+            l.ddstTensorDesc16,
             delta16, //l.delta_gpu,
             l.convDesc,
-            l.bf_algo,
+            l.bf_algo16,
             state.workspace,
             l.workspace_size,
             &one,
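Note that both scaling factors of cudnnConvolutionBackwardFilter stay &one. cuDNN blends its result into the destination, so with beta = 1 each call accumulates rather than overwrites (per the cuDNN scaling convention; the destination argument lies in the elided tail of the call):

    // cuDNN convention: dw = alpha * dLoss/dw + beta * dw
    // alpha == beta == 1 here, so filter gradients from successive subdivisions
    // are summed into the running weight-update buffer instead of replacing it.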
@@ -544,16 +565,16 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
         // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16)
         cudnnConvolutionBackwardData(cudnn_handle(),
             &alpha,
-            l.weightDesc,
+            l.weightDesc16,
             l.weights_gpu16, //l.weights_gpu,
-            l.ddstTensorDesc,
+            l.ddstTensorDesc16,
             delta16, //l.delta_gpu,
             l.convDesc,
-            l.bd_algo,
+            l.bd_algo16,
             state.workspace,
             l.workspace_size,
             &beta,
-            l.dsrcTensorDesc,
+            l.dsrcTensorDesc16,
             input16); // state.delta);
 
         cuda_convert_f16_to_f32(input16, input16_size, state.delta);
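Here input16 is reused as scratch for dx, then converted back into the fp32 state.delta. The fp16 backward pass thus mirrors the forward one (a sketch; the two f32-to-f16 conversions are assumed from the input16_size/delta16_size declarations, as they fall outside the hunks shown):

    cuda_convert_f32_to_f16(state.input, input16_size, input16);    // x  -> fp16 (assumed step)
    cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16);    // dy -> fp16 (assumed step)
    // cudnnConvolutionBackwardFilter(...): accumulate dw       (beta = 1)
    // cudnnConvolutionBackwardData(...):   write dx into input16 (beta = 0)
    cuda_convert_f16_to_f32(input16, input16_size, state.delta);    // dx -> fp32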
@@ -561,7 +582,13 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
         if (l.binary || l.xnor) swap_binary(&l);
         if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
     }
-#else // CUDNN_HALF
+    }
+    else {
+//#else // CUDNN_HALF
 
     if(l.batch_normalize){
         backward_batchnorm_layer_gpu(l, state);
     }
 
     // calculate conv weight updates
     // if used: beta=1 then loss decreases faster
@@ -599,10 +626,15 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
         if (l.binary || l.xnor) swap_binary(&l);
         if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta);
     }
+    }
 
-#endif // CUDNN_HALF
+//#endif // CUDNN_HALF
 
 #else    // CUDNN
+    if (l.batch_normalize) {
+        backward_batchnorm_layer_gpu(l, state);
+    }
+
     int m = l.n;
     int n = l.size*l.size*l.c;
     int k = l.out_w*l.out_h;
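Taken together, backward now has the same runtime fork as forward. Stripped to a skeleton (names from the diff; the branch condition is the iteration_num gate shown earlier):

    if (/* state.index != 0 && net.cudnn_half && !l.xnor && past burn-in */) {
        // convert to fp16, BackwardFilter + BackwardData on the *16 descriptors,
        // convert dx back to fp32
    }
    else {
        if (l.batch_normalize) backward_batchnorm_layer_gpu(l, state);
        // fp32 BackwardFilter + BackwardData on the original descriptors
    }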
@@ -141,14 +141,12 @@ size_t get_workspace_size(layer l){
 void cudnn_convolutional_setup(layer *l, int cudnn_preference)
 {
 
-#ifdef CUDNN_HALF
+// CUDNN_HALF
     // TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0):
     //   Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
     // PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
-    const cudnnDataType_t data_type = CUDNN_DATA_HALF;
-#else
 
     cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
-#endif
 
 #if(CUDNN_MAJOR >= 7)
     // Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
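PSEUDO_HALF_CONFIG means fp16 storage with fp32 accumulation, which is exactly the mode Tensor Cores accelerate; TRUE_HALF_CONFIG needs true-fp16 hardware, as the comment notes. On cuDNN 7 and later, opting a convolution into Tensor Cores is a single call on the convolution descriptor (a real cuDNN API; its exact placement in this function lies outside the hunk shown):

    #if (CUDNN_MAJOR >= 7)
        // Allow cuDNN to dispatch this convolution to Tensor Core kernels.
        cudnnSetConvolutionMathType(l->convDesc, CUDNN_TENSOR_OP_MATH);
    #endif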
@@ -179,11 +177,25 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
     cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
     cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
 
+#ifdef CUDNN_HALF
+    // backward delta
+    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w);
+    cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w);
+    cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
+
+    // forward
+    cudnnSetTensor4dDescriptor(l->srcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w);
+    cudnnSetTensor4dDescriptor(l->dstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w);
+    cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
+
+    // batch norm
+    cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w);
+#endif
+
     // batch norm
     cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
     cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
 
-    cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
 #if(CUDNN_MAJOR >= 6)
     cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0
 #else
@@ -225,32 +237,32 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
             0,
             &l->bf_algo);
 
-    if (data_type == CUDNN_DATA_HALF)
+    //if (data_type == CUDNN_DATA_HALF)
     {
-        l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
-        l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
-        l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
+        // HALF-16 if(data_type == CUDNN_DATA_HALF)
+        l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
+        l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
+        l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
 
         // FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
-        //l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
-        //l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
-        //l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
+        //l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
+        //l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
+        //l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
 
         int fw = 0, bd = 0, bf = 0;
-        if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
+        if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
             //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
-        if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
+        if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
             //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");
 
-        if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
+        if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
            //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n");
-        if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
+        if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
            //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");
 
-        if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
+        if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
            //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n");
-        if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
+        if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
            //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");
 
         //if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
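Only a few cuDNN algorithms are Tensor-Core eligible, so the fp16 path pins them explicitly rather than trusting the heuristic algorithm query: IMPLICIT_PRECOMP_GEMM forward and ALGO_1 for both backward passes, with the WINOGRAD_NONFUSED family left commented as the alternative. The fw/bd/bf flags above only feed the debug printfs; a hypothetical condensed form of the same bookkeeping:

    // Hypothetical: nonzero when all three chosen algorithms are Tensor-Core eligible.
    int tensor_cores_ok =
        (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM ||
         l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) &&
        (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 ||
         l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) &&
        (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 ||
         l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED);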
@@ -260,12 +272,13 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
 #endif
 #endif
 
-convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output)
+convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index)
 {
     int i;
     convolutional_layer l = {0};
     l.type = CONVOLUTIONAL;
 
+    l.index = index;
     l.h = h;
     l.w = w;
     l.c = c;
@@ -392,15 +405,24 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
         l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
     }
 #ifdef CUDNN
-    cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
-    cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
     cudnnCreateTensorDescriptor(&l.normTensorDesc);
+
+    cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
     cudnnCreateTensorDescriptor(&l.srcTensorDesc);
     cudnnCreateTensorDescriptor(&l.dstTensorDesc);
     cudnnCreateFilterDescriptor(&l.weightDesc);
     cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
     cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
     cudnnCreateFilterDescriptor(&l.dweightDesc);
+
+    cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
+    cudnnCreateTensorDescriptor(&l.srcTensorDesc16);
+    cudnnCreateTensorDescriptor(&l.dstTensorDesc16);
+    cudnnCreateFilterDescriptor(&l.weightDesc16);
+    cudnnCreateTensorDescriptor(&l.dsrcTensorDesc16);
+    cudnnCreateTensorDescriptor(&l.ddstTensorDesc16);
+    cudnnCreateFilterDescriptor(&l.dweightDesc16);
+
     cudnnCreateConvolutionDescriptor(&l.convDesc);
     cudnn_convolutional_setup(&l, cudnn_fastest);
 #endif
@@ -436,7 +458,7 @@ void denormalize_convolutional_layer(convolutional_layer l)
 
 void test_convolutional_layer()
 {
-    convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0);
+    convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0, 0);
     l.batch_normalize = 1;
     float data[] = {1,1,1,1,1,
         1,1,1,1,1,
@@ -25,7 +25,7 @@ void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16);
 #endif
 #endif
 
-convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output);
+convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index);
 void denormalize_convolutional_layer(convolutional_layer l);
 void resize_convolutional_layer(convolutional_layer *layer, int w, int h);
 void forward_convolutional_layer(const convolutional_layer layer, network_state state);
@@ -48,17 +48,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou
 
     l.input_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0);
+    *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0, 0);
     l.input_layer->batch = batch;
 
     l.self_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0);
+    *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0, 0);
     l.self_layer->batch = batch;
 
     l.output_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0);
+    *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0, 0);
     l.output_layer->batch = batch;
 
     l.output = l.output_layer->output;
@@ -91,8 +91,9 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
 
     int init_w = net.w;
     int init_h = net.h;
-    int iter_save;
+    int iter_save, iter_save_last;
     iter_save = get_current_batch(net);
+    iter_save_last = get_current_batch(net);
 
     load_args args = {0};
     args.w = net.w;
@@ -210,7 +211,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
 
         //if (i % 1000 == 0 || (i < 1000 && i % 100 == 0)) {
         //if (i % 100 == 0) {
-        if(i >= (iter_save + 100)) {
+        if(i >= (iter_save + 1000)) {
            iter_save = i;
 #ifdef GPU
            if (ngpus != 1) sync_nets(nets, ngpus, 0);
@@ -219,6 +220,16 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
             sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
             save_weights(net, buff);
         }
+
+        if (i >= (iter_save_last + 100)) {
+            iter_save_last = i;
+#ifdef GPU
+            if (ngpus != 1) sync_nets(nets, ngpus, 0);
+#endif
+            char buff[256];
+            sprintf(buff, "%s/%s_last.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
         free_data(train);
     }
 #ifdef GPU
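Checkpointing is now two-tier: numbered snapshots (<base>_<i>.weights) drop to every 1000 iterations, while a rolling <base>_last.weights is overwritten every 100, so a crash costs at most about 100 iterations of progress without flooding the backup directory. (The trailing i passed to the _last sprintf is unused, since the format string has only two %s specifiers; extra C varargs are simply ignored.) Resuming from the rolling checkpoint would look like this (paths illustrative):

    darknet.exe detector train data/voc.data cfg/yolov2-voc.cfg backup/yolov2-voc_last.weights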
src/layer.h
@@ -299,14 +299,16 @@ struct layer{
     float * norms_gpu;
 #ifdef CUDNN
     cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
+    cudnnTensorDescriptor_t srcTensorDesc16, dstTensorDesc16;
     cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
+    cudnnTensorDescriptor_t dsrcTensorDesc16, ddstTensorDesc16;
     cudnnTensorDescriptor_t normTensorDesc, normDstTensorDesc, normDstTensorDescF16;
-    cudnnFilterDescriptor_t weightDesc;
-    cudnnFilterDescriptor_t dweightDesc;
+    cudnnFilterDescriptor_t weightDesc, weightDesc16;
+    cudnnFilterDescriptor_t dweightDesc, dweightDesc16;
     cudnnConvolutionDescriptor_t convDesc;
-    cudnnConvolutionFwdAlgo_t fw_algo;
-    cudnnConvolutionBwdDataAlgo_t bd_algo;
-    cudnnConvolutionBwdFilterAlgo_t bf_algo;
+    cudnnConvolutionFwdAlgo_t fw_algo, fw_algo16;
+    cudnnConvolutionBwdDataAlgo_t bd_algo, bd_algo16;
+    cudnnConvolutionBwdFilterAlgo_t bf_algo, bf_algo16;
     cudnnPoolingDescriptor_t poolingDesc;
 #endif
 #endif
@@ -42,6 +42,7 @@ typedef struct network{
     int *steps;
     int num_steps;
     int burn_in;
+    int cudnn_half;
 
     int adam;
     float B1;
@@ -165,7 +165,7 @@ convolutional_layer parse_convolutional(list *options, size_params params)
     int xnor = option_find_int_quiet(options, "xnor", 0);
     int use_bin_output = option_find_int_quiet(options, "bin_output", 0);
 
-    convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output);
+    convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index);
     layer.flipped = option_find_int_quiet(options, "flipped", 0);
     layer.dot = option_find_float_quiet(options, "dot", 0);
@@ -655,7 +655,8 @@ void parse_net_options(list *options, network *net)
     net->policy = get_policy(policy_s);
     net->burn_in = option_find_int_quiet(options, "burn_in", 0);
 #ifdef CUDNN_HALF
-    net->burn_in = 0;
+    //net->burn_in = 0;
+    net->cudnn_half = 1;
 #endif
     if(net->policy == STEP){
         net->step = option_find_int(options, "step", 1);
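Previously a CUDNN_HALF build zeroed burn_in, which let half-precision kernels run from iteration 0 and was a direct cause of the NaN losses; now the cfg's burn_in is respected, and the build merely sets the cudnn_half flag that the per-layer gate checks at runtime. A cfg excerpt that exercises the gate (values illustrative):

    [net]
    batch=64
    subdivisions=8
    # fp16 Tensor Core kernels engage only after iteration 1000
    burn_in=1000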