From 7c2f30232132e9fc90155288689016db93d17990 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Fri, 7 Dec 2018 22:40:10 +0300 Subject: [PATCH] Fixed nan issue for training with CUDNN_HALF=1 by using Tensor Cores --- build/darknet/x64/darknet_many_images.cmd | 2 +- src/convolutional_kernels.cu | 458 ++++++++++++---------- src/convolutional_layer.c | 66 ++-- src/convolutional_layer.h | 2 +- src/crnn_layer.c | 6 +- src/detector.c | 15 +- src/layer.h | 12 +- src/network.h | 1 + src/parser.c | 5 +- 9 files changed, 318 insertions(+), 249 deletions(-) diff --git a/build/darknet/x64/darknet_many_images.cmd b/build/darknet/x64/darknet_many_images.cmd index 5f4658e4..a763d022 100644 --- a/build/darknet/x64/darknet_many_images.cmd +++ b/build/darknet/x64/darknet_many_images.cmd @@ -1,4 +1,4 @@ -darknet.exe detector test data/voc.data cfg/yolov2-voc.cfg yolo-voc.weights -dont_show < data/train.txt > result.txt +darknet.exe detector test data/voc.data cfg/yolov2-voc.cfg yolo-voc.weights -ext_output -dont_show < data/train.txt > result.txt pause \ No newline at end of file diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 6824083e..04d3820f 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -138,7 +138,9 @@ void fast_binarize_weights_gpu(float *weights, int n, int size, float *binary, f __global__ void cuda_f32_to_f16(float* input_f32, size_t size, half *output_f16) { int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < size) output_f16[idx] = __float2half(input_f32[idx]); + //if (idx < size) output_f16[idx] = __float2half(input_f32[idx]); + if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]); + // __float2half_ru, __float2half_rd, __float2half_rz, __float2half_rn //if (idx < size) *((unsigned short *)output_f16 + idx) = __float2half(input_f32[idx]); } @@ -290,113 +292,128 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT float alpha = 1, beta = 0; -#ifdef CUDNN_HALF - // Note: For improved performance it is advised to use beta[0] = 0.0. - // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH; - // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF - // 2. 
or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED - // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops - - const size_t input16_size = l.batch*l.c*l.w*l.h; - const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w; - - if (*state.net.max_input16_size < input16_size) { - //printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size); - *state.net.max_input16_size = input16_size; - if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); - *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); - } - float *input16 = *state.net.input16_gpu; - - if (*state.net.max_output16_size < output16_size) { - *state.net.max_output16_size = output16_size; - if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); - *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); - } - float *output16 = *state.net.output16_gpu; - - cuda_convert_f32_to_f16(state.input, input16_size, input16); - - //fill_ongpu(output16_size / 2, 0, (float *)output16, 1); - cudnnConvolutionForward(cudnn_handle(), - &alpha, - l.srcTensorDesc, - input16, - l.weightDesc, - l.weights_gpu16, - l.convDesc, - l.fw_algo, - state.workspace, - l.workspace_size, - &beta, - l.dstTensorDesc, - output16); - - - if (l.batch_normalize) +//#ifdef CUDNN_HALF + //if (state.use_mixed_precision) { + int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + if(state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > state.net.burn_in)) { - if (state.train) // Training - { - copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1); - //cudaMemcpyAsync(l.x_gpu, output16, l.outputs*l.batch*sizeof(half), cudaMemcpyDefault, get_cuda_stream()); - float one = 1; - float zero = 0; - // Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth - // compared to FP32, it’s just that the statistics and value adjustment should be done in FP32. - cudnnBatchNormalizationForwardTraining(cudnn_handle(), - CUDNN_BATCHNORM_SPATIAL, - &one, - &zero, - l.normDstTensorDescF16, - l.x_gpu, // input - l.normDstTensorDescF16, - output16, // output - l.normTensorDesc, - l.scales_gpu, - l.biases_gpu, - .01, - l.rolling_mean_gpu, // output (should be FP32) - l.rolling_variance_gpu, // output (should be FP32) - .00001, - l.mean_gpu, // output (should be FP32) - l.variance_gpu); // output (should be FP32) + //printf("\n CUDNN_HALF!!! state.index = %d \n", state.index); - cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); - //forward_batchnorm_layer_gpu(l, state); + // Note: For improved performance it is advised to use beta[0] = 0.0. + // For Tensor Core: cudnnSetConvolutionMathType() where cudnnMathType_t mathType = CUDNN_TENSOR_OP_MATH; + // 1. or CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM and use CUDNN_DATA_HALF + // 2. 
or CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED + // More: http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#tensor_ops + + const size_t input16_size = l.batch*l.c*l.w*l.h; + const size_t output16_size = l.batch*l.out_c*l.out_h*l.out_w; + + if (*state.net.max_input16_size < input16_size) { + //printf("\n input16_size: cur = %zu \t max = %zu \n", input16_size, *state.net.max_input16_size); + *state.net.max_input16_size = input16_size; + if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); + *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); } - else // Detection + float *input16 = *state.net.input16_gpu; + + if (*state.net.max_output16_size < output16_size) { + *state.net.max_output16_size = output16_size; + if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); + *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); + } + float *output16 = *state.net.output16_gpu; + + cuda_convert_f32_to_f16(state.input, input16_size, input16); + + //fill_ongpu(output16_size / 2, 0, (float *)output16, 1); + cudnnConvolutionForward(cudnn_handle(), + &alpha, + l.srcTensorDesc16, + input16, + l.weightDesc16, + l.weights_gpu16, + l.convDesc, + l.fw_algo16, + state.workspace, + l.workspace_size, + &beta, + l.dstTensorDesc16, + output16); + + + if (l.batch_normalize) + { + if (state.train) // Training + { + copy_ongpu(l.outputs*l.batch / 2, output16, 1, l.x_gpu, 1); + //cudaMemcpyAsync(l.x_gpu, output16, l.outputs*l.batch*sizeof(half), cudaMemcpyDefault, get_cuda_stream()); + float one = 1; + float zero = 0; + // Batch-normalization can still take FP16 inputs and outputs, saving half the bandwidth + // compared to FP32, it’s just that the statistics and value adjustment should be done in FP32. 
+ cudnnBatchNormalizationForwardTraining(cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + l.normDstTensorDescF16, + l.x_gpu, // input + l.normDstTensorDescF16, + output16, // output + l.normTensorDesc, + l.scales_gpu, + l.biases_gpu, + .01, + l.rolling_mean_gpu, // output (should be FP32) + l.rolling_variance_gpu, // output (should be FP32) + .00001, + l.mean_gpu, // output (should be FP32) + l.variance_gpu); // output (should be FP32) + + cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); + //forward_batchnorm_layer_gpu(l, state); + } + else // Detection + { + cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); + normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w); + scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h); + } + } + else // BIAS only { cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); - normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w); - scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); - add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h); + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); } } - else // BIAS only - { - cuda_convert_f16_to_f32(output16, output16_size, l.output_gpu); - add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); + else { + + //#else + + cudnnConvolutionForward(cudnn_handle(), + &alpha, //&one, + l.srcTensorDesc, + state.input, + l.weightDesc, + l.weights_gpu, + l.convDesc, + l.fw_algo, + state.workspace, + l.workspace_size, + &beta, //&one, + l.dstTensorDesc, + l.output_gpu); + + //cudaDeviceSynchronize(); + if (l.batch_normalize) { + forward_batchnorm_layer_gpu(l, state); + } + else { + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); + } + //#endif // CUDNN_HALF } -#else - - cudnnConvolutionForward(cudnn_handle(), - &alpha, //&one, - l.srcTensorDesc, - state.input, - l.weightDesc, - l.weights_gpu, - l.convDesc, - l.fw_algo, - state.workspace, - l.workspace_size, - &beta, //&one, - l.dstTensorDesc, - l.output_gpu); - - //cudaDeviceSynchronize(); -#endif // CUDNN_HALF - #else fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); @@ -418,16 +435,17 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) } gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n); } -#endif -#ifndef CUDNN_HALF if (l.batch_normalize) { forward_batchnorm_layer_gpu(l, state); } else { add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); } -#endif // no CUDNN_HALF +#endif + +//#ifndef CUDNN_HALF +//#endif // no CUDNN_HALF if (l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); //if(l.dot > 0) dot_error_gpu(l); @@ -441,13 +459,13 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); -#ifndef CUDNN_HALF - if(l.batch_normalize){ - backward_batchnorm_layer_gpu(l, state); - } else { - //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); - } -#endif // no CUDNN_HALF +//#ifndef CUDNN_HALF + //if(l.batch_normalize){ + // backward_batchnorm_layer_gpu(l, state); + //} else { + // //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); + //} +//#endif // no 
CUDNN_HALF float *original_input = state.input; if(l.xnor) state.input = l.binary_input_gpu; @@ -455,117 +473,126 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state float one = 1; float alpha = 1, beta = 0; -#ifdef CUDNN_HALF - - const size_t input16_size = l.batch*l.c*l.w*l.h; - const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h; - - if (*state.net.max_input16_size < input16_size) { - *state.net.max_input16_size = input16_size; - if(*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); - *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); - } - float *input16 = *state.net.input16_gpu; - - if (*state.net.max_output16_size < delta16_size) { - *state.net.max_output16_size = delta16_size; - if(*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); - *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); - } - float *delta16 = *state.net.output16_gpu; - - cuda_convert_f32_to_f16(state.input, input16_size, input16); - cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16); - - if (l.batch_normalize) { - //if (!state.train) { - // l.mean_gpu = l.rolling_mean_gpu; - // l.variance_gpu = l.rolling_variance_gpu; - //} - float one = 1; - float zero = 0; - cudnnBatchNormalizationBackward(cudnn_handle(), - CUDNN_BATCHNORM_SPATIAL, - &one, - &zero, - &one, - &one, - l.normDstTensorDescF16, - l.x_gpu, // input - l.normDstTensorDescF16, - delta16, // input - l.normDstTensorDescF16, - l.x_norm_gpu, // output - l.normTensorDesc, - l.scales_gpu, // output (should be FP32) - l.scale_updates_gpu, // output (should be FP32) - l.bias_updates_gpu, // output (should be FP32) - .00001, - l.mean_gpu, // input (should be FP32) - l.variance_gpu); // input (should be FP32) - copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1); - //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream()); - } - else +//#ifdef CUDNN_HALF + int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions); + if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > state.net.burn_in)) { - //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); - } - // convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16 - // get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16) + const size_t input16_size = l.batch*l.c*l.w*l.h; + const size_t delta16_size = l.batch*l.n*l.out_w*l.out_h; - // calculate conv weight updates - // Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum - // so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m - cuda_convert_f32_to_f16(l.weight_updates_gpu, l.c*l.n*l.size*l.size, l.weight_updates_gpu16); + if (*state.net.max_input16_size < input16_size) { + *state.net.max_input16_size = input16_size; + if (*state.net.input16_gpu) cuda_free(*state.net.input16_gpu); + *state.net.input16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_input16_size); + } + float *input16 = *state.net.input16_gpu; - cudnnConvolutionBackwardFilter(cudnn_handle(), - &one, - l.srcTensorDesc, - input16, //state.input, - l.ddstTensorDesc, - delta16, //l.delta_gpu, - l.convDesc, - l.bf_algo, - state.workspace, - l.workspace_size, - &one, - l.dweightDesc, - l.weight_updates_gpu16); // l.weight_updates_gpu); + if (*state.net.max_output16_size < delta16_size) { + 
*state.net.max_output16_size = delta16_size; + if (*state.net.output16_gpu) cuda_free(*state.net.output16_gpu); + *state.net.output16_gpu = (float *)cuda_make_f16_from_f32_array(NULL, *state.net.max_output16_size); + } + float *delta16 = *state.net.output16_gpu; - cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.c*l.n*l.size*l.size, l.weight_updates_gpu); + cuda_convert_f32_to_f16(state.input, input16_size, input16); + cuda_convert_f32_to_f16(l.delta_gpu, delta16_size, delta16); - if (state.delta) { - if (l.binary || l.xnor) swap_binary(&l); + if (l.batch_normalize) { + //if (!state.train) { + // l.mean_gpu = l.rolling_mean_gpu; + // l.variance_gpu = l.rolling_variance_gpu; + //} + float one = 1; + float zero = 0; + cudnnBatchNormalizationBackward(cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + &one, + &one, + l.normDstTensorDescF16, + l.x_gpu, // input + l.normDstTensorDescF16, + delta16, // input + l.normDstTensorDescF16, + l.x_norm_gpu, // output + l.normTensorDesc, + l.scales_gpu, // output (should be FP32) + l.scale_updates_gpu, // output (should be FP32) + l.bias_updates_gpu, // output (should be FP32) + .00001, + l.mean_gpu, // input (should be FP32) + l.variance_gpu); // input (should be FP32) + copy_ongpu(l.outputs*l.batch / 2, l.x_norm_gpu, 1, delta16, 1); + //cudaMemcpyAsync(delta16, l.x_norm_gpu, l.outputs*l.batch * sizeof(half), cudaMemcpyDefault, get_cuda_stream()); + } + else + { + //backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); + } - // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData - // calculate delta for the next layer - // convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16 - // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16) - cudnnConvolutionBackwardData(cudnn_handle(), - &alpha, - l.weightDesc, - l.weights_gpu16, //l.weights_gpu, - l.ddstTensorDesc, + // convert input: state.input (x), l.delta_gpu (y) from fp32 to fp16 + // get output: l.weight_updates_gpu (dw) and convert it to fp32 (ONLY if it is fp16) + + // calculate conv weight updates + // Already: l.weight_updates_gpu = (l.weight_updates_gpu - l.weight*decay*batch*subdivision)*momentum + // so we should copy f32 to f16, or compute: f16=(w_up - w*d*b*s)*m + cuda_convert_f32_to_f16(l.weight_updates_gpu, l.c*l.n*l.size*l.size, l.weight_updates_gpu16); + + cudnnConvolutionBackwardFilter(cudnn_handle(), + &one, + l.srcTensorDesc16, + input16, //state.input, + l.ddstTensorDesc16, delta16, //l.delta_gpu, l.convDesc, - l.bd_algo, + l.bf_algo16, state.workspace, l.workspace_size, - &beta, - l.dsrcTensorDesc, - input16); // state.delta); + &one, + l.dweightDesc, + l.weight_updates_gpu16); // l.weight_updates_gpu); - cuda_convert_f16_to_f32(input16, input16_size, state.delta); + cuda_convert_f16_to_f32(l.weight_updates_gpu16, l.c*l.n*l.size*l.size, l.weight_updates_gpu); - if (l.binary || l.xnor) swap_binary(&l); - if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + if (state.delta) { + if (l.binary || l.xnor) swap_binary(&l); + + // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData + // calculate delta for the next layer + // convert input: l.weights_gpu (w), l.delta_gpu (dy) from fp32 to fp16 + // get output: state.delta (dx) and convert it to fp32 (ONLY if it is fp16) + cudnnConvolutionBackwardData(cudnn_handle(), + &alpha, + l.weightDesc16, + l.weights_gpu16, //l.weights_gpu, + 
l.ddstTensorDesc16, + delta16, //l.delta_gpu, + l.convDesc, + l.bd_algo16, + state.workspace, + l.workspace_size, + &beta, + l.dsrcTensorDesc16, + input16); // state.delta); + + cuda_convert_f16_to_f32(input16, input16_size, state.delta); + + if (l.binary || l.xnor) swap_binary(&l); + if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + } } -#else // CUDNN_HALF + else { + //#else // CUDNN_HALF - // calculate conv weight updates - // if used: beta=1 then loss decreases faster - cudnnConvolutionBackwardFilter(cudnn_handle(), + if(l.batch_normalize){ + backward_batchnorm_layer_gpu(l, state); + } + + // calculate conv weight updates + // if used: beta=1 then loss decreases faster + cudnnConvolutionBackwardFilter(cudnn_handle(), &one, l.srcTensorDesc, state.input, @@ -579,11 +606,11 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state l.dweightDesc, l.weight_updates_gpu); - if(state.delta){ - if(l.binary || l.xnor) swap_binary(&l); - // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData - // calculate delta for the next layer - cudnnConvolutionBackwardData(cudnn_handle(), + if (state.delta) { + if (l.binary || l.xnor) swap_binary(&l); + // http://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html#cudnnConvolutionBackwardData + // calculate delta for the next layer + cudnnConvolutionBackwardData(cudnn_handle(), &one, l.weightDesc, l.weights_gpu, @@ -596,13 +623,18 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state &one, l.dsrcTensorDesc, state.delta); - if(l.binary || l.xnor) swap_binary(&l); - if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + if (l.binary || l.xnor) swap_binary(&l); + if (l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + } } -#endif // CUDNN_HALF +//#endif // CUDNN_HALF #else // CUDNN + if (l.batch_normalize) { + backward_batchnorm_layer_gpu(l, state); + } + int m = l.n; int n = l.size*l.size*l.c; int k = l.out_w*l.out_h; diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 8e83758c..9fc33c0c 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -141,14 +141,12 @@ size_t get_workspace_size(layer l){ void cudnn_convolutional_setup(layer *l, int cudnn_preference) { -#ifdef CUDNN_HALF +// CUDNN_HALF // TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0): // Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100 // PSEUDO_HALF_CONFIG is required for Tensor Cores - our case! 
- const cudnnDataType_t data_type = CUDNN_DATA_HALF; -#else + cudnnDataType_t data_type = CUDNN_DATA_FLOAT; -#endif #if(CUDNN_MAJOR >= 7) // Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH @@ -179,11 +177,25 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); +#ifdef CUDNN_HALF + // backward delta + cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w); + cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w); + cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); + + // forward + cudnnSetTensor4dDescriptor(l->srcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w); + cudnnSetTensor4dDescriptor(l->dstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w); + cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); + + // batch norm + cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w); +#endif + // batch norm cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); - cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w); #if(CUDNN_MAJOR >= 6) cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0 #else @@ -225,32 +237,32 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) 0, &l->bf_algo); - if (data_type == CUDNN_DATA_HALF) + //if (data_type == CUDNN_DATA_HALF) { // HALF-16 if(data_type == CUDNN_DATA_HALF) - l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; - l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; // FLOAT-32 if(data_type == CUDNN_DATA_FLOAT) - //l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; - //l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED; - //l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED; + //l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; + //l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED; + //l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED; int fw = 0, bd = 0, bf = 0; - if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1; + if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1; //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n"); - if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2; + if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2; //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n"); - if 
(l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1; + if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1; //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n"); - if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2; + if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2; //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n"); - if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1; + if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1; //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n"); - if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2; + if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2; //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n"); //if (fw == 2 && bd == 2 && bf == 2) printf("TF "); @@ -260,12 +272,13 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) #endif #endif -convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output) +convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index) { int i; convolutional_layer l = {0}; l.type = CONVOLUTIONAL; + l.index = index; l.h = h; l.w = w; l.c = c; @@ -392,15 +405,24 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); } #ifdef CUDNN - cudnnCreateTensorDescriptor(&l.normDstTensorDesc); - cudnnCreateTensorDescriptor(&l.normDstTensorDescF16); cudnnCreateTensorDescriptor(&l.normTensorDesc); + + cudnnCreateTensorDescriptor(&l.normDstTensorDesc); cudnnCreateTensorDescriptor(&l.srcTensorDesc); cudnnCreateTensorDescriptor(&l.dstTensorDesc); cudnnCreateFilterDescriptor(&l.weightDesc); cudnnCreateTensorDescriptor(&l.dsrcTensorDesc); cudnnCreateTensorDescriptor(&l.ddstTensorDesc); cudnnCreateFilterDescriptor(&l.dweightDesc); + + cudnnCreateTensorDescriptor(&l.normDstTensorDescF16); + cudnnCreateTensorDescriptor(&l.srcTensorDesc16); + cudnnCreateTensorDescriptor(&l.dstTensorDesc16); + cudnnCreateFilterDescriptor(&l.weightDesc16); + cudnnCreateTensorDescriptor(&l.dsrcTensorDesc16); + cudnnCreateTensorDescriptor(&l.ddstTensorDesc16); + cudnnCreateFilterDescriptor(&l.dweightDesc16); + cudnnCreateConvolutionDescriptor(&l.convDesc); cudnn_convolutional_setup(&l, cudnn_fastest); #endif @@ -436,7 +458,7 @@ void denormalize_convolutional_layer(convolutional_layer l) void test_convolutional_layer() { - convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0); + convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0, 0); l.batch_normalize = 1; float data[] = {1,1,1,1,1, 1,1,1,1,1, diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 20e9d68e..0bd9849e 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -25,7 +25,7 @@ void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16); #endif #endif -convolutional_layer 
make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output); +convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index); void denormalize_convolutional_layer(convolutional_layer l); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); void forward_convolutional_layer(const convolutional_layer layer, network_state state); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index f78e0c9d..db384cfb 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -48,17 +48,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.input_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0); + *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0, 0); l.input_layer->batch = batch; l.self_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0); + *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0, 0); l.self_layer->batch = batch; l.output_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0); + *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0, 0); l.output_layer->batch = batch; l.output = l.output_layer->output; diff --git a/src/detector.c b/src/detector.c index ba4d809b..ee25685c 100644 --- a/src/detector.c +++ b/src/detector.c @@ -91,8 +91,9 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i int init_w = net.w; int init_h = net.h; - int iter_save; + int iter_save, iter_save_last; iter_save = get_current_batch(net); + iter_save_last = get_current_batch(net); load_args args = {0}; args.w = net.w; @@ -210,7 +211,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i //if (i % 1000 == 0 || (i < 1000 && i % 100 == 0)) { //if (i % 100 == 0) { - if(i >= (iter_save + 100)) { + if(i >= (iter_save + 1000)) { iter_save = i; #ifdef GPU if (ngpus != 1) sync_nets(nets, ngpus, 0); @@ -219,6 +220,16 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); save_weights(net, buff); } + + if (i >= (iter_save_last + 100)) { + iter_save_last = i; +#ifdef GPU + if (ngpus != 1) sync_nets(nets, ngpus, 0); +#endif + char buff[256]; + sprintf(buff, "%s/%s_last.weights", backup_directory, base, i); + save_weights(net, buff); + } free_data(train); } #ifdef GPU diff --git a/src/layer.h b/src/layer.h index 2cfa0871..a4ebbfbc 100644 --- a/src/layer.h +++ b/src/layer.h @@ -299,14 +299,16 @@ struct layer{ float * norms_gpu; #ifdef CUDNN cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc; + cudnnTensorDescriptor_t srcTensorDesc16, dstTensorDesc16; 
cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc; + cudnnTensorDescriptor_t dsrcTensorDesc16, ddstTensorDesc16; cudnnTensorDescriptor_t normTensorDesc, normDstTensorDesc, normDstTensorDescF16; - cudnnFilterDescriptor_t weightDesc; - cudnnFilterDescriptor_t dweightDesc; + cudnnFilterDescriptor_t weightDesc, weightDesc16; + cudnnFilterDescriptor_t dweightDesc, dweightDesc16; cudnnConvolutionDescriptor_t convDesc; - cudnnConvolutionFwdAlgo_t fw_algo; - cudnnConvolutionBwdDataAlgo_t bd_algo; - cudnnConvolutionBwdFilterAlgo_t bf_algo; + cudnnConvolutionFwdAlgo_t fw_algo, fw_algo16; + cudnnConvolutionBwdDataAlgo_t bd_algo, bd_algo16; + cudnnConvolutionBwdFilterAlgo_t bf_algo, bf_algo16; cudnnPoolingDescriptor_t poolingDesc; #endif #endif diff --git a/src/network.h b/src/network.h index 18dc5953..f630fb96 100644 --- a/src/network.h +++ b/src/network.h @@ -42,6 +42,7 @@ typedef struct network{ int *steps; int num_steps; int burn_in; + int cudnn_half; int adam; float B1; diff --git a/src/parser.c b/src/parser.c index c82c4a2f..c1647666 100644 --- a/src/parser.c +++ b/src/parser.c @@ -165,7 +165,7 @@ convolutional_layer parse_convolutional(list *options, size_params params) int xnor = option_find_int_quiet(options, "xnor", 0); int use_bin_output = option_find_int_quiet(options, "bin_output", 0); - convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output); + convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output, params.index); layer.flipped = option_find_int_quiet(options, "flipped", 0); layer.dot = option_find_float_quiet(options, "dot", 0); @@ -655,7 +655,8 @@ void parse_net_options(list *options, network *net) net->policy = get_policy(policy_s); net->burn_in = option_find_int_quiet(options, "burn_in", 0); #ifdef CUDNN_HALF - net->burn_in = 0; + //net->burn_in = 0; + net->cudnn_half = 1; #endif if(net->policy == STEP){ net->step = option_find_int(options, "step", 1);
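
For readers following the fix outside the diff: the heart of the change is a runtime gate that keeps every convolutional layer on the FP32 cuDNN path while burn_in iterations are still running, and only afterwards switches eligible layers (not the first layer, not XNOR layers) to the Tensor-Core FP16 path. The condition below is copied from the patched forward/backward passes; the small structs and the main() harness are illustrative stand-ins for darknet's layer / network_state types (which reach these fields through pointers such as *state.net.seen), so treat it as a sketch of the logic, not drop-in darknet code.

    #include <stdio.h>
    #include <stddef.h>

    /* Illustrative stand-ins for the darknet fields the gate reads. */
    typedef struct { int xnor; } layer_t;

    typedef struct {
        size_t seen;            /* images processed so far (*state.net.seen) */
        int batch, subdivisions;
        int burn_in;            /* [net] burn_in from the cfg file           */
        int cudnn_half;         /* set in parse_net_options when CUDNN_HALF  */
    } net_t;

    typedef struct {
        int index;              /* layer index; layer 0 always stays FP32    */
        int train;              /* 1 while training, 0 at detection time     */
        net_t net;
    } state_t;

    /* Same condition the patch adds to the forward and backward conv passes:
       use the Tensor-Core FP16 path only after burn_in, and never for the
       first layer or for XNOR layers. */
    static int use_fp16_path(const layer_t *l, const state_t *s)
    {
        int iteration_num = (int)(s->net.seen / (size_t)(s->net.batch * s->net.subdivisions));
        return s->index != 0
            && s->net.cudnn_half
            && !l->xnor
            && (!s->train || iteration_num > s->net.burn_in);
    }

    int main(void)
    {
        layer_t l;  l.xnor = 0;
        state_t s;
        s.index = 5;  s.train = 1;
        s.net.batch = 64;  s.net.subdivisions = 8;
        s.net.burn_in = 1000;  s.net.cudnn_half = 1;

        s.net.seen = (size_t)64 * 8 * 500;     /* iteration 500: still inside burn_in  */
        printf("iteration  500: fp16 = %d\n", use_fp16_path(&l, &s));   /* prints 0 */

        s.net.seen = (size_t)64 * 8 * 2000;    /* iteration 2000: Tensor Cores enabled */
        printf("iteration 2000: fp16 = %d\n", use_fp16_path(&l, &s));   /* prints 1 */
        return 0;
    }

The backward pass applies the same gate, and its FP32 branch is where backward_batchnorm_layer_gpu is called again, so early training (the burn-in phase) now runs entirely in FP32 instead of the previous always-on CUDNN_HALF path.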
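
The other half-precision change worth calling out is the conversion kernel: the patch switches cuda_f32_to_f16 from __float2half to the explicit round-to-nearest intrinsic __float2half_rn (the diff comment also lists __float2half_ru, _rd and _rz as alternatives). The standalone program below reproduces that kernel together with its f16-to-f32 counterpart so the conversion can be exercised in isolation; the BLOCK constant and the main() driver are assumptions for this sketch, not part of the patch.

    /* Build: nvcc -o f16_convert f16_convert.cu */
    #include <cuda_fp16.h>
    #include <cuda_runtime.h>
    #include <stdio.h>

    #define BLOCK 512   /* assumption for this sketch; darknet defines its own BLOCK */

    /* Mirrors the patched cuda_f32_to_f16 kernel: the explicit round-to-nearest
       intrinsic __float2half_rn replaces the plain __float2half call. */
    __global__ void cuda_f32_to_f16(float *input_f32, size_t size, half *output_f16)
    {
        size_t idx = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
        if (idx < size) output_f16[idx] = __float2half_rn(input_f32[idx]);
    }

    /* Counterpart used after the FP16 convolution to hand FP32 back to the
       rest of the network (cuda_f16_to_f32 in the patch). */
    __global__ void cuda_f16_to_f32(half *input_f16, size_t size, float *output_f32)
    {
        size_t idx = blockIdx.x * (size_t)blockDim.x + threadIdx.x;
        if (idx < size) output_f32[idx] = __half2float(input_f16[idx]);
    }

    int main(void)
    {
        const int n = 4;
        float h_in[4] = { 1.0005f, -3.14159f, 65504.0f, 1e-8f };
        float h_out[4];
        float *d_in, *d_out;
        half  *d_half;

        cudaMalloc((void **)&d_in,   n * sizeof(float));
        cudaMalloc((void **)&d_out,  n * sizeof(float));
        cudaMalloc((void **)&d_half, n * sizeof(half));
        cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

        int grid = (n + BLOCK - 1) / BLOCK;
        cuda_f32_to_f16<<<grid, BLOCK>>>(d_in, (size_t)n, d_half);
        cuda_f16_to_f32<<<grid, BLOCK>>>(d_half, (size_t)n, d_out);
        cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);

        for (int i = 0; i < n; i++)
            printf("%.8g -> fp16 -> %.8g\n", h_in[i], h_out[i]);  /* shows the precision loss */

        cudaFree(d_in); cudaFree(d_out); cudaFree(d_half);
        return 0;
    }

Round-to-nearest-even (_rn) is the IEEE-754 default rounding mode; _ru, _rd and _rz round up, down and toward zero respectively.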