mirror of
https://github.com/pjreddie/darknet.git
synced 2023-08-10 21:13:14 +03:00
Fixed nan issue for training with CUDNN_HALF=1 by using Tensor Cores
This commit is contained in:
@ -141,14 +141,12 @@ size_t get_workspace_size(layer l){
|
||||
void cudnn_convolutional_setup(layer *l, int cudnn_preference)
|
||||
{
|
||||
|
||||
#ifdef CUDNN_HALF
|
||||
// CUDNN_HALF
|
||||
// TRUE_HALF_CONFIG is only supported on architectures with true fp16 support (compute capability 5.3 and 6.0):
|
||||
// Tegra X1, Jetson TX1, DRIVE CX, DRIVE PX, Quadro GP100, Tesla P100
|
||||
// PSEUDO_HALF_CONFIG is required for Tensor Cores - our case!
|
||||
const cudnnDataType_t data_type = CUDNN_DATA_HALF;
|
||||
#else
|
||||
|
||||
cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
|
||||
#endif
|
||||
|
||||
#if(CUDNN_MAJOR >= 7)
|
||||
// Tensor Core uses CUDNN_TENSOR_OP_MATH instead of CUDNN_DEFAULT_MATH
|
||||
@ -179,11 +177,25 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
|
||||
cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
cudnnSetFilter4dDescriptor(l->weightDesc, data_type, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
|
||||
|
||||
#ifdef CUDNN_HALF
|
||||
// backward delta
|
||||
cudnnSetTensor4dDescriptor(l->dsrcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w);
|
||||
cudnnSetTensor4dDescriptor(l->ddstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
cudnnSetFilter4dDescriptor(l->dweightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
|
||||
|
||||
// forward
|
||||
cudnnSetTensor4dDescriptor(l->srcTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->c, l->h, l->w);
|
||||
cudnnSetTensor4dDescriptor(l->dstTensorDesc16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
cudnnSetFilter4dDescriptor(l->weightDesc16, CUDNN_DATA_HALF, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
|
||||
|
||||
// batch norm
|
||||
cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, CUDNN_DATA_HALF, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
#endif
|
||||
|
||||
// batch norm
|
||||
cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
|
||||
cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
|
||||
cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
#if(CUDNN_MAJOR >= 6)
|
||||
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0
|
||||
#else
|
||||
@ -225,32 +237,32 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
|
||||
0,
|
||||
&l->bf_algo);
|
||||
|
||||
if (data_type == CUDNN_DATA_HALF)
|
||||
//if (data_type == CUDNN_DATA_HALF)
|
||||
{
|
||||
// HALF-16 if(data_type == CUDNN_DATA_HALF)
|
||||
l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
|
||||
l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
|
||||
l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
|
||||
l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
|
||||
l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
|
||||
l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
|
||||
|
||||
// FLOAT-32 if(data_type == CUDNN_DATA_FLOAT)
|
||||
//l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
|
||||
//l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
|
||||
//l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
|
||||
//l->fw_algo16 = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
|
||||
//l->bd_algo16 = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED;
|
||||
//l->bf_algo16 = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED;
|
||||
|
||||
int fw = 0, bd = 0, bf = 0;
|
||||
if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
|
||||
if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1;
|
||||
//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n");
|
||||
if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
|
||||
if (l->fw_algo16 == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2;
|
||||
//printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n");
|
||||
|
||||
if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
|
||||
if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1;
|
||||
//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n");
|
||||
if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
|
||||
if (l->bd_algo16 == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2;
|
||||
//printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n");
|
||||
|
||||
if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
|
||||
if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1;
|
||||
//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n");
|
||||
if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
|
||||
if (l->bf_algo16 == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2;
|
||||
//printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n");
|
||||
|
||||
//if (fw == 2 && bd == 2 && bf == 2) printf("TF ");
|
||||
@ -260,12 +272,13 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output)
|
||||
convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output, int index)
|
||||
{
|
||||
int i;
|
||||
convolutional_layer l = {0};
|
||||
l.type = CONVOLUTIONAL;
|
||||
|
||||
l.index = index;
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = c;
|
||||
@ -392,15 +405,24 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
|
||||
l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
|
||||
}
|
||||
#ifdef CUDNN
|
||||
cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
|
||||
cudnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
|
||||
cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.srcTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
cudnnCreateFilterDescriptor(&l.weightDesc);
|
||||
cudnnCreateTensorDescriptor(&l.dsrcTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.ddstTensorDesc);
|
||||
cudnnCreateFilterDescriptor(&l.dweightDesc);
|
||||
|
||||
cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
|
||||
cudnnCreateTensorDescriptor(&l.srcTensorDesc16);
|
||||
cudnnCreateTensorDescriptor(&l.dstTensorDesc16);
|
||||
cudnnCreateFilterDescriptor(&l.weightDesc16);
|
||||
cudnnCreateTensorDescriptor(&l.dsrcTensorDesc16);
|
||||
cudnnCreateTensorDescriptor(&l.ddstTensorDesc16);
|
||||
cudnnCreateFilterDescriptor(&l.dweightDesc16);
|
||||
|
||||
cudnnCreateConvolutionDescriptor(&l.convDesc);
|
||||
cudnn_convolutional_setup(&l, cudnn_fastest);
|
||||
#endif
|
||||
@ -436,7 +458,7 @@ void denormalize_convolutional_layer(convolutional_layer l)
|
||||
|
||||
void test_convolutional_layer()
|
||||
{
|
||||
convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0);
|
||||
convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0, 0);
|
||||
l.batch_normalize = 1;
|
||||
float data[] = {1,1,1,1,1,
|
||||
1,1,1,1,1,
|
||||
|
Reference in New Issue
Block a user