Accelerated by another 5% using FP16/32 Batch-norm for Tensor Cores.

This commit is contained in:
AlexeyAB
2018-04-17 02:51:11 +03:00
parent 701f4fab63
commit 9bae70b225
5 changed files with 105 additions and 20 deletions

View File

@ -178,6 +178,8 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference)
// batch norm
cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
cudnnSetTensor4dDescriptor(l->normDstTensorDescF16, CUDNN_TENSOR_NCHW, data_type, l->batch, l->out_c, l->out_h, l->out_w);
#if(CUDNN_MAJOR >= 6)
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0
#else
@ -379,6 +381,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
}
#ifdef CUDNN
cudnnCreateTensorDescriptor(&l.normDstTensorDesc);
cudnnCreateTensorDescriptor(&l.normDstTensorDescF16);
cudnnCreateTensorDescriptor(&l.normTensorDesc);
cudnnCreateTensorDescriptor(&l.srcTensorDesc);
cudnnCreateTensorDescriptor(&l.dstTensorDesc);