mirror of
https://github.com/pjreddie/darknet.git
synced 2023-08-10 21:13:14 +03:00
Speed up Tensor Cores: the 1st layer uses FP32, and GPU memory for Tensor Cores is pre-allocated
This commit is contained in:
@ -295,8 +295,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
|
|||||||
//#ifdef CUDNN_HALF
|
//#ifdef CUDNN_HALF
|
||||||
//if (state.use_mixed_precision) {
|
//if (state.use_mixed_precision) {
|
||||||
int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
|
int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
|
||||||
if (state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in))
|
if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in))
|
||||||
//if(state.index != 0)
|
|
||||||
{
|
{
|
||||||
//printf("\n CUDNN_HALF!!! state.index = %d \n", state.index);
|
//printf("\n CUDNN_HALF!!! state.index = %d \n", state.index);
|
||||||
|
|
||||||
@ -476,8 +475,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
|
|||||||
|
|
||||||
//#ifdef CUDNN_HALF
|
//#ifdef CUDNN_HALF
|
||||||
int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
|
int iteration_num = (*state.net.seen) / (state.net.batch*state.net.subdivisions);
|
||||||
if (state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in))
|
if (state.index != 0 && state.net.cudnn_half && !l.xnor && (!state.train || iteration_num > 3*state.net.burn_in))
|
||||||
//if (state.index != 0)
|
|
||||||
{
|
{
|
||||||
|
|
||||||
const size_t input16_size = l.batch*l.c*l.w*l.h;
|
const size_t input16_size = l.batch*l.c*l.w*l.h;
|
||||||
|
12
src/parser.c
12
src/parser.c
@ -732,6 +732,8 @@ network parse_network_cfg_custom(char *filename, int batch)
|
|||||||
|
|
||||||
float bflops = 0;
|
float bflops = 0;
|
||||||
size_t workspace_size = 0;
|
size_t workspace_size = 0;
|
||||||
|
size_t max_inputs = 0;
|
||||||
|
size_t max_outputs = 0;
|
||||||
n = n->next;
|
n = n->next;
|
||||||
int count = 0;
|
int count = 0;
|
||||||
free_section(s);
|
free_section(s);
|
||||||
@ -806,6 +808,8 @@ network parse_network_cfg_custom(char *filename, int batch)
|
|||||||
option_unused(options);
|
option_unused(options);
|
||||||
net.layers[count] = l;
|
net.layers[count] = l;
|
||||||
if (l.workspace_size > workspace_size) workspace_size = l.workspace_size;
|
if (l.workspace_size > workspace_size) workspace_size = l.workspace_size;
|
||||||
|
if (l.inputs > max_inputs) max_inputs = l.inputs;
|
||||||
|
if (l.outputs > max_outputs) max_outputs = l.outputs;
|
||||||
free_section(s);
|
free_section(s);
|
||||||
n = n->next;
|
n = n->next;
|
||||||
++count;
|
++count;
|
||||||
@ -828,6 +832,14 @@ network parse_network_cfg_custom(char *filename, int batch)
|
|||||||
net.workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1);
|
net.workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1);
|
||||||
int size = get_network_input_size(net) * net.batch;
|
int size = get_network_input_size(net) * net.batch;
|
||||||
net.input_state_gpu = cuda_make_array(0, size);
|
net.input_state_gpu = cuda_make_array(0, size);
|
||||||
|
|
||||||
|
// pre-allocate memory for inference on Tensor Cores (fp16)
|
||||||
|
if (net.cudnn_half) {
|
||||||
|
*net.max_input16_size = max_inputs;
|
||||||
|
check_error(cudaMalloc((void **)net.input16_gpu, *net.max_input16_size * sizeof(short))); //sizeof(half)
|
||||||
|
*net.max_output16_size = max_outputs;
|
||||||
|
check_error(cudaMalloc((void **)net.output16_gpu, *net.max_output16_size * sizeof(short))); //sizeof(half)
|
||||||
|
}
|
||||||
}else {
|
}else {
|
||||||
net.workspace = calloc(1, workspace_size);
|
net.workspace = calloc(1, workspace_size);
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user