mirror of
https://github.com/pjreddie/darknet.git
synced 2023-08-10 21:13:14 +03:00
Fixed memory allocation
This commit is contained in:
@ -87,7 +87,7 @@ __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_
|
||||
extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state)
|
||||
{
|
||||
|
||||
#ifdef CUDNN
|
||||
#ifdef CUDNN_DISABLED
|
||||
if (!state.train && layer.stride == layer.size) {
|
||||
// cudnnPoolingBackward
|
||||
cudnnStatus_t maxpool_status;
|
||||
|
45
src/parser.c
45
src/parser.c
@ -844,32 +844,37 @@ network parse_network_cfg_custom(char *filename, int batch)
|
||||
net.outputs = get_network_output_size(net);
|
||||
net.output = get_network_output(net);
|
||||
printf("Total BFLOPS %5.3f \n", bflops);
|
||||
if(workspace_size){
|
||||
//printf("%ld\n", workspace_size);
|
||||
#ifdef GPU
|
||||
get_cuda_stream();
|
||||
get_cuda_memcpy_stream();
|
||||
if(gpu_index >= 0){
|
||||
net.workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1);
|
||||
int size = get_network_input_size(net) * net.batch;
|
||||
net.input_state_gpu = cuda_make_array(0, size);
|
||||
if (cudaSuccess == cudaHostAlloc(&net.input_pinned_cpu, size*sizeof(float), cudaHostRegisterMapped)) net.input_pinned_cpu_flag = 1;
|
||||
else net.input_pinned_cpu = calloc(size, sizeof(float));
|
||||
get_cuda_stream();
|
||||
get_cuda_memcpy_stream();
|
||||
if (gpu_index >= 0)
|
||||
{
|
||||
int size = get_network_input_size(net) * net.batch;
|
||||
net.input_state_gpu = cuda_make_array(0, size);
|
||||
if (cudaSuccess == cudaHostAlloc(&net.input_pinned_cpu, size * sizeof(float), cudaHostRegisterMapped)) net.input_pinned_cpu_flag = 1;
|
||||
else net.input_pinned_cpu = calloc(size, sizeof(float));
|
||||
|
||||
// pre-allocate memory for inference on Tensor Cores (fp16)
|
||||
if (net.cudnn_half) {
|
||||
*net.max_input16_size = max_inputs;
|
||||
check_error(cudaMalloc((void **)net.input16_gpu, *net.max_input16_size * sizeof(short))); //sizeof(half)
|
||||
*net.max_output16_size = max_outputs;
|
||||
check_error(cudaMalloc((void **)net.output16_gpu, *net.max_output16_size * sizeof(short))); //sizeof(half)
|
||||
}
|
||||
}else {
|
||||
// pre-allocate memory for inference on Tensor Cores (fp16)
|
||||
if (net.cudnn_half) {
|
||||
*net.max_input16_size = max_inputs;
|
||||
check_error(cudaMalloc((void **)net.input16_gpu, *net.max_input16_size * sizeof(short))); //sizeof(half)
|
||||
*net.max_output16_size = max_outputs;
|
||||
check_error(cudaMalloc((void **)net.output16_gpu, *net.max_output16_size * sizeof(short))); //sizeof(half)
|
||||
}
|
||||
if (workspace_size) {
|
||||
net.workspace = cuda_make_array(0, workspace_size / sizeof(float) + 1);
|
||||
}
|
||||
else {
|
||||
net.workspace = calloc(1, workspace_size);
|
||||
}
|
||||
#else
|
||||
net.workspace = calloc(1, workspace_size);
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
if (workspace_size) {
|
||||
net.workspace = calloc(1, workspace_size);
|
||||
}
|
||||
#endif
|
||||
|
||||
LAYER_TYPE lt = net.layers[net.n - 1].type;
|
||||
if ((net.w % 32 != 0 || net.h % 32 != 0) && (lt == YOLO || lt == REGION || lt == DETECTION)) {
|
||||
printf("\n Warning: width=%d and height=%d in cfg-file must be divisible by 32 for default networks Yolo v1/v2/v3!!! \n\n",
|
||||
|
@ -579,3 +579,14 @@ void get_region_detections(layer l, int w, int h, int netw, int neth, float thre
|
||||
}
|
||||
correct_region_boxes(dets, l.w*l.h*l.n, w, h, netw, neth, relative);
|
||||
}
|
||||
|
||||
// Clear the objectness score of every prediction in the region layer:
// for each spatial cell and each anchor, write 0 into the output slot
// that entry_index() resolves for offset l.coords (the objectness entry,
// which follows the l.coords box coordinates in each prediction).
void zero_objectness(layer l)
{
    int cell, anchor;
    const int cells = l.w * l.h;
    for (cell = 0; cell < cells; ++cell) {
        for (anchor = 0; anchor < l.n; ++anchor) {
            l.output[entry_index(l, 0, anchor * cells + cell, l.coords)] = 0;
        }
    }
}
|
@ -13,6 +13,7 @@ void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *b
|
||||
void resize_region_layer(layer *l, int w, int h);
|
||||
void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets);
|
||||
void correct_region_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative);
|
||||
void zero_objectness(layer l);
|
||||
|
||||
#ifdef GPU
|
||||
void forward_region_layer_gpu(const region_layer l, network_state state);
|
||||
|
Reference in New Issue
Block a user