Fixed memory allocation

2023-08-10 21:13:14 +03:00 · 2019-01-26 23:25:09 +03:00
parent 090d934c0f
commit 9576cd4d89
4 changed files with 38 additions and 21 deletions
--- a/src/parser.c
+++ b/src/parser.c
@ -844,32 +844,37 @@ network parse_network_cfg_custom(char *filename, int batch)
    net.outputs = get_network_output_size(net);
    net.output = get_network_output(net);
    printf("Total BFLOPS %5.3f \n", bflops);
-    if(workspace_size){
        //printf("%ld\n", workspace_size);
 #ifdef GPU
-        get_cuda_stream();
-        get_cuda_memcpy_stream();
-        if(gpu_index >= 0){
-            net.workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1);
-            int size = get_network_input_size(net) * net.batch;
-            net.input_state_gpu = cuda_make_array(0, size);
-            if (cudaSuccess == cudaHostAlloc(&net.input_pinned_cpu, size*sizeof(float), cudaHostRegisterMapped)) net.input_pinned_cpu_flag = 1;
-            else net.input_pinned_cpu = calloc(size, sizeof(float));
+    get_cuda_stream();
+    get_cuda_memcpy_stream();
+    if (gpu_index >= 0)
+    {
+        int size = get_network_input_size(net) * net.batch;
+        net.input_state_gpu = cuda_make_array(0, size);
+        if (cudaSuccess == cudaHostAlloc(&net.input_pinned_cpu, size * sizeof(float), cudaHostRegisterMapped)) net.input_pinned_cpu_flag = 1;
+        else net.input_pinned_cpu = calloc(size, sizeof(float));

-            // pre-allocate memory for inference on Tensor Cores (fp16)
-            if (net.cudnn_half) {
-                *net.max_input16_size = max_inputs;
-                check_error(cudaMalloc((void **)net.input16_gpu, *net.max_input16_size * sizeof(short))); //sizeof(half)
-                *net.max_output16_size = max_outputs;
-                check_error(cudaMalloc((void **)net.output16_gpu, *net.max_output16_size * sizeof(short))); //sizeof(half)
-            }
-        }else {
+        // pre-allocate memory for inference on Tensor Cores (fp16)
+        if (net.cudnn_half) {
+            *net.max_input16_size = max_inputs;
+            check_error(cudaMalloc((void **)net.input16_gpu, *net.max_input16_size * sizeof(short))); //sizeof(half)
+            *net.max_output16_size = max_outputs;
+            check_error(cudaMalloc((void **)net.output16_gpu, *net.max_output16_size * sizeof(short))); //sizeof(half)
+        }
+        if (workspace_size) {
+            net.workspace = cuda_make_array(0, workspace_size / sizeof(float) + 1);
+        }
+        else {
            net.workspace = calloc(1, workspace_size);
        }
-#else
-        net.workspace = calloc(1, workspace_size);
-#endif
    }
+#else
+        if (workspace_size) {
+            net.workspace = calloc(1, workspace_size);
+        }
+#endif
+
    LAYER_TYPE lt = net.layers[net.n - 1].type;
    if ((net.w % 32 != 0 || net.h % 32 != 0) && (lt == YOLO || lt == REGION || lt == DETECTION)) {
        printf("\n Warning: width=%d and height=%d in cfg-file must be divisible by 32 for default networks Yolo v1/v2/v3!!! \n\n",