From 9d23aad8696268e8ce3a94fee9490fd1db000dc8 Mon Sep 17 00:00:00 2001 From: AlexeyAB Date: Sun, 31 Dec 2017 20:10:32 +0300 Subject: [PATCH] Added CUDA-streams to Darknet-Yolo forward inference --- build/darknet/yolo_console_dll.vcxproj | 4 ++-- src/activation_kernels.cu | 2 +- src/blas_kernels.cu | 28 +++++++++++------------ src/cuda.c | 25 ++++++++++++++++++--- src/cuda.h | 1 + src/gemm.c | 1 + src/im2col_kernels.cu | 2 +- src/maxpool_layer_kernels.cu | 2 +- src/region_layer.c | 1 + src/yolo_console_dll.cpp | 31 ++++++++++++++++++-------- 10 files changed, 66 insertions(+), 31 deletions(-) diff --git a/build/darknet/yolo_console_dll.vcxproj b/build/darknet/yolo_console_dll.vcxproj index 104863fa..176d70ba 100644 --- a/build/darknet/yolo_console_dll.vcxproj +++ b/build/darknet/yolo_console_dll.vcxproj @@ -115,14 +115,14 @@ true true true - C:\opencv_3.0\opencv\build\include + C:\opencv_source\opencv\bin\install\include _CRT_SECURE_NO_WARNINGS;_MBCS;%(PreprocessorDefinitions) Async true true - C:\opencv_3.0\opencv\build\x64\vc14\lib;C:\opencv_2.4.13\opencv\build\x64\vc12\lib + C:\opencv_source\opencv\bin\install\x64\vc14\lib;C:\opencv_3.0\opencv\build\x64\vc14\lib;C:\opencv_2.4.13\opencv\build\x64\vc12\lib diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index 994e2068..d5f25a0b 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -154,7 +154,7 @@ __global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delt extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a) { - activate_array_kernel<<>>(x, n, a); + activate_array_kernel<<>>(x, n, a); check_error(cudaPeekAtLastError()); } diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index 79fc1c1d..8e1cf19e 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -23,7 +23,7 @@ void scale_bias_gpu(float *output, float *biases, int batch, int n, int size) dim3 dimGrid((size-1)/BLOCK + 1, n, batch); dim3 dimBlock(BLOCK, 1, 1); - scale_bias_kernel<<>>(output, biases, n, size); + scale_bias_kernel<<>>(output, biases, n, size); check_error(cudaPeekAtLastError()); } @@ -67,7 +67,7 @@ void add_bias_gpu(float *output, float *biases, int batch, int n, int size) dim3 dimGrid((size-1)/BLOCK + 1, n, batch); dim3 dimBlock(BLOCK, 1, 1); - add_bias_kernel<<>>(output, biases, n, size); + add_bias_kernel<<>>(output, biases, n, size); check_error(cudaPeekAtLastError()); } @@ -427,7 +427,7 @@ __global__ void mul_kernel(int N, float *X, int INCX, float *Y, int INCY) extern "C" void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial) { size_t N = batch*filters*spatial; - normalize_kernel<<>>(N, x, mean, variance, batch, filters, spatial); + normalize_kernel<<>>(N, x, mean, variance, batch, filters, spatial); check_error(cudaPeekAtLastError()); } @@ -490,13 +490,13 @@ __global__ void fast_variance_kernel(float *x, float *mean, int batch, int filt extern "C" void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean) { - fast_mean_kernel<<>>(x, batch, filters, spatial, mean); + fast_mean_kernel<<>>(x, batch, filters, spatial, mean); check_error(cudaPeekAtLastError()); } extern "C" void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance) { - fast_variance_kernel<<>>(x, mean, batch, filters, spatial, variance); + fast_variance_kernel<<>>(x, mean, batch, filters, spatial, variance); check_error(cudaPeekAtLastError()); } @@ -520,13 +520,13 @@ extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, i extern "C" void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) { - pow_kernel<<>>(N, ALPHA, X, INCX, Y, INCY); + pow_kernel<<>>(N, ALPHA, X, INCX, Y, INCY); check_error(cudaPeekAtLastError()); } extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) { - axpy_kernel<<>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY); + axpy_kernel<<>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY); check_error(cudaPeekAtLastError()); } @@ -543,7 +543,7 @@ extern "C" void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY) extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) { - copy_kernel<<>>(N, X, OFFX, INCX, Y, OFFY, INCY); + copy_kernel<<>>(N, X, OFFX, INCX, Y, OFFY, INCY); check_error(cudaPeekAtLastError()); } @@ -567,20 +567,20 @@ __global__ void flatten_kernel(int N, float *x, int spatial, int layers, int bat extern "C" void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out) { int size = spatial*batch*layers; - flatten_kernel<<>>(size, x, spatial, layers, batch, forward, out); + flatten_kernel<<>>(size, x, spatial, layers, batch, forward, out); check_error(cudaPeekAtLastError()); } extern "C" void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out) { int size = w*h*c*batch; - reorg_kernel<<>>(size, x, w, h, c, batch, stride, forward, out); + reorg_kernel<<>>(size, x, w, h, c, batch, stride, forward, out); check_error(cudaPeekAtLastError()); } extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask) { - mask_kernel<<>>(N, X, mask_num, mask); + mask_kernel<<>>(N, X, mask_num, mask); check_error(cudaPeekAtLastError()); } @@ -599,7 +599,7 @@ extern "C" void constrain_ongpu(int N, float ALPHA, float * X, int INCX) extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX) { - scal_kernel<<>>(N, ALPHA, X, INCX); + scal_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } @@ -611,7 +611,7 @@ extern "C" void supp_ongpu(int N, float ALPHA, float * X, int INCX) extern "C" void fill_ongpu(int N, float ALPHA, float * X, int INCX) { - fill_kernel<<>>(N, ALPHA, X, INCX); + fill_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } @@ -766,6 +766,6 @@ extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float t { int inputs = n; int batch = groups; - softmax_kernel<<>>(inputs, offset, batch, input, temp, output); + softmax_kernel<<>>(inputs, offset, batch, input, temp, output); check_error(cudaPeekAtLastError()); } diff --git a/src/cuda.c b/src/cuda.c index 1b51271f..f168e4e2 100644 --- a/src/cuda.c +++ b/src/cuda.c @@ -61,6 +61,19 @@ dim3 cuda_gridsize(size_t n){ return d; } +static cudaStream_t streamsArray[16]; // cudaStreamSynchronize( get_cuda_stream() ); +static int streamInit[16] = { 0 }; + +cudaStream_t get_cuda_stream() { + int i = cuda_get_device(); + if (!streamInit[i]) { + cudaStreamCreate(&streamsArray[i]); + streamInit[i] = 1; + } + return streamsArray[i]; +} + + #ifdef CUDNN cudnnHandle_t cudnn_handle() { @@ -70,6 +83,7 @@ cudnnHandle_t cudnn_handle() if(!init[i]) { cudnnCreate(&handle[i]); init[i] = 1; + cudnnStatus_t status = cudnnSetStream(handle[i], get_cuda_stream()); } return handle[i]; } @@ -94,7 +108,8 @@ float *cuda_make_array(float *x, size_t n) cudaError_t status = cudaMalloc((void **)&x_gpu, size); check_error(status); if(x){ - status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); + //status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); + status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyHostToDevice, get_cuda_stream()); check_error(status); } if(!x_gpu) error("Cuda malloc failed\n"); @@ -139,6 +154,7 @@ int *cuda_make_int_array(size_t n) void cuda_free(float *x_gpu) { + //cudaStreamSynchronize(get_cuda_stream()); cudaError_t status = cudaFree(x_gpu); check_error(status); } @@ -146,15 +162,18 @@ void cuda_free(float *x_gpu) void cuda_push_array(float *x_gpu, float *x, size_t n) { size_t size = sizeof(float)*n; - cudaError_t status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); + //cudaError_t status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); + cudaError_t status = cudaMemcpyAsync(x_gpu, x, size, cudaMemcpyHostToDevice, get_cuda_stream()); check_error(status); } void cuda_pull_array(float *x_gpu, float *x, size_t n) { size_t size = sizeof(float)*n; - cudaError_t status = cudaMemcpy(x, x_gpu, size, cudaMemcpyDeviceToHost); + //cudaError_t status = cudaMemcpy(x, x_gpu, size, cudaMemcpyDeviceToHost); + cudaError_t status = cudaMemcpyAsync(x, x_gpu, size, cudaMemcpyDeviceToHost, get_cuda_stream()); check_error(status); + cudaStreamSynchronize(get_cuda_stream()); } #endif diff --git a/src/cuda.h b/src/cuda.h index 32aaabb4..31f9092a 100644 --- a/src/cuda.h +++ b/src/cuda.h @@ -30,6 +30,7 @@ void cuda_free(float *x_gpu); void cuda_random(float *x_gpu, size_t n); float cuda_compare(float *x_gpu, float *x, size_t n, char *s); dim3 cuda_gridsize(size_t n); +cudaStream_t get_cuda_stream(); #ifdef CUDNN cudnnHandle_t cudnn_handle(); diff --git a/src/gemm.c b/src/gemm.c index a4db8a4b..c3154ec9 100644 --- a/src/gemm.c +++ b/src/gemm.c @@ -177,6 +177,7 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, float *C_gpu, int ldc) { cublasHandle_t handle = blas_handle(); + cudaError_t stream_status = cublasSetStream(handle, get_cuda_stream()); cudaError_t status = cublasSgemm(handle, (TB ? CUBLAS_OP_T : CUBLAS_OP_N), (TA ? CUBLAS_OP_T : CUBLAS_OP_N), N, M, K, &ALPHA, B_gpu, ldb, A_gpu, lda, &BETA, C_gpu, ldc); check_error(status); diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu index d42d600b..8a15e504 100644 --- a/src/im2col_kernels.cu +++ b/src/im2col_kernels.cu @@ -54,7 +54,7 @@ void im2col_ongpu(float *im, int width_col = (width + 2 * pad - ksize) / stride + 1; int num_kernels = channels * height_col * width_col; im2col_gpu_kernel<<<(num_kernels+BLOCK-1)/BLOCK, - BLOCK>>>( + BLOCK, 0, get_cuda_stream()>>>( num_kernels, im, height, width, ksize, pad, stride, height_col, width_col, data_col); diff --git a/src/maxpool_layer_kernels.cu b/src/maxpool_layer_kernels.cu index 6381cc1e..d40d3c0b 100644 --- a/src/maxpool_layer_kernels.cu +++ b/src/maxpool_layer_kernels.cu @@ -92,7 +92,7 @@ extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state sta size_t n = h*w*c*layer.batch; - forward_maxpool_layer_kernel<<>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu); + forward_maxpool_layer_kernel<<>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu); check_error(cudaPeekAtLastError()); } diff --git a/src/region_layer.c b/src/region_layer.c index 9095b3ce..0638301f 100644 --- a/src/region_layer.c +++ b/src/region_layer.c @@ -409,6 +409,7 @@ void forward_region_layer_gpu(const region_layer l, network_state state) cuda_pull_array(state.truth, truth_cpu, num_truth); } cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs); + cudaStreamSynchronize(get_cuda_stream()); network_state cpu_state = state; cpu_state.train = state.train; cpu_state.truth = truth_cpu; diff --git a/src/yolo_console_dll.cpp b/src/yolo_console_dll.cpp index ebafe110..16a90496 100644 --- a/src/yolo_console_dll.cpp +++ b/src/yolo_console_dll.cpp @@ -169,8 +169,8 @@ int main(int argc, char *argv[]) //if (x > 10) return; if (result_vec.size() == 0) return; bbox_t i = result_vec[0]; - //cv::Rect r(i.x, i.y, i.w, i.h); - cv::Rect r(i.x + (i.w-31)/2, i.y + (i.h - 31)/2, 31, 31); + cv::Rect r(i.x, i.y, i.w, i.h); + //cv::Rect r(i.x + (i.w-31)/2, i.y + (i.h - 31)/2, 31, 31); cv::Rect img_rect(cv::Point2i(0, 0), src_frame.size()); cv::Rect rect_roi = r & img_rect; if (rect_roi.width < 1 || rect_roi.height < 1) return; @@ -188,16 +188,25 @@ int main(int argc, char *argv[]) // track optical flow if (track_optflow_queue.size() > 0) { + //show_flow = track_optflow_queue.front().clone(); + //draw_boxes(show_flow, result_vec, obj_names, 3, current_det_fps, current_cap_fps); + std::queue new_track_optflow_queue; - std::cout << "\n !!!! all = " << track_optflow_queue.size() << ", cur = " << passed_flow_frames << std::endl; - //draw_boxes(track_optflow_queue.front().clone(), result_vec, obj_names, 3, current_det_fps, current_cap_fps); - //cv::waitKey(10); + //std::cout << "\n !!!! all = " << track_optflow_queue.size() << ", cur = " << passed_flow_frames << std::endl; + if (result_vec.size() > 0) { + draw_boxes(track_optflow_queue.front().clone(), result_vec, obj_names, 3, current_det_fps, current_cap_fps); + std::cout << "\n frame_size = " << track_optflow_queue.size() << std::endl; + cv::waitKey(1000); + } tracker_flow.update_tracking_flow(track_optflow_queue.front()); lambda(show_flow, track_optflow_queue.front(), result_vec); track_optflow_queue.pop(); while(track_optflow_queue.size() > 0) { - //draw_boxes(track_optflow_queue.front().clone(), result_vec, obj_names, 3, current_det_fps, current_cap_fps); - //cv::waitKey(10); + if (result_vec.size() > 0) { + draw_boxes(track_optflow_queue.front().clone(), result_vec, obj_names, 3, current_det_fps, current_cap_fps); + std::cout << "\n frame_size = " << track_optflow_queue.size() << std::endl; + cv::waitKey(1000); + } result_vec = tracker_flow.tracking_flow(track_optflow_queue.front(), result_vec); if (track_optflow_queue.size() <= passed_flow_frames && new_track_optflow_queue.size() == 0) new_track_optflow_queue = track_optflow_queue; @@ -207,10 +216,13 @@ int main(int argc, char *argv[]) track_optflow_queue = new_track_optflow_queue; new_track_optflow_queue.swap(std::queue()); passed_flow_frames = 0; - std::cout << "\n !!!! now = " << track_optflow_queue.size() << ", cur = " << passed_flow_frames << std::endl; + //std::cout << "\n !!!! now = " << track_optflow_queue.size() << ", cur = " << passed_flow_frames << std::endl; cv::imshow("flow", show_flow); cv::waitKey(3); + //if (result_vec.size() > 0) { + // cv::waitKey(1000); + //} } #endif @@ -222,7 +234,8 @@ int main(int argc, char *argv[]) consumed = true; while (current_image.use_count() > 0) { auto result = detector.detect_resized(*current_image, frame_size, 0.24, false); // true - Sleep(500); + //Sleep(200); + Sleep(50); ++fps_det_counter; std::unique_lock lock(mtx); thread_result_vec = result;