From d767e8ca38b73c4096c79277a458e500c22c2a69 Mon Sep 17 00:00:00 2001
From: AlexeyAB
Date: Mon, 4 Feb 2019 23:29:06 +0300
Subject: [PATCH] Minor fixes

---
 include/darknet.h            |  2 +-
 src/activations.h            |  4 ++--
 src/connected_layer.c        | 15 ++++++++-------
 src/convolutional_kernels.cu |  4 +++-
 src/im2col_kernels.cu        | 12 ++++++------
 src/yolo_layer.c             |  1 +
 6 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/include/darknet.h b/include/darknet.h
index 10d374b7..9b0ea02b 100644
--- a/include/darknet.h
+++ b/include/darknet.h
@@ -45,7 +45,7 @@ struct network;
 typedef struct network network;
 
 struct network_state;
-typedef struct network_state;
+typedef struct network_state network_state;
 
 struct layer;
 typedef struct layer layer;
diff --git a/src/activations.h b/src/activations.h
index 442f15a2..849c65d3 100644
--- a/src/activations.h
+++ b/src/activations.h
@@ -76,11 +76,11 @@ static inline float loggy_gradient(float x)
 static inline float stair_gradient(float x)
 {
     if (floor(x) == x) return 0;
-    return 1;
+    return 1.0f;
 }
 static inline float relu_gradient(float x){return (x>0);}
 static inline float elu_gradient(float x){return (x >= 0) + (x < 0)*(x + 1);}
-static inline float selu_gradient(float x) { return (x >= 0)*1.0507 + (x < 0)*(x + 1.0507f*1.6732f); }
+static inline float selu_gradient(float x) { return (x >= 0)*1.0507f + (x < 0)*(x + 1.0507f*1.6732f); }
 static inline float relie_gradient(float x){return (x>0) ? 1 : .01f;}
 static inline float ramp_gradient(float x){return (x>0)+.1f;}
 static inline float leaky_gradient(float x){return (x>0) ? 1 : .1f;}
diff --git a/src/connected_layer.c b/src/connected_layer.c
index 0f4a61a1..66f7c91c 100644
--- a/src/connected_layer.c
+++ b/src/connected_layer.c
@@ -84,7 +84,7 @@ connected_layer make_connected_layer(int batch, int steps, int inputs, int outpu
     l.update = update_connected_layer;
 
     //float scale = 1./sqrt(inputs);
-    float scale = sqrt(2./inputs);
+    float scale = sqrt(2.f/inputs);
     for(i = 0; i < outputs*inputs; ++i){
         l.weights[i] = scale*rand_uniform(-1, 1);
     }
@@ -182,10 +182,10 @@ void forward_connected_layer(connected_layer l, network_state state)
         mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
         variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);
 
-        scal_cpu(l.outputs, .95, l.rolling_mean, 1);
-        axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
-        scal_cpu(l.outputs, .95, l.rolling_variance, 1);
-        axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);
+        scal_cpu(l.outputs, .95f, l.rolling_mean, 1);
+        axpy_cpu(l.outputs, .05f, l.mean, 1, l.rolling_mean, 1);
+        scal_cpu(l.outputs, .95f, l.rolling_variance, 1);
+        axpy_cpu(l.outputs, .05f, l.variance, 1, l.rolling_variance, 1);
 
         copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
         normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
@@ -242,7 +242,7 @@ void denormalize_connected_layer(layer l)
 {
     int i, j;
     for(i = 0; i < l.outputs; ++i){
-        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .000001);
+        float scale = l.scales[i]/sqrt(l.rolling_variance[i] + .000001f);
         for(j = 0; j < l.inputs; ++j){
             l.weights[i*l.inputs + j] *= scale;
         }
@@ -285,6 +285,7 @@ void pull_connected_layer(connected_layer l)
         cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
         cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
     }
+    CHECK_CUDA(cudaPeekAtLastError());
 }
 
 void push_connected_layer(connected_layer l)
@@ -298,6 +299,7 @@ void push_connected_layer(connected_layer l)
         cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.outputs);
         cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.outputs);
     }
+    CHECK_CUDA(cudaPeekAtLastError());
 }
 
 void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate, float momentum, float decay)
@@ -317,7 +319,6 @@ void update_connected_layer_gpu(connected_layer l, int batch, float learning_rat
 
 void forward_connected_layer_gpu(connected_layer l, network_state state)
 {
-    int i;
     fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
 
     int m = l.batch;
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index 2ba2acdd..270e3fe1 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -84,7 +84,7 @@ __global__ void set_zero_kernel(float *src, int size)
 __inline__ __device__
 float warpAllReduceSum(float val) {
     for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2)
-#if CUDA_VERSION >= 9000
+#if CUDART_VERSION >= 9000
         val += __shfl_xor_sync(0xffffffff, val, mask);
 #else
         val += __shfl_xor(val, mask);
@@ -807,6 +807,7 @@ void pull_convolutional_layer(convolutional_layer layer)
         cuda_pull_array_async(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
         cuda_pull_array_async(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
     }
+    CHECK_CUDA(cudaPeekAtLastError());
     cudaStreamSynchronize(get_cuda_stream());
 }
 
@@ -828,6 +829,7 @@ void push_convolutional_layer(convolutional_layer layer)
         cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
         cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
     }
+    CHECK_CUDA(cudaPeekAtLastError());
 }
 
 void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init, float momentum, float decay)
diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu
index 7b347857..4d22beda 100644
--- a/src/im2col_kernels.cu
+++ b/src/im2col_kernels.cu
@@ -17,7 +17,7 @@ extern "C" {
 
 template<typename T1, typename T2>
 __device__ inline T1 __shfl_custom(T1 val, T2 lane) {
-#if CUDA_VERSION >= 9000
+#if CUDART_VERSION >= 9000
     return __shfl_sync(FULL_MASK, val, lane);
 #else
     return __shfl(val, lane);
@@ -26,7 +26,7 @@ __device__ inline T1 __shfl_custom(T1 val, T2 lane) {
 
 template<typename T>
 __device__ inline uint32_t __ballot_custom(T val) {
-#if CUDA_VERSION >= 9000
+#if CUDART_VERSION >= 9000
     return __ballot_sync(FULL_MASK, val);
 #else
     return __ballot(val);
@@ -1223,7 +1223,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
 __inline__ __device__
 int warpAllReduceSum(int val) {
     for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2)
-#if CUDA_VERSION >= 9000
+#if CUDART_VERSION >= 9000
         val += __shfl_xor_sync(FULL_MASK, val, mask);
 #else
         val += __shfl_xor(val, mask);
@@ -1233,7 +1233,7 @@ int warpAllReduceSum(int val) {
 }
 
 // Tensor Cores binary (CC >= 7.3 && CUDA >= 10.0) - __CUDA_SUBBYTE_IMMA__
-#if CUDA_VERSION >= 10000
+#if CUDART_VERSION >= 10000
 #include <mma.h>
 
 #define WMMA_M 8
@@ -1779,7 +1779,7 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
 
     //if (M % 8 == 0 && N % 8 == 0 && M == 128)
     //if (M >= 32)  // l.n >= 32
-#if CUDA_VERSION >= 10000
+#if CUDART_VERSION >= 10000
     if (1)
     {
         const int M_aligned = M + (8 - (M % 8));
@@ -1800,7 +1800,7 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
         //getchar();
     }
     else
-#endif  //# CUDA_VERSION >= 10000
+#endif  //# CUDART_VERSION >= 10000
     {
         gemm_nn_custom_bin_mean_transposed_gpu_kernel << > > (
             M, N, K,
diff --git a/src/yolo_layer.c b/src/yolo_layer.c
index d1fa78cf..05739b4b 100644
--- a/src/yolo_layer.c
+++ b/src/yolo_layer.c
@@ -454,6 +454,7 @@ void forward_yolo_layer_gpu(const layer l, network_state state)
     if(!state.train || l.onlyforward){
         //cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
         cuda_pull_array_async(l.output_gpu, l.output, l.batch*l.outputs);
+        CHECK_CUDA(cudaPeekAtLastError());
         return;
     }
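
Notes on the two recurring patterns above (a reviewer's sketch, not part of the patch):

1) CUDA_VERSION vs. CUDART_VERSION. CUDA_VERSION is defined by the driver
header cuda.h, so in a translation unit that never includes cuda.h a guard
like `#if CUDA_VERSION >= 9000` silently evaluates to false and the legacy
warp intrinsics get compiled. CUDART_VERSION comes from cuda_runtime_api.h,
which nvcc includes implicitly in every .cu file, so it reliably selects the
`*_sync` intrinsics that CUDA 9 requires. A minimal sketch of the guarded
warp reduction, assuming WARP_SIZE and FULL_MASK are defined as in darknet:

#include <cuda_runtime_api.h>

#define WARP_SIZE 32
#define FULL_MASK 0xffffffff

// Butterfly reduction: after log2(WARP_SIZE) xor-shuffles, every lane
// in the warp holds the sum of all 32 lanes' values.
__inline__ __device__ float warp_all_reduce_sum(float val)
{
    for (int mask = WARP_SIZE / 2; mask > 0; mask /= 2)
#if CUDART_VERSION >= 9000
        // CUDA >= 9: intrinsics take an explicit lane-participation mask.
        val += __shfl_xor_sync(FULL_MASK, val, mask);
#else
        // CUDA < 9: legacy intrinsic, whole warp participates implicitly.
        val += __shfl_xor(val, mask);
#endif
    return val;
}

2) CHECK_CUDA(cudaPeekAtLastError()). Kernel launches and async copies
report failures only through the runtime's sticky error state; checking
cudaPeekAtLastError() at the end of each pull/push path surfaces those
errors close to their origin without clearing the state (cudaGetLastError()
would reset it). CHECK_CUDA is darknet's own macro; a hypothetical
stand-alone equivalent might look like:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime_api.h>

// Abort with file/line context if a CUDA call failed -- or, when passed
// cudaPeekAtLastError(), if an earlier async launch or copy failed.
#define CHECK_CUDA(call)                                              \
    do {                                                              \
        cudaError_t err_ = (call);                                    \
        if (err_ != cudaSuccess) {                                    \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",              \
                    cudaGetErrorString(err_), __FILE__, __LINE__);    \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

The remaining hunks are small correctness fixes: `typedef struct
network_state;` has no declarator and therefore introduces no typedef name,
so the alias network_state used throughout the code was never declared; and
the added f suffixes (1.0507f, .95f, .000001f, 2.f) keep these float
expressions from being silently promoted to double-precision arithmetic.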