diff --git a/Makefile b/Makefile
index aab1d08c..a0393b53 100644
--- a/Makefile
+++ b/Makefile
@@ -13,14 +13,15 @@ ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \
 # ARCH= -gencode arch=compute_52,code=compute_52
 
 VPATH=./src/:./examples
-LIB=libdarknet.so
+SLIB=libdarknet.so
+ALIB=libdarknet.a
 EXEC=darknet
 OBJDIR=./obj/
 
 CC=gcc
 NVCC=nvcc --compiler-options '-fPIC'
 AR=ar
-ARFLAGS=-rv
+ARFLAGS=rcs
 OPTS=-Ofast
 LDFLAGS= -lm -pthread
 COMMON= -Iinclude/ -Isrc/
@@ -62,13 +63,16 @@ EXECOBJ = $(addprefix $(OBJDIR), $(EXECOBJA))
 OBJS = $(addprefix $(OBJDIR), $(OBJ))
 DEPS = $(wildcard src/*.h) Makefile include/darknet.h
 
-all: obj backup results $(LIB) $(EXEC)
+all: obj backup results $(SLIB) $(ALIB) $(EXEC)
 
-$(EXEC): $(EXECOBJ) $(LIB)
-	$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(LIB)
+$(EXEC): $(EXECOBJ) $(ALIB)
+	$(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) $(ALIB)
 
-$(LIB): $(OBJS)
+$(ALIB): $(OBJS)
+	$(AR) $(ARFLAGS) $@ $^
+
+$(SLIB): $(OBJS)
 	$(CC) $(CFLAGS) -shared $^ -o $@ $(LDFLAGS)
 
 $(OBJDIR)%.o: %.c $(DEPS)
 	$(CC) $(COMMON) $(CFLAGS) -c $< -o $@
@@ -87,5 +91,5 @@ results:
 
 .PHONY: clean
 
 clean:
-	rm -rf $(OBJS) $(LIB) $(EXEC) $(EXECOBJ)
+	rm -rf $(OBJS) $(SLIB) $(ALIB) $(EXEC) $(EXECOBJ)
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index 9f1337ca..2ce60153 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -53,21 +53,24 @@ void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size,
     backward_scale_kernel<<<n, BLOCK>>>(x_norm, delta, batch, n, size, scale_updates);
     check_error(cudaPeekAtLastError());
 }
 
-__global__ void add_bias_kernel(float *output, float *biases, int n, int size)
+__global__ void add_bias_kernel(float *output, float *biases, int batch, int n, int size)
 {
-    int offset = blockIdx.x * blockDim.x + threadIdx.x;
-    int filter = blockIdx.y;
-    int batch = blockIdx.z;
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (index >= n*size*batch) return;
+    int i = index % size;
+    index /= size;
+    int j = index % n;
+    index /= n;
+    int k = index;
 
-    if(offset < size) output[(batch*n+filter)*size + offset] += biases[filter];
+    output[(k*n+j)*size + i] += biases[j];
 }
 
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size)
 {
-    dim3 dimGrid((size-1)/BLOCK + 1, n, batch);
-    dim3 dimBlock(BLOCK, 1, 1);
+    int num = n*size*batch;
 
-    add_bias_kernel<<<dimGrid, dimBlock>>>(output, biases, n, size);
+    add_bias_kernel<<<cuda_gridsize(num), BLOCK>>>(output, biases, batch, n, size);
     check_error(cudaPeekAtLastError());
 }
diff --git a/src/cuda.c b/src/cuda.c
index 48199743..e5114037 100644
--- a/src/cuda.c
+++ b/src/cuda.c
@@ -26,7 +26,7 @@ int cuda_get_device()
 
 void check_error(cudaError_t status)
 {
-    cudaDeviceSynchronize();
+    //cudaDeviceSynchronize();
     cudaError_t status2 = cudaGetLastError();
     if (status != cudaSuccess)
     {
diff --git a/src/gru_layer.c b/src/gru_layer.c
index e07cb877..75497ae4 100644
--- a/src/gru_layer.c
+++ b/src/gru_layer.c
@@ -68,15 +68,6 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
     *(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
     l.wh->batch = batch;
 
-#ifdef CUDNN
-    cudnnSetTensor4dDescriptor(l.uz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uz->out_c, l.uz->out_h, l.uz->out_w);
-    cudnnSetTensor4dDescriptor(l.uh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uh->out_c, l.uh->out_h, l.uh->out_w);
-    cudnnSetTensor4dDescriptor(l.ur->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ur->out_c, l.ur->out_h, l.ur->out_w);
-    cudnnSetTensor4dDescriptor(l.wz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wz->out_c, l.wz->out_h, l.wz->out_w);
-    cudnnSetTensor4dDescriptor(l.wh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wh->out_c, l.wh->out_h, l.wh->out_w);
-    cudnnSetTensor4dDescriptor(l.wr->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wr->out_c, l.wr->out_h, l.wr->out_w);
-#endif
-
     l.batch_normalize = batch_normalize;
 
 
@@ -110,6 +101,15 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
     l.r_gpu = cuda_make_array(0, batch*outputs);
     l.z_gpu = cuda_make_array(0, batch*outputs);
     l.h_gpu = cuda_make_array(0, batch*outputs);
+
+#ifdef CUDNN
+    cudnnSetTensor4dDescriptor(l.uz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uz->out_c, l.uz->out_h, l.uz->out_w);
+    cudnnSetTensor4dDescriptor(l.uh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uh->out_c, l.uh->out_h, l.uh->out_w);
+    cudnnSetTensor4dDescriptor(l.ur->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ur->out_c, l.ur->out_h, l.ur->out_w);
+    cudnnSetTensor4dDescriptor(l.wz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wz->out_c, l.wz->out_h, l.wz->out_w);
+    cudnnSetTensor4dDescriptor(l.wh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wh->out_c, l.wh->out_h, l.wh->out_w);
+    cudnnSetTensor4dDescriptor(l.wr->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wr->out_c, l.wr->out_h, l.wr->out_w);
+#endif
 #endif
 
     return l;
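
--
A quick host-side check of the new add_bias_kernel indexing (not part of the
patch; file and function names below are illustrative). The kernel now assigns
one thread per output element and decodes the flat index into i (spatial
offset), j (filter) and k (batch item); the launch leans on darknet's existing
cuda_gridsize() helper and BLOCK constant to cover n*size*batch threads. This
sketch replays both the removed 3D-grid indexing and the new flat indexing on
the CPU and asserts they update the same elements:

#include <assert.h>
#include <stdio.h>

/* New scheme: one iteration per output element, flat index decoded exactly
 * as in the updated add_bias_kernel. */
static void add_bias_flat(float *output, const float *biases,
                          int batch, int n, int size)
{
    for (int index = 0; index < n*size*batch; ++index) {
        int rem = index;
        int i = rem % size; rem /= size;  /* spatial offset      */
        int j = rem % n;    rem /= n;     /* filter / bias index */
        int k = rem;                      /* batch item          */
        output[(k*n + j)*size + i] += biases[j];
    }
}

/* Old scheme: mirrors the removed (offset, filter, batch) 3D launch. */
static void add_bias_3d(float *output, const float *biases,
                        int batch, int n, int size)
{
    for (int b = 0; b < batch; ++b)
        for (int filter = 0; filter < n; ++filter)
            for (int offset = 0; offset < size; ++offset)
                output[(b*n + filter)*size + offset] += biases[filter];
}

int main(void)
{
    enum { BATCH = 2, N = 3, SIZE = 5 };
    float a[BATCH*N*SIZE] = {0}, b[BATCH*N*SIZE] = {0};
    float biases[N] = {0.5f, -1.0f, 2.0f};

    add_bias_flat(a, biases, BATCH, N, SIZE);
    add_bias_3d(b, biases, BATCH, N, SIZE);
    for (int i = 0; i < BATCH*N*SIZE; ++i) assert(a[i] == b[i]);
    puts("flat indexing matches the old 3D-grid indexing");
    return 0;
}

Compile and run with, e.g., cc add_bias_check.c -o add_bias_check && ./add_bias_check.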