From 91f95c715bff84094fc18bad6a8f938291b9b0f5 Mon Sep 17 00:00:00 2001
From: Joseph Redmon
Date: Mon, 24 Oct 2016 13:32:49 -0700
Subject: [PATCH] tree things, tree stuff

---
 Makefile               |  2 +-
 src/blas.h             |  2 +-
 src/blas_kernels.cu    | 22 +++++++++++++---------
 src/network_kernels.cu |  4 ++++
 src/softmax_layer.c    | 35 +++++++----------------------------
 5 files changed, 26 insertions(+), 39 deletions(-)

diff --git a/Makefile b/Makefile
index ca358bfd..37b92c18 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ EXEC=darknet
 OBJDIR=./obj/
 
 CC=gcc
-NVCC=nvcc
+NVCC=nvcc 
 OPTS=-Ofast
 LDFLAGS= -lm -pthread
 COMMON=
diff --git a/src/blas.h b/src/blas.h
index 6b6b8f5b..daacf9a7 100644
--- a/src/blas.h
+++ b/src/blas.h
@@ -77,7 +77,7 @@ void mult_add_into_gpu(int num, float *a, float *b, float *c);
 
 void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
 
-void softmax_gpu(float *input, int n, int groups, float temp, float *output, cudaStream_t stream);
+void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output);
 
 #endif
 #endif
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index 59ec0057..b4d520e1 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -693,31 +693,35 @@ extern "C" void mult_add_into_gpu(int num, float *a, float *b, float *c)
 }
 
-__global__ void softmax_kernel(int n, int batch, float *input, float temp, float *output)
+__device__ void softmax_device(int n, float *input, float temp, float *output)
 {
-    int b = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(b >= batch) return;
-
     int i;
     float sum = 0;
     float largest = -INFINITY;
     for(i = 0; i < n; ++i){
-        int val = input[i+b*n];
+        int val = input[i];
         largest = (val>largest) ? val : largest;
     }
     for(i = 0; i < n; ++i){
-        sum += exp(input[i+b*n]/temp-largest/temp);
+        sum += exp(input[i]/temp-largest/temp);
     }
     sum = (sum != 0) ? largest/temp+log(sum) : largest-100;
     for(i = 0; i < n; ++i){
-        output[i+b*n] = exp(input[i+b*n]/temp-sum);
+        output[i] = exp(input[i]/temp-sum);
     }
 }
 
-extern "C" void softmax_gpu(float *input, int n, int groups, float temp, float *output, cudaStream_t stream)
+__global__ void softmax_kernel(int n, int offset, int batch, float *input, float temp, float *output)
+{
+    int b = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(b >= batch) return;
+    softmax_device(n, input + b*offset, temp, output + b*offset);
+}
+
+extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output)
 {
     int inputs = n;
     int batch = groups;
-    softmax_kernel<<<cuda_gridsize(batch), BLOCK, 0, stream>>>(inputs, batch, input, temp, output);
+    softmax_kernel<<<cuda_gridsize(batch), BLOCK>>>(inputs, offset, batch, input, temp, output);
     check_error(cudaPeekAtLastError());
 }
 
diff --git a/src/network_kernels.cu b/src/network_kernels.cu
index e3190680..9c431cf7 100644
--- a/src/network_kernels.cu
+++ b/src/network_kernels.cu
@@ -134,6 +134,7 @@ void *train_thread(void *ptr)
     free(ptr);
     cuda_set_device(args.net.gpu_index);
     *args.err = train_network(args.net, args.d);
+    printf("%d\n", args.net.gpu_index);
     return 0;
 }
 
@@ -359,11 +360,14 @@ float train_networks(network *nets, int n, data d, int interval)
         //printf("%f\n", errors[i]);
         sum += errors[i];
     }
+    //cudaDeviceSynchronize();
     if (get_current_batch(nets[0]) % interval == 0) {
         printf("Syncing... ");
"); + fflush(stdout); sync_nets(nets, n, interval); printf("Done!\n"); } + //cudaDeviceSynchronize(); free(threads); free(errors); return (float)sum/(n); diff --git a/src/softmax_layer.c b/src/softmax_layer.c index 2a34caea..31f3e03f 100644 --- a/src/softmax_layer.c +++ b/src/softmax_layer.c @@ -73,37 +73,16 @@ void forward_softmax_layer_gpu(const softmax_layer l, network_state state) { int inputs = l.inputs / l.groups; int batch = l.batch * l.groups; - int b; if(l.softmax_tree){ - if(0){ - float *buff = calloc(inputs * batch, sizeof(float)); - cuda_pull_array(state.input, buff, batch * inputs); - state.input = buff; - forward_softmax_layer(l, state); - cuda_push_array(l.output_gpu, l.output, batch*inputs); - free(buff); - } else { - int i; - const int nstreams = 32; - cudaStream_t streams[nstreams]; - for (i = 0; i < nstreams; ++i) { - cudaStreamCreate(&streams[i]); - } - for (b = 0; b < batch; ++b) { - int i; - int count = 0; - for (i = 0; i < l.softmax_tree->groups; ++i) { - int group_size = l.softmax_tree->group_size[i]; - softmax_gpu(state.input+b*inputs + count, group_size, 1, l.temperature, l.output_gpu+b*inputs + count, streams[(b*l.softmax_tree->groups + i) % nstreams]); - count += group_size; - } - } - for(i = 0; i < nstreams; ++i){ - cudaStreamDestroy(streams[i]); - } + int i; + int count = 0; + for (i = 0; i < l.softmax_tree->groups; ++i) { + int group_size = l.softmax_tree->group_size[i]; + softmax_gpu(state.input+count, group_size, inputs, batch, l.temperature, l.output_gpu + count); + count += group_size; } } else { - softmax_gpu(state.input, inputs, batch, l.temperature, l.output_gpu, 0); + softmax_gpu(state.input, inputs, inputs, batch, l.temperature, l.output_gpu); } }