diff --git a/src/blas.h b/src/blas.h
index be7da00b..5a50db5b 100644
--- a/src/blas.h
+++ b/src/blas.h
@@ -36,14 +36,12 @@ void mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
 void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
 void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial);
 
-void mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
-void variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
 void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta);
 
-void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *spatial_mean_delta, float *mean_delta);
-void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *spatial_variance_delta, float *variance_delta);
+void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta);
+void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta);
-void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *spatial_variance, float *variance);
-void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *spatial_mean, float *mean);
+void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
+void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
 
 #endif
 #endif
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index 4da31d14..17955e46 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -48,28 +48,6 @@ __global__ void variance_delta_kernel(float *x, float *delta, float *mean, floa
     variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.));
 }
 
-__global__ void spatial_variance_delta_kernel(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *spatial_variance_delta)
-{
-    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if (i >= batch*filters) return;
-    int f = i%filters;
-    int b = i/filters;
-
-    int k;
-    spatial_variance_delta[i] = 0;
-    for (k = 0; k < spatial; ++k) {
-        int index = b*filters*spatial + f*spatial + k;
-        spatial_variance_delta[i] += delta[index]*(x[index] - mean[f]);
-    }
-    spatial_variance_delta[i] *= -.5 * pow(variance[f] + .00001f, (float)(-3./2.));
-}
-
-extern "C" void variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
-{
-    variance_delta_kernel<<<cuda_gridsize(filters), BLOCK>>>(x, delta, mean, variance, batch, filters, spatial, variance_delta);
-    check_error(cudaPeekAtLastError());
-}
-
 __global__ void accumulate_kernel(float *x, int n, int groups, float *sum)
 {
     int k;
@@ -81,38 +59,62 @@ __global__ void accumulate_kernel(float *x, int n, int groups, float *sum)
     }
 }
 
-extern "C" void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *spatial_variance_delta, float *variance_delta)
+__global__ void fast_mean_delta_kernel(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
 {
-    spatial_variance_delta_kernel<<<cuda_gridsize(batch*filters), BLOCK>>>(x, delta, mean, variance, batch, filters, spatial, spatial_variance_delta);
-    check_error(cudaPeekAtLastError());
-    accumulate_kernel<<<cuda_gridsize(filters), BLOCK>>>(spatial_variance_delta, batch, filters, variance_delta);
-    check_error(cudaPeekAtLastError());
-}
+    const int threads = BLOCK;
+    __shared__ float local[threads];
 
-__global__ void spatial_mean_delta_kernel(float *delta, float *variance, int batch, int filters, int spatial, float *spatial_mean_delta)
-{
-    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if (i >= batch*filters) return;
-    int f = i%filters;
-    int b = i/filters;
+    int id = threadIdx.x;
+    local[id] = 0;
 
-    int k;
-    spatial_mean_delta[i] = 0;
-    for (k = 0; k < spatial; ++k) {
-        int index = b*filters*spatial + f*spatial + k;
-        spatial_mean_delta[i] += delta[index];
+    int filter = blockIdx.x;
+
+    int i, j;
+    for(j = 0; j < batch; ++j){
+        for(i = 0; i < spatial; i += threads){
+            int index = j*spatial*filters + filter*spatial + i + id;
+            local[id] += (i+id < spatial) ? delta[index] : 0;
+        }
+    }
+
+    if(id == 0){
+        mean_delta[filter] = 0;
+        for(i = 0; i < threads; ++i){
+            mean_delta[filter] += local[i];
+        }
+        mean_delta[filter] *= (-1./sqrt(variance[filter] + .00001f));
     }
-    spatial_mean_delta[i] *= (-1./sqrt(variance[f] + .00001f));
 }
 
-extern "C" void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *spatial_mean_delta, float *mean_delta)
+__global__ void fast_variance_delta_kernel(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
 {
-    spatial_mean_delta_kernel<<<cuda_gridsize(batch*filters), BLOCK>>>(delta, variance, batch, filters, spatial, spatial_mean_delta);
-    check_error(cudaPeekAtLastError());
-    accumulate_kernel<<<cuda_gridsize(filters), BLOCK>>>(spatial_mean_delta, batch, filters, mean_delta);
-    check_error(cudaPeekAtLastError());
+    const int threads = BLOCK;
+    __shared__ float local[threads];
+
+    int id = threadIdx.x;
+    local[id] = 0;
+
+    int filter = blockIdx.x;
+
+    int i, j;
+    for(j = 0; j < batch; ++j){
+        for(i = 0; i < spatial; i += threads){
+            int index = j*spatial*filters + filter*spatial + i + id;
+
+            local[id] += (i+id < spatial) ? delta[index]*(x[index] - mean[filter]) : 0;
+        }
+    }
+
+    if(id == 0){
+        variance_delta[filter] = 0;
+        for(i = 0; i < threads; ++i){
+            variance_delta[filter] += local[i];
+        }
+        variance_delta[filter] *= -.5 * pow(variance[filter] + .00001f, (float)(-3./2.));
+    }
 }
+
 __global__ void mean_delta_kernel(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -134,6 +136,18 @@ extern "C" void mean_delta_gpu(float *delta, float *variance, int batch, int fil
     check_error(cudaPeekAtLastError());
 }
 
+extern "C" void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
+{
+    fast_mean_delta_kernel<<<filters, BLOCK>>>(delta, variance, batch, filters, spatial, mean_delta);
+    check_error(cudaPeekAtLastError());
+}
+
+extern "C" void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta)
+{
+    fast_variance_delta_kernel<<<filters, BLOCK>>>(x, delta, mean, variance, batch, filters, spatial, variance_delta);
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void mean_kernel(float *x, int batch, int filters, int spatial, float *mean)
 {
     float scale = 1./(batch * spatial);
@@ -150,23 +164,6 @@ __global__ void mean_kernel(float *x, int batch, int filters, int spatial, floa
     mean[i] *= scale;
 }
 
-__global__ void spatial_variance_kernel(float *x, float *mean, int batch, int filters, int spatial, float *variance)
-{
-    float scale = 1./(spatial*batch-1);
-    int k;
-    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if (i >= batch*filters) return;
-    int f = i%filters;
-    int b = i/filters;
-
-    variance[i] = 0;
-    for(k = 0; k < spatial; ++k){
-        int index = b*filters*spatial + f*spatial + k;
-        variance[i] += pow((x[index] - mean[f]), 2);
-    }
-    variance[i] *= scale;
-}
-
 __global__ void variance_kernel(float *x, float *mean, int batch, int filters, int spatial, float *variance)
 {
     float scale = 1./(batch * spatial);
@@ -238,28 +235,80 @@ extern "C" void normalize_gpu(float *x, float *mean, float *variance, int batch,
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void fast_mean_kernel(float *x, int batch, int filters, int spatial, float *mean)
+{
+    const int threads = BLOCK;
+    __shared__ float local[threads];
+
+    int id = threadIdx.x;
+    local[id] = 0;
+
+    int filter = blockIdx.x;
+
+    int i, j;
+    for(j = 0; j < batch; ++j){
+        for(i = 0; i < spatial; i += threads){
+            int index = j*spatial*filters + filter*spatial + i + id;
+            local[id] += (i+id < spatial) ? x[index] : 0;
+        }
+    }
+
+    if(id == 0){
+        mean[filter] = 0;
+        for(i = 0; i < threads; ++i){
+            mean[filter] += local[i];
+        }
+        mean[filter] /= spatial * batch;
+    }
+}
+
+__global__ void fast_variance_kernel(float *x, float *mean, int batch, int filters, int spatial, float *variance)
+{
+    const int threads = BLOCK;
+    __shared__ float local[threads];
+
+    int id = threadIdx.x;
+    local[id] = 0;
+
+    int filter = blockIdx.x;
+
+    int i, j;
+    for(j = 0; j < batch; ++j){
+        for(i = 0; i < spatial; i += threads){
+            int index = j*spatial*filters + filter*spatial + i + id;
+
+            local[id] += (i+id < spatial) ? pow((x[index] - mean[filter]), 2) : 0;
+        }
+    }
+
+    if(id == 0){
+        variance[filter] = 0;
+        for(i = 0; i < threads; ++i){
+            variance[filter] += local[i];
+        }
+        variance[filter] /= spatial * batch;
+    }
+}
+
+extern "C" void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean)
+{
+    fast_mean_kernel<<<filters, BLOCK>>>(x, batch, filters, spatial, mean);
+    check_error(cudaPeekAtLastError());
+}
+
+extern "C" void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
+{
+    fast_variance_kernel<<<filters, BLOCK>>>(x, mean, batch, filters, spatial, variance);
+    check_error(cudaPeekAtLastError());
+}
+
+
 extern "C" void mean_gpu(float *x, int batch, int filters, int spatial, float *mean)
 {
     mean_kernel<<<cuda_gridsize(filters), BLOCK>>>(x, batch, filters, spatial, mean);
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *spatial_mean, float *mean)
-{
-    mean_kernel<<<cuda_gridsize(filters*batch), BLOCK>>>(x, 1, filters*batch, spatial, spatial_mean);
-    check_error(cudaPeekAtLastError());
-    mean_kernel<<<cuda_gridsize(filters), BLOCK>>>(spatial_mean, batch, filters, 1, mean);
-    check_error(cudaPeekAtLastError());
-}
-
-extern "C" void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *spatial_variance, float *variance)
-{
-    spatial_variance_kernel<<<cuda_gridsize(batch*filters), BLOCK>>>(x, mean, batch, filters, spatial, spatial_variance);
-    check_error(cudaPeekAtLastError());
-    accumulate_kernel<<<cuda_gridsize(filters), BLOCK>>>(spatial_variance, batch, filters, variance);
-    check_error(cudaPeekAtLastError());
-}
-
 extern "C" void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance)
 {
     variance_kernel<<<cuda_gridsize(filters), BLOCK>>>(x, mean, batch, filters, spatial, variance);
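Note on the new fast_* kernels above: each launch uses one thread block per filter; the BLOCK threads of that block stride over the filter's batch*spatial elements, accumulate partial sums in shared memory, and thread 0 then folds the BLOCK partials into a single per-filter value. This replaces the old two-pass scheme (a per-(batch, filter) kernel followed by accumulate_kernel) and the n*batch-sized spatial_* scratch buffers it needed. A minimal sketch of the same pattern for the mean, assuming darknet's BLOCK constant from cuda.h; the __syncthreads() barrier before the final fold is an extra safeguard added for illustration and is not part of the patch:

    // Sketch only: one block per filter, BLOCK threads per block, launched as <<<filters, BLOCK>>>.
    __global__ void per_filter_mean_sketch(float *x, int batch, int filters, int spatial, float *mean)
    {
        __shared__ float local[BLOCK];
        int id = threadIdx.x;
        int filter = blockIdx.x;
        local[id] = 0;

        // Each thread strides over the filter's batch*spatial values.
        for(int j = 0; j < batch; ++j){
            for(int i = 0; i < spatial; i += BLOCK){
                int index = j*spatial*filters + filter*spatial + i + id;
                local[id] += (i + id < spatial) ? x[index] : 0;
            }
        }
        __syncthreads();    // make every warp's partial sum visible to thread 0

        if(id == 0){
            float sum = 0;
            for(int i = 0; i < BLOCK; ++i) sum += local[i];
            mean[filter] = sum / (spatial * batch);
        }
    }
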
diff --git a/src/classifier.c b/src/classifier.c
index e2439659..c0006e67 100644
--- a/src/classifier.c
+++ b/src/classifier.c
@@ -98,6 +98,11 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile)
             sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
             save_weights(net, buff);
         }
+        if(*net.seen%1000 == 0){
+            char buff[256];
+            sprintf(buff, "%s/%s.backup",backup_directory,base);
+            save_weights(net, buff);
+        }
     }
     char buff[256];
     sprintf(buff, "%s/%s.weights", backup_directory, base);
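On the periodic backup added above: *net.seen counts images and advances by however many images each training iteration consumes, and the check runs once per iteration, so `*net.seen%1000 == 0` only fires when an iteration lands exactly on a multiple of 1000. A hypothetical step-size-tolerant variant (imgs being whatever the loop adds to *net.seen each iteration) is sketched here for comparison; it is not part of the patch:

    /* Fires once whenever a multiple of 1000 images is crossed,
     * regardless of how many images each iteration consumes. */
    int crossed_backup_point(int seen, int imgs)
    {
        return seen/1000 != (seen - imgs)/1000;
    }
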
diff --git a/src/coco.c b/src/coco.c
index b532d623..41c2d80c 100644
--- a/src/coco.c
+++ b/src/coco.c
@@ -20,7 +20,8 @@ image coco_labels[80];
 void train_coco(char *cfgfile, char *weightfile)
 {
     //char *train_images = "/home/pjreddie/data/voc/test/train.txt";
-    char *train_images = "/home/pjreddie/data/coco/train.txt";
+    //char *train_images = "/home/pjreddie/data/coco/train.txt";
+    char *train_images = "data/coco.trainval.txt";
     char *backup_directory = "/home/pjreddie/backup/";
     srand(time(0));
     data_seed = time(0);
diff --git a/src/connected_layer.c b/src/connected_layer.c
index 43235057..640e8b8b 100644
--- a/src/connected_layer.c
+++ b/src/connected_layer.c
@@ -148,6 +148,12 @@ void forward_connected_layer_gpu(connected_layer l, network_state state)
     float * c = l.output_gpu;
     gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
     activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+
+/*
+    cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
+    float avg = mean_array(l.output, l.outputs*l.batch);
+    printf("%f\n", avg);
+    */
 }
 
 void backward_connected_layer_gpu(connected_layer l, network_state state)
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index 5f24ca5e..130824af 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -119,17 +119,14 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 
     if(l.batch_normalize){
         if(state.train){
-            fast_mean_gpu(l.output_gpu, l.batch, l.n, l.out_h*l.out_w, l.spatial_mean_gpu, l.mean_gpu);
-            fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.n, l.out_h*l.out_w, l.spatial_variance_gpu, l.variance_gpu);
+            fast_mean_gpu(l.output_gpu, l.batch, l.n, l.out_h*l.out_w, l.mean_gpu);
+            fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.n, l.out_h*l.out_w, l.variance_gpu);
 
             scal_ongpu(l.n, .95, l.rolling_mean_gpu, 1);
             axpy_ongpu(l.n, .05, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
             scal_ongpu(l.n, .95, l.rolling_variance_gpu, 1);
             axpy_ongpu(l.n, .05, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
 
-            // cuda_pull_array(l.variance_gpu, l.mean, l.n);
-            // printf("%f\n", l.mean[0]);
-
             copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
             normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.n, l.out_h*l.out_w);
             copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
@@ -161,8 +158,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
 
         scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.n, l.out_h*l.out_w);
 
-        fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.n, l.out_w*l.out_h, l.spatial_mean_delta_gpu, l.mean_delta_gpu);
-        fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.n, l.out_w*l.out_h, l.spatial_variance_delta_gpu, l.variance_delta_gpu);
+        fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.n, l.out_w*l.out_h, l.mean_delta_gpu);
+        fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.n, l.out_w*l.out_h, l.variance_delta_gpu);
 
         normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.n, l.out_w*l.out_h, l.delta_gpu);
     }
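The scal_ongpu/axpy_ongpu pair in the forward pass keeps the running batch-norm statistics as an exponential moving average: rolling = 0.95*rolling + 0.05*batch_stat, applied per filter to both the mean and the variance. A CPU-side sketch of the same update, illustrative only (the layer performs it on the GPU):

    /* Exponential moving average of per-filter statistics with momentum 0.95,
     * equivalent to scal_ongpu(n, .95, rolling, 1) followed by
     * axpy_ongpu(n, .05, batch_stat, 1, rolling, 1). */
    void update_rolling_stats(float *rolling, const float *batch_stat, int n, float momentum)
    {
        int i;
        for(i = 0; i < n; ++i){
            rolling[i] = momentum*rolling[i] + (1 - momentum)*batch_stat[i];
        }
    }
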
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index b9fd3c95..ec571a69 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -86,9 +86,8 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
     }
 
         l.mean = calloc(n, sizeof(float));
-        l.spatial_mean = calloc(n*l.batch, sizeof(float));
-
         l.variance = calloc(n, sizeof(float));
+
         l.rolling_mean = calloc(n, sizeof(float));
         l.rolling_variance = calloc(n, sizeof(float));
     }
@@ -114,12 +113,6 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
         l.rolling_mean_gpu = cuda_make_array(l.mean, n);
         l.rolling_variance_gpu = cuda_make_array(l.variance, n);
 
-        l.spatial_mean_gpu = cuda_make_array(l.spatial_mean, n*l.batch);
-        l.spatial_variance_gpu = cuda_make_array(l.spatial_mean, n*l.batch);
-
-        l.spatial_mean_delta_gpu = cuda_make_array(l.spatial_mean, n*l.batch);
-        l.spatial_variance_delta_gpu = cuda_make_array(l.spatial_mean, n*l.batch);
-
         l.mean_delta_gpu = cuda_make_array(l.mean, n);
         l.variance_delta_gpu = cuda_make_array(l.variance, n);
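With the fused kernels reducing straight into per-filter outputs, the per-layer spatial_* scratch storage (one host buffer plus four device buffers of n*batch floats each) is no longer needed, which is what the removed allocations above reflect. Note also that fast_variance_kernel divides by spatial*batch, i.e. the biased 1/N variance, whereas the removed spatial_variance_kernel path used 1/(spatial*batch-1). A host-side reference one could use to sanity-check fast_mean_gpu/fast_variance_gpu after pulling the layer output back with cuda_pull_array; this is a sketch for verification, not part of the patch:

    /* Per-filter mean and biased (1/N) variance over batch*spatial elements,
     * matching what fast_mean_kernel and fast_variance_kernel compute. */
    void reference_mean_variance(const float *x, int batch, int filters, int spatial,
                                 float *mean, float *variance)
    {
        int f, b, k;
        for(f = 0; f < filters; ++f){
            float m = 0, v = 0;
            for(b = 0; b < batch; ++b){
                for(k = 0; k < spatial; ++k){
                    m += x[(b*filters + f)*spatial + k];
                }
            }
            m /= (float)(batch*spatial);
            for(b = 0; b < batch; ++b){
                for(k = 0; k < spatial; ++k){
                    float d = x[(b*filters + f)*spatial + k] - m;
                    v += d*d;
                }
            }
            mean[f] = m;
            variance[f] = v/(float)(batch*spatial);
        }
    }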