diff --git a/cfg/coco.data b/cfg/coco.data index 30038417..610151dc 100644 --- a/cfg/coco.data +++ b/cfg/coco.data @@ -1,7 +1,7 @@ classes= 80 train = /home/pjreddie/data/coco/trainvalno5k.txt -valid = coco_testdev -#valid = data/coco_val_5k.list +#valid = coco_testdev +valid = data/coco_val_5k.list names = data/coco.names backup = /home/pjreddie/backup/ eval=coco diff --git a/cfg/yolo.cfg b/cfg/yolo.cfg index 7001dfa5..c530f915 100644 --- a/cfg/yolo.cfg +++ b/cfg/yolo.cfg @@ -5,8 +5,8 @@ subdivisions=1 # Training # batch=64 # subdivisions=8 -height=416 -width=416 +height=608 +width=608 channels=3 momentum=0.9 decay=0.0005 diff --git a/src/activation_layer.c b/src/activation_layer.c index 3430dac4..de4e4b7d 100644 --- a/src/activation_layer.c +++ b/src/activation_layer.c @@ -35,29 +35,29 @@ layer make_activation_layer(int batch, int inputs, ACTIVATION activation) return l; } -void forward_activation_layer(layer l, network_state state) +void forward_activation_layer(layer l, network net) { - copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1); + copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1); activate_array(l.output, l.outputs*l.batch, l.activation); } -void backward_activation_layer(layer l, network_state state) +void backward_activation_layer(layer l, network net) { gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); - copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1); + copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1); } #ifdef GPU -void forward_activation_layer_gpu(layer l, network_state state) +void forward_activation_layer_gpu(layer l, network net) { - copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1); + copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); } -void backward_activation_layer_gpu(layer l, network_state state) +void backward_activation_layer_gpu(layer l, network net) { gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); - copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1); + copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/activation_layer.h b/src/activation_layer.h index a09756aa..42118a84 100644 --- a/src/activation_layer.h +++ b/src/activation_layer.h @@ -7,12 +7,12 @@ layer make_activation_layer(int batch, int inputs, ACTIVATION activation); -void forward_activation_layer(layer l, network_state state); -void backward_activation_layer(layer l, network_state state); +void forward_activation_layer(layer l, network net); +void backward_activation_layer(layer l, network net); #ifdef GPU -void forward_activation_layer_gpu(layer l, network_state state); -void backward_activation_layer_gpu(layer l, network_state state); +void forward_activation_layer_gpu(layer l, network net); +void backward_activation_layer_gpu(layer l, network net); #endif #endif diff --git a/src/avgpool_layer.c b/src/avgpool_layer.c index b6932fe7..83034dbe 100644 --- a/src/avgpool_layer.c +++ b/src/avgpool_layer.c @@ -37,7 +37,7 @@ void resize_avgpool_layer(avgpool_layer *l, int w, int h) l->inputs = h*w*l->c; } -void forward_avgpool_layer(const avgpool_layer l, network_state state) +void forward_avgpool_layer(const avgpool_layer l, network net) { int b,i,k; @@ -47,14 +47,14 @@ void forward_avgpool_layer(const avgpool_layer l, network_state state) l.output[out_index] = 0; for(i = 0; i < l.h*l.w; ++i){ int in_index = i + l.h*l.w*(k + b*l.c); - l.output[out_index] += state.input[in_index]; + 
l.output[out_index] += net.input[in_index]; } l.output[out_index] /= l.h*l.w; } } } -void backward_avgpool_layer(const avgpool_layer l, network_state state) +void backward_avgpool_layer(const avgpool_layer l, network net) { int b,i,k; @@ -63,7 +63,7 @@ void backward_avgpool_layer(const avgpool_layer l, network_state state) int out_index = k + b*l.c; for(i = 0; i < l.h*l.w; ++i){ int in_index = i + l.h*l.w*(k + b*l.c); - state.delta[in_index] += l.delta[out_index] / (l.h*l.w); + net.delta[in_index] += l.delta[out_index] / (l.h*l.w); } } } diff --git a/src/avgpool_layer.h b/src/avgpool_layer.h index f8329aea..3bd356c4 100644 --- a/src/avgpool_layer.h +++ b/src/avgpool_layer.h @@ -11,12 +11,12 @@ typedef layer avgpool_layer; image get_avgpool_image(avgpool_layer l); avgpool_layer make_avgpool_layer(int batch, int w, int h, int c); void resize_avgpool_layer(avgpool_layer *l, int w, int h); -void forward_avgpool_layer(const avgpool_layer l, network_state state); -void backward_avgpool_layer(const avgpool_layer l, network_state state); +void forward_avgpool_layer(const avgpool_layer l, network net); +void backward_avgpool_layer(const avgpool_layer l, network net); #ifdef GPU -void forward_avgpool_layer_gpu(avgpool_layer l, network_state state); -void backward_avgpool_layer_gpu(avgpool_layer l, network_state state); +void forward_avgpool_layer_gpu(avgpool_layer l, network net); +void backward_avgpool_layer_gpu(avgpool_layer l, network net); #endif #endif diff --git a/src/avgpool_layer_kernels.cu b/src/avgpool_layer_kernels.cu index b7e2770e..a7eca3ae 100644 --- a/src/avgpool_layer_kernels.cu +++ b/src/avgpool_layer_kernels.cu @@ -43,19 +43,19 @@ __global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float } } -extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network_state state) +extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network net) { size_t n = layer.c*layer.batch; - forward_avgpool_layer_kernel<<>>(n, layer.w, layer.h, layer.c, state.input, layer.output_gpu); + forward_avgpool_layer_kernel<<>>(n, layer.w, layer.h, layer.c, net.input_gpu, layer.output_gpu); check_error(cudaPeekAtLastError()); } -extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network_state state) +extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network net) { size_t n = layer.c*layer.batch; - backward_avgpool_layer_kernel<<>>(n, layer.w, layer.h, layer.c, state.delta, layer.delta_gpu); + backward_avgpool_layer_kernel<<>>(n, layer.w, layer.h, layer.c, net.delta_gpu, layer.delta_gpu); check_error(cudaPeekAtLastError()); } diff --git a/src/batchnorm_layer.c b/src/batchnorm_layer.c index 1be70aa0..de081c4e 100644 --- a/src/batchnorm_layer.c +++ b/src/batchnorm_layer.c @@ -132,14 +132,15 @@ void resize_batchnorm_layer(layer *layer, int w, int h) fprintf(stderr, "Not implemented\n"); } -void forward_batchnorm_layer(layer l, network_state state) +void forward_batchnorm_layer(layer l, network net) { - if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1); + if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1); if(l.type == CONNECTED){ l.out_c = l.outputs; l.out_h = l.out_w = 1; } - if(state.train){ + copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1); + if(net.train){ mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean); variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance); @@ -148,7 +149,6 @@ void forward_batchnorm_layer(layer l, network_state state) 
scal_cpu(l.out_c, .99, l.rolling_variance, 1); axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1); - copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1); normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w); copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1); } else { @@ -158,8 +158,12 @@ void forward_batchnorm_layer(layer l, network_state state) add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w); } -void backward_batchnorm_layer(const layer l, network_state state) +void backward_batchnorm_layer(layer l, network net) { + if(!net.train){ + l.mean = l.rolling_mean; + l.variance = l.rolling_variance; + } backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h); backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates); @@ -168,7 +172,7 @@ void backward_batchnorm_layer(const layer l, network_state state) mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta); variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta); normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w*l.out_h, l.delta); - if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1); + if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, net.delta, 1); } #ifdef GPU @@ -186,35 +190,35 @@ void push_batchnorm_layer(layer l) cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.c); } -void forward_batchnorm_layer_gpu(layer l, network_state state) +void forward_batchnorm_layer_gpu(layer l, network net) { - if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1); + if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); if(l.type == CONNECTED){ l.out_c = l.outputs; l.out_h = l.out_w = 1; } - if (state.train) { + copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); + if (net.train) { #ifdef CUDNN - copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); float one = 1; float zero = 0; cudnnBatchNormalizationForwardTraining(cudnn_handle(), - CUDNN_BATCHNORM_SPATIAL, - &one, - &zero, - l.dstTensorDesc, - l.x_gpu, - l.dstTensorDesc, - l.output_gpu, - l.normTensorDesc, - l.scales_gpu, - l.biases_gpu, - .01, - l.rolling_mean_gpu, - l.rolling_variance_gpu, - .00001, - l.mean_gpu, - l.variance_gpu); + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + l.dstTensorDesc, + l.x_gpu, + l.dstTensorDesc, + l.output_gpu, + l.normTensorDesc, + l.scales_gpu, + l.biases_gpu, + .01, + l.rolling_mean_gpu, + l.rolling_variance_gpu, + .00001, + l.mean_gpu, + l.variance_gpu); #else fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu); fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu); @@ -239,8 +243,12 @@ void forward_batchnorm_layer_gpu(layer l, network_state state) } -void backward_batchnorm_layer_gpu(const layer l, network_state state) +void backward_batchnorm_layer_gpu(layer l, network net) { + if(!net.train){ + l.mean_gpu = l.rolling_mean_gpu; + l.variance_gpu = l.rolling_variance_gpu; + } #ifdef CUDNN float one = 1; float zero = 0; @@ -274,6 +282,6 @@ void backward_batchnorm_layer_gpu(const layer l, network_state state) fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu); normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, 
l.out_c, l.out_w*l.out_h, l.delta_gpu); #endif - if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1); + if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/batchnorm_layer.h b/src/batchnorm_layer.h index 99d1d0fe..25a18a3c 100644 --- a/src/batchnorm_layer.h +++ b/src/batchnorm_layer.h @@ -6,12 +6,12 @@ #include "network.h" layer make_batchnorm_layer(int batch, int w, int h, int c); -void forward_batchnorm_layer(layer l, network_state state); -void backward_batchnorm_layer(layer l, network_state state); +void forward_batchnorm_layer(layer l, network net); +void backward_batchnorm_layer(layer l, network net); #ifdef GPU -void forward_batchnorm_layer_gpu(layer l, network_state state); -void backward_batchnorm_layer_gpu(layer l, network_state state); +void forward_batchnorm_layer_gpu(layer l, network net); +void backward_batchnorm_layer_gpu(layer l, network net); void pull_batchnorm_layer(layer l); void push_batchnorm_layer(layer l); #endif diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index a833adbb..ac29d3f0 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -145,7 +145,7 @@ __global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if (index >= N) return; - x[index] = x[index] - (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps)); + x[index] = x[index] + (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps)); //if(index == 0) printf("%f %f %f %f\n", m[index], v[index], (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps))); } diff --git a/src/classifier.c b/src/classifier.c index 6011c935..1b7ff38a 100644 --- a/src/classifier.c +++ b/src/classifier.c @@ -123,7 +123,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch); save_weights(net, buff); } - if(get_current_batch(net)%100 == 0){ + if(get_current_batch(net)%1000 == 0){ char buff[256]; sprintf(buff, "%s/%s.backup",backup_directory,base); save_weights(net, buff); diff --git a/src/connected_layer.c b/src/connected_layer.c index c23d6fa4..f8c74235 100644 --- a/src/connected_layer.c +++ b/src/connected_layer.c @@ -125,19 +125,19 @@ void update_connected_layer(connected_layer l, int batch, float learning_rate, f scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1); } -void forward_connected_layer(connected_layer l, network_state state) +void forward_connected_layer(connected_layer l, network net) { int i; fill_cpu(l.outputs*l.batch, 0, l.output, 1); int m = l.batch; int k = l.inputs; int n = l.outputs; - float *a = state.input; + float *a = net.input; float *b = l.weights; float *c = l.output; gemm(0,1,m,n,k,1,a,k,b,k,1,c,n); if(l.batch_normalize){ - if(state.train){ + if(net.train){ mean_cpu(l.output, l.batch, l.outputs, 1, l.mean); variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance); @@ -160,7 +160,7 @@ void forward_connected_layer(connected_layer l, network_state state) activate_array(l.output, l.outputs*l.batch, l.activation); } -void backward_connected_layer(connected_layer l, network_state state) +void backward_connected_layer(connected_layer l, network net) { int i; gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); @@ -181,7 +181,7 @@ void backward_connected_layer(connected_layer l, network_state state) int 
k = l.batch; int n = l.inputs; float *a = l.delta; - float *b = state.input; + float *b = net.input; float *c = l.weight_updates; gemm(1,0,m,n,k,1,a,m,b,n,1,c,n); @@ -191,7 +191,7 @@ void backward_connected_layer(connected_layer l, network_state state) a = l.delta; b = l.weights; - c = state.delta; + c = net.delta; if(c) gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); } @@ -274,7 +274,7 @@ void update_connected_layer_gpu(connected_layer l, int batch, float learning_rat scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1); } -void forward_connected_layer_gpu(connected_layer l, network_state state) +void forward_connected_layer_gpu(connected_layer l, network net) { int i; fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); @@ -282,12 +282,12 @@ void forward_connected_layer_gpu(connected_layer l, network_state state) int m = l.batch; int k = l.inputs; int n = l.outputs; - float * a = state.input; + float * a = net.input_gpu; float * b = l.weights_gpu; float * c = l.output_gpu; gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n); if(l.batch_normalize){ - forward_batchnorm_layer_gpu(l, state); + forward_batchnorm_layer_gpu(l, net); } for(i = 0; i < l.batch; ++i){ axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1); @@ -295,7 +295,7 @@ void forward_connected_layer_gpu(connected_layer l, network_state state) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); } -void backward_connected_layer_gpu(connected_layer l, network_state state) +void backward_connected_layer_gpu(connected_layer l, network net) { int i; constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); @@ -305,14 +305,14 @@ void backward_connected_layer_gpu(connected_layer l, network_state state) } if(l.batch_normalize){ - backward_batchnorm_layer_gpu(l, state); + backward_batchnorm_layer_gpu(l, net); } int m = l.outputs; int k = l.batch; int n = l.inputs; float * a = l.delta_gpu; - float * b = state.input; + float * b = net.input_gpu; float * c = l.weight_updates_gpu; gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n); @@ -322,7 +322,7 @@ void backward_connected_layer_gpu(connected_layer l, network_state state) a = l.delta_gpu; b = l.weights_gpu; - c = state.delta; + c = net.delta_gpu; if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n); } diff --git a/src/connected_layer.h b/src/connected_layer.h index 23797b10..62dd497f 100644 --- a/src/connected_layer.h +++ b/src/connected_layer.h @@ -9,15 +9,15 @@ typedef layer connected_layer; connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize); -void forward_connected_layer(connected_layer layer, network_state state); -void backward_connected_layer(connected_layer layer, network_state state); +void forward_connected_layer(connected_layer layer, network net); +void backward_connected_layer(connected_layer layer, network net); void update_connected_layer(connected_layer layer, int batch, float learning_rate, float momentum, float decay); void denormalize_connected_layer(layer l); void statistics_connected_layer(layer l); #ifdef GPU -void forward_connected_layer_gpu(connected_layer layer, network_state state); -void backward_connected_layer_gpu(connected_layer layer, network_state state); +void forward_connected_layer_gpu(connected_layer layer, network net); +void backward_connected_layer_gpu(connected_layer layer, network net); void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay); void push_connected_layer(connected_layer layer); void pull_connected_layer(connected_layer 
layer); diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 9eb058ce..41dec50d 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -70,7 +70,7 @@ void binarize_weights_gpu(float *weights, int n, int size, float *binary) check_error(cudaPeekAtLastError()); } -void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) +void forward_convolutional_layer_gpu(convolutional_layer l, network net) { fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); if(l.binary){ @@ -81,8 +81,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) if(l.xnor){ binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu); swap_binary(&l); - binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu); - state.input = l.binary_input_gpu; + binarize_gpu(net.input_gpu, l.c*l.h*l.w*l.batch, l.binary_input_gpu); + net.input_gpu = l.binary_input_gpu; } #ifdef CUDNN @@ -90,12 +90,12 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) cudnnConvolutionForward(cudnn_handle(), &one, l.srcTensorDesc, - state.input, + net.input_gpu, l.weightDesc, l.weights_gpu, l.convDesc, l.fw_algo, - state.workspace, + net.workspace, l.workspace_size, &one, l.dstTensorDesc, @@ -107,16 +107,16 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) int k = l.size*l.size*l.c; int n = l.out_w*l.out_h; for(i = 0; i < l.batch; ++i){ - im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + im2col_ongpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace); float * a = l.weights_gpu; - float * b = state.workspace; + float * b = net.workspace; float * c = l.output_gpu; gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n); } #endif if (l.batch_normalize) { - forward_batchnorm_layer_gpu(l, state); + forward_batchnorm_layer_gpu(l, net); } else { add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); } @@ -168,40 +168,40 @@ extern "C" void smooth_layer(layer l, int size, float rate) check_error(cudaPeekAtLastError()); } -void backward_convolutional_layer_gpu(convolutional_layer l, network_state state) +void backward_convolutional_layer_gpu(convolutional_layer l, network net) { if(l.smooth){ smooth_layer(l, 5, l.smooth); } - //constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); + constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); if(l.batch_normalize){ - backward_batchnorm_layer_gpu(l, state); + backward_batchnorm_layer_gpu(l, net); } else { backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); } - float *original_input = state.input; + float *original_input = net.input_gpu; - if(l.xnor) state.input = l.binary_input_gpu; + if(l.xnor) net.input_gpu = l.binary_input_gpu; #ifdef CUDNN float one = 1; cudnnConvolutionBackwardFilter(cudnn_handle(), &one, l.srcTensorDesc, - state.input, + net.input_gpu, l.ddstTensorDesc, l.delta_gpu, l.convDesc, l.bf_algo, - state.workspace, + net.workspace, l.workspace_size, &one, l.dweightDesc, l.weight_updates_gpu); - if(state.delta){ + if(net.delta_gpu){ if(l.binary || l.xnor) swap_binary(&l); cudnnConvolutionBackwardData(cudnn_handle(), &one, @@ -211,13 +211,13 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state l.delta_gpu, l.convDesc, l.bd_algo, - state.workspace, + net.workspace, l.workspace_size, 
&one, l.dsrcTensorDesc, - state.delta); + net.delta_gpu); if(l.binary || l.xnor) swap_binary(&l); - if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, state.delta); + if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu); } #else @@ -228,25 +228,25 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state int i; for(i = 0; i < l.batch; ++i){ float * a = l.delta_gpu; - float * b = state.workspace; + float * b = net.workspace; float * c = l.weight_updates_gpu; - im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + im2col_ongpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace); gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n); - if(state.delta){ + if(net.delta_gpu){ if(l.binary || l.xnor) swap_binary(&l); float * a = l.weights_gpu; float * b = l.delta_gpu; - float * c = state.workspace; + float * c = net.workspace; gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k); - col2im_ongpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta + i*l.c*l.h*l.w); + col2im_ongpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu + i*l.c*l.h*l.w); if(l.binary || l.xnor) { swap_binary(&l); } - if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w); + if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w); } } #endif @@ -286,33 +286,42 @@ void push_convolutional_layer(convolutional_layer layer) } } -void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay) +void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch) { - int size = layer.size*layer.size*layer.c*layer.n; - axpy_ongpu(layer.n, learning_rate/batch, layer.bias_updates_gpu, 1, layer.biases_gpu, 1); - scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1); + scal_ongpu(n, B1, m, 1); + scal_ongpu(n, B2, v, 1); + axpy_ongpu(n, -decay*batch, w, 1, d, 1); - if(layer.scales_gpu){ - axpy_ongpu(layer.n, learning_rate/batch, layer.scale_updates_gpu, 1, layer.scales_gpu, 1); - scal_ongpu(layer.n, momentum, layer.scale_updates_gpu, 1); - } + axpy_ongpu(n, (1-B1), d, 1, m, 1); + mul_ongpu(n, d, 1, d, 1); + axpy_ongpu(n, (1-B2), d, 1, v, 1); - if(layer.adam){ - scal_ongpu(size, layer.B1, layer.m_gpu, 1); - scal_ongpu(size, layer.B2, layer.v_gpu, 1); + adam_gpu(n, w, m, v, B1, B2, rate/batch, eps, 1000); + fill_ongpu(n, 0, d, 1); +} - axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1); +void update_convolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +{ + int size = l.size*l.size*l.c*l.n; - axpy_ongpu(size, -(1-layer.B1), layer.weight_updates_gpu, 1, layer.m_gpu, 1); - mul_ongpu(size, layer.weight_updates_gpu, 1, layer.weight_updates_gpu, 1); - axpy_ongpu(size, (1-layer.B2), layer.weight_updates_gpu, 1, layer.v_gpu, 1); - - adam_gpu(size, layer.weights_gpu, layer.m_gpu, layer.v_gpu, layer.B1, layer.B2, learning_rate/batch, layer.eps, layer.t+1); - fill_ongpu(size, 0, layer.weight_updates_gpu, 1); + if(l.adam){ + adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch); + adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, 
decay, learning_rate, l.n, batch); + if(l.scales_gpu){ + adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch); + } }else{ - axpy_ongpu(size, -decay*batch, layer.weights_gpu, 1, layer.weight_updates_gpu, 1); - axpy_ongpu(size, learning_rate/batch, layer.weight_updates_gpu, 1, layer.weights_gpu, 1); - scal_ongpu(size, momentum, layer.weight_updates_gpu, 1); + axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_ongpu(size, momentum, l.weight_updates_gpu, 1); + + axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); + scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1); + + if(l.scales_gpu){ + axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); + scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1); + } } } diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 04d21310..182a113d 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -12,11 +12,6 @@ #include "xnor_layer.h" #endif -#ifndef AI2 -#define AI2 0 -void forward_xnor_layer(layer l, network_state state); -#endif - void swap_binary(convolutional_layer *l) { float *swap = l->weights; @@ -188,9 +183,14 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.biases = calloc(n, sizeof(float)); l.bias_updates = calloc(n, sizeof(float)); + l.nweights = c*n*size*size; + l.nbiases = n; + // float scale = 1./sqrt(size*size*c); float scale = sqrt(2./(size*size*c)); - for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1); + scale = .02; + //for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1); + for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_normal(); int out_w = convolutional_out_width(l); int out_h = convolutional_out_height(l); l.out_h = out_h; @@ -237,6 +237,10 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.adam = 1; l.m = calloc(c*n*size*size, sizeof(float)); l.v = calloc(c*n*size*size, sizeof(float)); + l.bias_m = calloc(n, sizeof(float)); + l.scale_m = calloc(n, sizeof(float)); + l.bias_v = calloc(n, sizeof(float)); + l.scale_v = calloc(n, sizeof(float)); } #ifdef GPU @@ -248,6 +252,10 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int if (adam) { l.m_gpu = cuda_make_array(l.m, c*n*size*size); l.v_gpu = cuda_make_array(l.v, c*n*size*size); + l.bias_m_gpu = cuda_make_array(l.bias_m, n); + l.bias_v_gpu = cuda_make_array(l.bias_v, n); + l.scale_m_gpu = cuda_make_array(l.scale_m, n); + l.scale_v_gpu = cuda_make_array(l.scale_v, n); } l.weights_gpu = cuda_make_array(l.weights, c*n*size*size); @@ -319,6 +327,7 @@ void denormalize_convolutional_layer(convolutional_layer l) } } +/* void test_convolutional_layer() { convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0); @@ -338,10 +347,10 @@ void test_convolutional_layer() 3,3,3,3,3, 3,3,3,3,3, 3,3,3,3,3}; - network_state state = {0}; - state.input = data; - forward_convolutional_layer(l, state); + //net.input = data; + //forward_convolutional_layer(l); } +*/ void resize_convolutional_layer(convolutional_layer *l, int w, int h) { @@ -418,7 +427,7 @@ void backward_bias(float *bias_updates, float *delta, int batch, int n, int size } } -void forward_convolutional_layer(convolutional_layer l, network_state state) +void 
forward_convolutional_layer(convolutional_layer l, network net) { int out_h = l.out_h; int out_w = l.out_w; @@ -429,8 +438,8 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) if(l.xnor){ binarize_weights(l.weights, l.n, l.c*l.size*l.size, l.binary_weights); swap_binary(&l); - binarize_cpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input); - state.input = l.binary_input; + binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input); + net.input = l.binary_input; } int m = l.n; @@ -439,19 +448,19 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) float *a = l.weights; - float *b = state.workspace; + float *b = net.workspace; float *c = l.output; for(i = 0; i < l.batch; ++i){ - im2col_cpu(state.input, l.c, l.h, l.w, + im2col_cpu(net.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b); gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); c += n*m; - state.input += l.c*l.h*l.w; + net.input += l.c*l.h*l.w; } if(l.batch_normalize){ - forward_batchnorm_layer(l, state); + forward_batchnorm_layer(l, net); } else { add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); } @@ -460,7 +469,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) if(l.binary || l.xnor) swap_binary(&l); } -void backward_convolutional_layer(convolutional_layer l, network_state state) +void backward_convolutional_layer(convolutional_layer l, network net) { int i; int m = l.n; @@ -470,30 +479,30 @@ void backward_convolutional_layer(convolutional_layer l, network_state state) gradient_array(l.output, m*k*l.batch, l.activation, l.delta); if(l.batch_normalize){ - backward_batchnorm_layer(l, state); + backward_batchnorm_layer(l, net); } else { backward_bias(l.bias_updates, l.delta, l.batch, l.n, k); } for(i = 0; i < l.batch; ++i){ float *a = l.delta + i*m*k; - float *b = state.workspace; + float *b = net.workspace; float *c = l.weight_updates; - float *im = state.input+i*l.c*l.h*l.w; + float *im = net.input+i*l.c*l.h*l.w; im2col_cpu(im, l.c, l.h, l.w, l.size, l.stride, l.pad, b); gemm(0,1,m,n,k,1,a,k,b,k,1,c,n); - if(state.delta){ + if(net.delta){ a = l.weights; b = l.delta + i*m*k; - c = state.workspace; + c = net.workspace; gemm(1,0,n,k,m,1,a,n,b,k,0,c,k); - col2im_cpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w); + col2im_cpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta+i*l.c*l.h*l.w); } } } @@ -553,8 +562,14 @@ image *get_weights(convolutional_layer l) int i; for(i = 0; i < l.n; ++i){ weights[i] = copy_image(get_convolutional_weight(l, i)); - //normalize_image(weights[i]); + normalize_image(weights[i]); + /* + char buff[256]; + sprintf(buff, "filter%d", i); + save_image(weights[i], buff); + */ } + //error("hey"); return weights; } diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 970aa101..e00e6788 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -10,8 +10,8 @@ typedef layer convolutional_layer; #ifdef GPU -void forward_convolutional_layer_gpu(convolutional_layer layer, network_state state); -void backward_convolutional_layer_gpu(convolutional_layer layer, network_state state); +void forward_convolutional_layer_gpu(convolutional_layer layer, network net); +void backward_convolutional_layer_gpu(convolutional_layer layer, network net); void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay); void push_convolutional_layer(convolutional_layer layer); @@ -19,6 +19,7 @@ void 
pull_convolutional_layer(convolutional_layer layer); void add_bias_gpu(float *output, float *biases, int batch, int n, int size); void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size); +void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch); #ifdef CUDNN void cudnn_convolutional_setup(layer *l); #endif @@ -27,18 +28,19 @@ void cudnn_convolutional_setup(layer *l); convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam); void denormalize_convolutional_layer(convolutional_layer l); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); -void forward_convolutional_layer(const convolutional_layer layer, network_state state); +void forward_convolutional_layer(const convolutional_layer layer, network net); void update_convolutional_layer(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay); image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_weights); void binarize_weights(float *weights, int n, int size, float *binary); void swap_binary(convolutional_layer *l); void binarize_weights2(float *weights, int n, int size, char *binary, float *scales); -void backward_convolutional_layer(convolutional_layer layer, network_state state); +void backward_convolutional_layer(convolutional_layer layer, network net); void add_bias(float *output, float *biases, int batch, int n, int size); void backward_bias(float *bias_updates, float *delta, int batch, int n, int size); +image *get_weights(convolutional_layer l); image get_convolutional_image(convolutional_layer layer); image get_convolutional_delta(convolutional_layer layer); image get_convolutional_weight(convolutional_layer layer, int i); diff --git a/src/cost_layer.c b/src/cost_layer.c index 320f7fe5..76c001f0 100644 --- a/src/cost_layer.c +++ b/src/cost_layer.c @@ -73,28 +73,28 @@ void resize_cost_layer(cost_layer *l, int inputs) #endif } -void forward_cost_layer(cost_layer l, network_state state) +void forward_cost_layer(cost_layer l, network net) { - if (!state.truth) return; + if (!net.truth) return; if(l.cost_type == MASKED){ int i; for(i = 0; i < l.batch*l.inputs; ++i){ - if(state.truth[i] == SECRET_NUM) state.input[i] = SECRET_NUM; + if(net.truth[i] == SECRET_NUM) net.input[i] = SECRET_NUM; } } if(l.cost_type == SMOOTH){ - smooth_l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output); + smooth_l1_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output); }else if(l.cost_type == L1){ - l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output); + l1_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output); } else { - l2_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output); + l2_cpu(l.batch*l.inputs, net.input, net.truth, l.delta, l.output); } l.cost[0] = sum_array(l.output, l.batch*l.inputs); } -void backward_cost_layer(const cost_layer l, network_state state) +void backward_cost_layer(const cost_layer l, network net) { - axpy_cpu(l.batch*l.inputs, l.scale, l.delta, 1, state.delta, 1); + axpy_cpu(l.batch*l.inputs, l.scale, l.delta, 1, net.delta, 1); } #ifdef GPU @@ -118,23 +118,23 @@ int float_abs_compare (const void * a, const void * b) return (fa > fb) - (fa < fb); } -void forward_cost_layer_gpu(cost_layer l, network_state state) +void 
forward_cost_layer_gpu(cost_layer l, network net) { - if (!state.truth) return; + if (!net.truth) return; if(l.smooth){ - scal_ongpu(l.batch*l.inputs, (1-l.smooth), state.truth, 1); - add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, state.truth, 1); + scal_ongpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1); + add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1); } if (l.cost_type == MASKED) { - mask_ongpu(l.batch*l.inputs, state.input, SECRET_NUM, state.truth); + mask_ongpu(l.batch*l.inputs, net.input_gpu, SECRET_NUM, net.truth_gpu); } if(l.cost_type == SMOOTH){ - smooth_l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu); + smooth_l1_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu); } else if (l.cost_type == L1){ - l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu); + l1_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu); } else { - l2_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu); + l2_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu); } if(l.ratio){ @@ -155,9 +155,9 @@ void forward_cost_layer_gpu(cost_layer l, network_state state) l.cost[0] = sum_array(l.output, l.batch*l.inputs); } -void backward_cost_layer_gpu(const cost_layer l, network_state state) +void backward_cost_layer_gpu(const cost_layer l, network net) { - axpy_ongpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, state.delta, 1); + axpy_ongpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/cost_layer.h b/src/cost_layer.h index a692831e..ceb64de0 100644 --- a/src/cost_layer.h +++ b/src/cost_layer.h @@ -8,13 +8,13 @@ typedef layer cost_layer; COST_TYPE get_cost_type(char *s); char *get_cost_string(COST_TYPE a); cost_layer make_cost_layer(int batch, int inputs, COST_TYPE type, float scale); -void forward_cost_layer(const cost_layer l, network_state state); -void backward_cost_layer(const cost_layer l, network_state state); +void forward_cost_layer(const cost_layer l, network net); +void backward_cost_layer(const cost_layer l, network net); void resize_cost_layer(cost_layer *l, int inputs); #ifdef GPU -void forward_cost_layer_gpu(cost_layer l, network_state state); -void backward_cost_layer_gpu(const cost_layer l, network_state state); +void forward_cost_layer_gpu(cost_layer l, network net); +void backward_cost_layer_gpu(const cost_layer l, network net); #endif #endif diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 54958803..2478fef5 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -88,10 +88,10 @@ void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, update_convolutional_layer(*(l.output_layer), batch, learning_rate, momentum, decay); } -void forward_crnn_layer(layer l, network_state state) +void forward_crnn_layer(layer l, network net) { - network_state s = {0}; - s.train = state.train; + network s = net; + s.train = net.train; int i; layer input_layer = *(l.input_layer); layer self_layer = *(l.self_layer); @@ -100,17 +100,17 @@ void forward_crnn_layer(layer l, network_state state) fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1); fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1); fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1); - if(state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1); + if(net.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1); for (i = 0; i < l.steps; ++i) { - s.input = state.input; + s.input 
= net.input; forward_convolutional_layer(input_layer, s); s.input = l.state; forward_convolutional_layer(self_layer, s); float *old_state = l.state; - if(state.train) l.state += l.hidden*l.batch; + if(net.train) l.state += l.hidden*l.batch; if(l.shortcut){ copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1); }else{ @@ -122,17 +122,16 @@ void forward_crnn_layer(layer l, network_state state) s.input = l.state; forward_convolutional_layer(output_layer, s); - state.input += l.inputs*l.batch; + net.input += l.inputs*l.batch; increment_layer(&input_layer, 1); increment_layer(&self_layer, 1); increment_layer(&output_layer, 1); } } -void backward_crnn_layer(layer l, network_state state) +void backward_crnn_layer(layer l, network net) { - network_state s = {0}; - s.train = state.train; + network s = net; int i; layer input_layer = *(l.input_layer); layer self_layer = *(l.self_layer); @@ -168,8 +167,8 @@ void backward_crnn_layer(layer l, network_state state) copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1); if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1); - s.input = state.input + i*l.inputs*l.batch; - if(state.delta) s.delta = state.delta + i*l.inputs*l.batch; + s.input = net.input + i*l.inputs*l.batch; + if(net.delta) s.delta = net.delta + i*l.inputs*l.batch; else s.delta = 0; backward_convolutional_layer(input_layer, s); @@ -202,10 +201,9 @@ void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float moment update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay); } -void forward_crnn_layer_gpu(layer l, network_state state) +void forward_crnn_layer_gpu(layer l, network net) { - network_state s = {0}; - s.train = state.train; + network s = net; int i; layer input_layer = *(l.input_layer); layer self_layer = *(l.self_layer); @@ -214,17 +212,17 @@ void forward_crnn_layer_gpu(layer l, network_state state) fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1); fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1); fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1); - if(state.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1); + if(net.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1); for (i = 0; i < l.steps; ++i) { - s.input = state.input; + s.input_gpu = net.input_gpu; forward_convolutional_layer_gpu(input_layer, s); - s.input = l.state_gpu; + s.input_gpu = l.state_gpu; forward_convolutional_layer_gpu(self_layer, s); float *old_state = l.state_gpu; - if(state.train) l.state_gpu += l.hidden*l.batch; + if(net.train) l.state_gpu += l.hidden*l.batch; if(l.shortcut){ copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1); }else{ @@ -233,20 +231,20 @@ void forward_crnn_layer_gpu(layer l, network_state state) axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1); axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); - s.input = l.state_gpu; + s.input_gpu = l.state_gpu; forward_convolutional_layer_gpu(output_layer, s); - state.input += l.inputs*l.batch; + net.input_gpu += l.inputs*l.batch; increment_layer(&input_layer, 1); increment_layer(&self_layer, 1); increment_layer(&output_layer, 1); } } -void backward_crnn_layer_gpu(layer l, network_state state) +void backward_crnn_layer_gpu(layer l, network net) { - network_state s = {0}; - s.train = state.train; + network s = net; + s.train = net.train; int i; layer input_layer = *(l.input_layer); layer self_layer 
= *(l.self_layer); @@ -259,22 +257,22 @@ void backward_crnn_layer_gpu(layer l, network_state state) copy_ongpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1); axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); - s.input = l.state_gpu; - s.delta = self_layer.delta_gpu; + s.input_gpu = l.state_gpu; + s.delta_gpu = self_layer.delta_gpu; backward_convolutional_layer_gpu(output_layer, s); l.state_gpu -= l.hidden*l.batch; - s.input = l.state_gpu; - s.delta = self_layer.delta_gpu - l.hidden*l.batch; - if (i == 0) s.delta = 0; + s.input_gpu = l.state_gpu; + s.delta_gpu = self_layer.delta_gpu - l.hidden*l.batch; + if (i == 0) s.delta_gpu = 0; backward_convolutional_layer_gpu(self_layer, s); copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1); if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1); - s.input = state.input + i*l.inputs*l.batch; - if(state.delta) s.delta = state.delta + i*l.inputs*l.batch; - else s.delta = 0; + s.input_gpu = net.input_gpu + i*l.inputs*l.batch; + if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch; + else s.delta_gpu = 0; backward_convolutional_layer_gpu(input_layer, s); increment_layer(&input_layer, -1); diff --git a/src/crnn_layer.h b/src/crnn_layer.h index 0da942ee..ce89211b 100644 --- a/src/crnn_layer.h +++ b/src/crnn_layer.h @@ -8,13 +8,13 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int output_filters, int steps, ACTIVATION activation, int batch_normalize); -void forward_crnn_layer(layer l, network_state state); -void backward_crnn_layer(layer l, network_state state); +void forward_crnn_layer(layer l, network net); +void backward_crnn_layer(layer l, network net); void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay); #ifdef GPU -void forward_crnn_layer_gpu(layer l, network_state state); -void backward_crnn_layer_gpu(layer l, network_state state); +void forward_crnn_layer_gpu(layer l, network net); +void backward_crnn_layer_gpu(layer l, network net); void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); void push_crnn_layer(layer l); void pull_crnn_layer(layer l); diff --git a/src/crop_layer.c b/src/crop_layer.c index 11c59b49..3b918529 100644 --- a/src/crop_layer.c +++ b/src/crop_layer.c @@ -10,8 +10,8 @@ image get_crop_image(crop_layer l) return float_to_image(w,h,c,l.output); } -void backward_crop_layer(const crop_layer l, network_state state){} -void backward_crop_layer_gpu(const crop_layer l, network_state state){} +void backward_crop_layer(const crop_layer l, network net){} +void backward_crop_layer_gpu(const crop_layer l, network net){} crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure) { @@ -64,7 +64,7 @@ void resize_crop_layer(layer *l, int w, int h) } -void forward_crop_layer(const crop_layer l, network_state state) +void forward_crop_layer(const crop_layer l, network net) { int i,j,c,b,row,col; int index; @@ -78,7 +78,7 @@ void forward_crop_layer(const crop_layer l, network_state state) scale = 1; trans = 0; } - if(!state.train){ + if(!net.train){ flip = 0; dh = (l.h - l.out_h)/2; dw = (l.w - l.out_w)/2; @@ -94,7 +94,7 @@ void forward_crop_layer(const crop_layer l, network_state state) } row = i + dh; index = col+l.w*(row+l.h*(c + l.c*b)); - l.output[count++] = state.input[index]*scale + 
trans; + l.output[count++] = net.input[index]*scale + trans; } } } diff --git a/src/crop_layer.h b/src/crop_layer.h index 3aa2d3dd..3b5883c4 100644 --- a/src/crop_layer.h +++ b/src/crop_layer.h @@ -9,11 +9,11 @@ typedef layer crop_layer; image get_crop_image(crop_layer l); crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure); -void forward_crop_layer(const crop_layer l, network_state state); +void forward_crop_layer(const crop_layer l, network net); void resize_crop_layer(layer *l, int w, int h); #ifdef GPU -void forward_crop_layer_gpu(crop_layer l, network_state state); +void forward_crop_layer_gpu(crop_layer l, network net); #endif #endif diff --git a/src/crop_layer_kernels.cu b/src/crop_layer_kernels.cu index 8a086305..b6568219 100644 --- a/src/crop_layer_kernels.cu +++ b/src/crop_layer_kernels.cu @@ -180,7 +180,7 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k); } -extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state) +extern "C" void forward_crop_layer_gpu(crop_layer layer, network net) { cuda_random(layer.rand_gpu, layer.batch*8); @@ -195,12 +195,12 @@ extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state) int size = layer.batch * layer.w * layer.h; - levels_image_kernel<<>>(state.input, layer.rand_gpu, layer.batch, layer.w, layer.h, state.train, layer.saturation, layer.exposure, translate, scale, layer.shift); + levels_image_kernel<<>>(net.input_gpu, layer.rand_gpu, layer.batch, layer.w, layer.h, net.train, layer.saturation, layer.exposure, translate, scale, layer.shift); check_error(cudaPeekAtLastError()); size = layer.batch*layer.c*layer.out_w*layer.out_h; - forward_crop_layer_kernel<<>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, state.train, layer.flip, radians, layer.output_gpu); + forward_crop_layer_kernel<<>>(net.input_gpu, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, net.train, layer.flip, radians, layer.output_gpu); check_error(cudaPeekAtLastError()); /* diff --git a/src/darknet.c b/src/darknet.c index 4b797acc..a3aa6555 100644 --- a/src/darknet.c +++ b/src/darknet.c @@ -348,6 +348,32 @@ void denormalize_net(char *cfgfile, char *weightfile, char *outfile) save_weights(net, outfile); } +void mkimg(char *cfgfile, char *weightfile, int h, int w, int num, char *prefix) +{ + network net = load_network(cfgfile, weightfile, 0); + image *ims = get_weights(net.layers[0]); + int n = net.layers[0].n; + int z; + for(z = 0; z < num; ++z){ + image im = make_image(h, w, 3); + fill_image(im, .5); + int i; + for(i = 0; i < 100; ++i){ + image r = copy_image(ims[rand()%n]); + rotate_image_cw(r, rand()%4); + random_distort_image(r, 1, 1.5, 1.5); + int dx = rand()%(w-r.w); + int dy = rand()%(h-r.h); + ghost_image(r, im, dx, dy); + free_image(r); + } + char buff[256]; + sprintf(buff, "%s/gen_%d", prefix, z); + save_image(im, buff); + free_image(im); + } +} + void visualize(char *cfgfile, char *weightfile) { network net = parse_network_cfg(cfgfile); @@ -458,6 +484,8 @@ int main(int argc, char **argv) average(argc, argv); } else if (0 == strcmp(argv[1], "visualize")){ visualize(argv[2], (argc > 3) ? 
argv[3] : 0); + } else if (0 == strcmp(argv[1], "mkimg")){ + mkimg(argv[2], argv[3], atoi(argv[4]), atoi(argv[5]), atoi(argv[6]), argv[7]); } else if (0 == strcmp(argv[1], "imtest")){ test_resize(argv[2]); } else { diff --git a/src/deconvolutional_kernels.cu b/src/deconvolutional_kernels.cu index 381be23b..55aa162c 100644 --- a/src/deconvolutional_kernels.cu +++ b/src/deconvolutional_kernels.cu @@ -14,12 +14,9 @@ extern "C" { #include "cuda.h" } -extern "C" void forward_deconvolutional_layer_gpu(layer l, network_state state) +extern "C" void forward_deconvolutional_layer_gpu(layer l, network net) { int i; - int out_h = l.out_h; - int out_w = l.out_w; - int size = out_h*out_w; int m = l.size*l.size*l.n; int n = l.h*l.w; @@ -29,59 +26,57 @@ extern "C" void forward_deconvolutional_layer_gpu(layer l, network_state state) for(i = 0; i < l.batch; ++i){ float *a = l.weights_gpu; - float *b = state.input + i*l.c*l.h*l.w; - float *c = state.workspace; + float *b = net.input_gpu + i*l.c*l.h*l.w; + float *c = net.workspace; gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n); - col2im_ongpu(c, l.n, out_h, out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.n*size); + col2im_ongpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs); } if (l.batch_normalize) { - forward_batchnorm_layer_gpu(l, state); + forward_batchnorm_layer_gpu(l, net); } else { add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); } - activate_array_ongpu(l.output_gpu, l.batch*l.n*size, l.activation); + activate_array_ongpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation); } -extern "C" void backward_deconvolutional_layer_gpu(layer l, network_state state) +extern "C" void backward_deconvolutional_layer_gpu(layer l, network net) { - int out_h = l.out_h; - int out_w = l.out_w; - int size = out_h*out_w; int i; + constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); if(l.batch_normalize){ - backward_batchnorm_layer_gpu(l, state); + backward_batchnorm_layer_gpu(l, net); } else { backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); } - //if(state.delta) memset(state.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float)); + //if(net.delta_gpu) memset(net.delta_gpu, 0, l.batch*l.h*l.w*l.c*sizeof(float)); for(i = 0; i < l.batch; ++i){ int m = l.c; int n = l.size*l.size*l.n; int k = l.h*l.w; - float *a = state.input + i*m*n; - float *b = state.workspace; + float *a = net.input_gpu + i*m*k; + float *b = net.workspace; float *c = l.weight_updates_gpu; - im2col_ongpu(l.delta_gpu + i*l.n*size, l.n, out_h, out_w, + im2col_ongpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, b); gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n); - if(state.delta){ + if(net.delta_gpu){ int m = l.c; int n = l.h*l.w; int k = l.size*l.size*l.n; float *a = l.weights_gpu; - float *b = state.workspace; - float *c = state.delta + i*n*m; + float *b = net.workspace; + float *c = net.delta_gpu + i*n*m; gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n); } @@ -117,16 +112,25 @@ extern "C" void push_deconvolutional_layer(layer l) void update_deconvolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) { int size = l.size*l.size*l.c*l.n; - axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); - scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1); - if(l.scales_gpu){ - axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); - scal_ongpu(l.n, 
momentum, l.scale_updates_gpu, 1); + if(l.adam){ + adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch); + adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch); + if(l.scales_gpu){ + adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch); + } + }else{ + axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_ongpu(size, momentum, l.weight_updates_gpu, 1); + + axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); + scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1); + + if(l.scales_gpu){ + axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); + scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1); + } } - - axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_ongpu(size, momentum, l.weight_updates_gpu, 1); } diff --git a/src/deconvolutional_layer.c b/src/deconvolutional_layer.c index 7170975c..917d77fb 100644 --- a/src/deconvolutional_layer.c +++ b/src/deconvolutional_layer.c @@ -11,20 +11,11 @@ static size_t get_workspace_size(layer l){ - return (size_t)l.h*l.w*l.size*l.size*l.c*sizeof(float); + return (size_t)l.h*l.w*l.size*l.size*l.n*sizeof(float); } -int deconvolutional_out_height(layer l) -{ - return (l.h) * l.stride + l.size/2 - l.pad; -} -int deconvolutional_out_width(layer l) -{ - return (l.w) * l.stride + l.size/2 - l.pad; -} - -layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation, int batch_normalize) +layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam) { int i; layer l = {0}; @@ -38,26 +29,29 @@ layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size l.stride = stride; l.size = size; + l.nweights = c*n*size*size; + l.nbiases = n; + l.weights = calloc(c*n*size*size, sizeof(float)); l.weight_updates = calloc(c*n*size*size, sizeof(float)); l.biases = calloc(n, sizeof(float)); l.bias_updates = calloc(n, sizeof(float)); - float scale = 1./sqrt(size*size*c); + float scale = .02; for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_normal(); for(i = 0; i < n; ++i){ - l.biases[i] = scale; + l.biases[i] = 0; } - l.pad = l.size/2; + l.pad = padding; - l.out_h = (l.h) * l.stride + l.size/2 - l.pad; - l.out_w = (l.w) * l.stride + l.size/2 - l.pad; + l.out_h = (l.h - 1) * l.stride + l.size - 2*l.pad; + l.out_w = (l.w - 1) * l.stride + l.size - 2*l.pad; l.out_c = n; l.outputs = l.out_w * l.out_h * l.out_c; l.inputs = l.w * l.h * l.c; - l.output = calloc(l.batch*l.out_h * l.out_w * n, sizeof(float)); - l.delta = calloc(l.batch*l.out_h * l.out_w * n, sizeof(float)); + l.output = calloc(l.batch*l.outputs, sizeof(float)); + l.delta = calloc(l.batch*l.outputs, sizeof(float)); l.forward = forward_deconvolutional_layer; l.backward = backward_deconvolutional_layer; @@ -83,6 +77,15 @@ layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size l.x = calloc(l.batch*l.outputs, sizeof(float)); l.x_norm = calloc(l.batch*l.outputs, sizeof(float)); } + if(adam){ + l.adam = 1; + l.m = 
calloc(c*n*size*size, sizeof(float)); + l.v = calloc(c*n*size*size, sizeof(float)); + l.bias_m = calloc(n, sizeof(float)); + l.scale_m = calloc(n, sizeof(float)); + l.bias_v = calloc(n, sizeof(float)); + l.scale_v = calloc(n, sizeof(float)); + } #ifdef GPU l.forward_gpu = forward_deconvolutional_layer_gpu; @@ -91,6 +94,14 @@ layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size if(gpu_index >= 0){ + if (adam) { + l.m_gpu = cuda_make_array(l.m, c*n*size*size); + l.v_gpu = cuda_make_array(l.v, c*n*size*size); + l.bias_m_gpu = cuda_make_array(l.bias_m, n); + l.bias_v_gpu = cuda_make_array(l.bias_v, n); + l.scale_m_gpu = cuda_make_array(l.scale_m, n); + l.scale_v_gpu = cuda_make_array(l.scale_v, n); + } l.weights_gpu = cuda_make_array(l.weights, c*n*size*size); l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size); @@ -137,8 +148,8 @@ void resize_deconvolutional_layer(layer *l, int h, int w) { l->h = h; l->w = w; - l->out_h = (l->h) * l->stride + l->size/2 - l->pad; - l->out_w = (l->w) * l->stride + l->size/2 - l->pad; + l->out_h = (l->h - 1) * l->stride + l->size - 2*l->pad; + l->out_w = (l->w - 1) * l->stride + l->size - 2*l->pad; l->outputs = l->out_h * l->out_w * l->out_c; l->inputs = l->w * l->h * l->c; @@ -172,12 +183,9 @@ void resize_deconvolutional_layer(layer *l, int h, int w) l->workspace_size = get_workspace_size(*l); } -void forward_deconvolutional_layer(const layer l, network_state state) +void forward_deconvolutional_layer(const layer l, network net) { int i; - int out_h = l.out_h; - int out_w = l.out_w; - int size = out_h*out_w; int m = l.size*l.size*l.n; int n = l.h*l.w; @@ -187,60 +195,58 @@ void forward_deconvolutional_layer(const layer l, network_state state) for(i = 0; i < l.batch; ++i){ float *a = l.weights; - float *b = state.input + i*l.c*l.h*l.w; - float *c = state.workspace; + float *b = net.input + i*l.c*l.h*l.w; + float *c = net.workspace; - gemm(1,0,m,n,k,1,a,m,b,n,0,c,n); + gemm_cpu(1,0,m,n,k,1,a,m,b,n,0,c,n); - col2im_cpu(c, l.n, out_h, out_w, l.size, l.stride, 0, l.output+i*l.n*size); + col2im_cpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output+i*l.outputs); } - - if(l.batch_normalize){ - forward_batchnorm_layer(l, state); + if (l.batch_normalize) { + forward_batchnorm_layer(l, net); } else { - add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w); + add_bias(l.output, l.biases, l.batch, l.n, l.out_w*l.out_h); } - activate_array(l.output, l.batch*l.n*size, l.activation); + activate_array(l.output, l.batch*l.n*l.out_w*l.out_h, l.activation); } -void backward_deconvolutional_layer(layer l, network_state state) +void backward_deconvolutional_layer(layer l, network net) { - float alpha = 1./l.batch; - int out_h = deconvolutional_out_height(l); - int out_w = deconvolutional_out_width(l); - int size = out_h*out_w; int i; - gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta); + gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); + if(l.batch_normalize){ - backward_batchnorm_layer(l, state); + backward_batchnorm_layer(l, net); } else { backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h); } + //if(net.delta) memset(net.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float)); + for(i = 0; i < l.batch; ++i){ int m = l.c; int n = l.size*l.size*l.n; int k = l.h*l.w; - float *a = state.input + i*m*n; - float *b = state.workspace; + float *a = net.input + i*m*k; + float *b = net.workspace; float *c = l.weight_updates; - im2col_cpu(l.delta + i*l.n*size, l.n, out_h, 
out_w, - l.size, l.stride, 0, b); - gemm(0,1,m,n,k,alpha,a,k,b,k,1,c,n); + im2col_cpu(l.delta + i*l.outputs, l.out_c, l.out_h, l.out_w, + l.size, l.stride, l.pad, b); + gemm_cpu(0,1,m,n,k,1,a,k,b,k,1,c,n); - if(state.delta){ + if(net.delta){ int m = l.c; int n = l.h*l.w; int k = l.size*l.size*l.n; float *a = l.weights; - float *b = state.workspace; - float *c = state.delta + i*n*m; + float *b = net.workspace; + float *c = net.delta + i*n*m; - gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); + gemm_cpu(0,0,m,n,k,1,a,k,b,n,1,c,n); } } } diff --git a/src/deconvolutional_layer.h b/src/deconvolutional_layer.h index 6a57513e..42ccbc14 100644 --- a/src/deconvolutional_layer.h +++ b/src/deconvolutional_layer.h @@ -8,18 +8,18 @@ #include "network.h" #ifdef GPU -void forward_deconvolutional_layer_gpu(layer l, network_state state); -void backward_deconvolutional_layer_gpu(layer l, network_state state); +void forward_deconvolutional_layer_gpu(layer l, network net); +void backward_deconvolutional_layer_gpu(layer l, network net); void update_deconvolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); void push_deconvolutional_layer(layer l); void pull_deconvolutional_layer(layer l); #endif -layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation, int batch_normalize); +layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam); void resize_deconvolutional_layer(layer *l, int h, int w); -void forward_deconvolutional_layer(const layer l, network_state state); +void forward_deconvolutional_layer(const layer l, network net); void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay); -void backward_deconvolutional_layer(layer l, network_state state); +void backward_deconvolutional_layer(layer l, network net); #endif diff --git a/src/demo.c b/src/demo.c index 9a37b4a6..24c02e0e 100644 --- a/src/demo.c +++ b/src/demo.c @@ -41,7 +41,7 @@ void *fetch_in_thread(void *ptr) if(!in.data){ error("Stream closed."); } - in_s = resize_image(in, net.w, net.h); + in_s = letterbox_image(in, net.w, net.h); return 0; } @@ -61,7 +61,7 @@ void *detect_in_thread(void *ptr) if(l.type == DETECTION){ get_detection_boxes(l, 1, 1, demo_thresh, probs, boxes, 0); } else if (l.type == REGION){ - get_region_boxes(l, 1, 1, demo_thresh, probs, boxes, 0, 0, demo_hier_thresh); + get_region_boxes(l, in.w, in.h, demo_thresh, probs, boxes, 0, 0, demo_hier_thresh, 1); } else { error("Last layer must produce detections\n"); } diff --git a/src/detection_layer.c b/src/detection_layer.c index ff0f4c2b..f9b4e4e7 100644 --- a/src/detection_layer.c +++ b/src/detection_layer.c @@ -46,11 +46,11 @@ detection_layer make_detection_layer(int batch, int inputs, int n, int side, int return l; } -void forward_detection_layer(const detection_layer l, network_state state) +void forward_detection_layer(const detection_layer l, network net) { int locations = l.side*l.side; int i,j; - memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float)); + memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float)); //if(l.reorg) reorg(l.output, l.w*l.h, size*l.n, l.batch, 1); int b; if (l.softmax){ @@ -63,7 +63,7 @@ void forward_detection_layer(const detection_layer l, network_state state) } } } - if(state.train){ + if(net.train){ float avg_iou = 0; float avg_cat = 0; float avg_allcat = 0; @@ -77,7 +77,7 @@ void forward_detection_layer(const 
detection_layer l, network_state state) int index = b*l.inputs; for (i = 0; i < locations; ++i) { int truth_index = (b*locations + i)*(1+l.coords+l.classes); - int is_obj = state.truth[truth_index]; + int is_obj = net.truth[truth_index]; for (j = 0; j < l.n; ++j) { int p_index = index + locations*l.classes + i*l.n + j; l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]); @@ -95,13 +95,13 @@ void forward_detection_layer(const detection_layer l, network_state state) int class_index = index + i*l.classes; for(j = 0; j < l.classes; ++j) { - l.delta[class_index+j] = l.class_scale * (state.truth[truth_index+1+j] - l.output[class_index+j]); - *(l.cost) += l.class_scale * pow(state.truth[truth_index+1+j] - l.output[class_index+j], 2); - if(state.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j]; + l.delta[class_index+j] = l.class_scale * (net.truth[truth_index+1+j] - l.output[class_index+j]); + *(l.cost) += l.class_scale * pow(net.truth[truth_index+1+j] - l.output[class_index+j], 2); + if(net.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j]; avg_allcat += l.output[class_index+j]; } - box truth = float_to_box(state.truth + truth_index + 1 + l.classes, 1); + box truth = float_to_box(net.truth + truth_index + 1 + l.classes, 1); truth.x /= l.side; truth.y /= l.side; @@ -139,7 +139,7 @@ void forward_detection_layer(const detection_layer l, network_state state) best_index = 0; } } - if(l.random && *(state.net.seen) < 64000){ + if(l.random && *(net.seen) < 64000){ best_index = rand()%l.n; } @@ -166,13 +166,13 @@ void forward_detection_layer(const detection_layer l, network_state state) l.delta[p_index] = l.object_scale * (iou - l.output[p_index]); } - l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]); - l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]); - l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]); - l.delta[box_index+3] = l.coord_scale*(state.truth[tbox_index + 3] - l.output[box_index + 3]); + l.delta[box_index+0] = l.coord_scale*(net.truth[tbox_index + 0] - l.output[box_index + 0]); + l.delta[box_index+1] = l.coord_scale*(net.truth[tbox_index + 1] - l.output[box_index + 1]); + l.delta[box_index+2] = l.coord_scale*(net.truth[tbox_index + 2] - l.output[box_index + 2]); + l.delta[box_index+3] = l.coord_scale*(net.truth[tbox_index + 3] - l.output[box_index + 3]); if(l.sqrt){ - l.delta[box_index+2] = l.coord_scale*(sqrt(state.truth[tbox_index + 2]) - l.output[box_index + 2]); - l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]); + l.delta[box_index+2] = l.coord_scale*(sqrt(net.truth[tbox_index + 2]) - l.output[box_index + 2]); + l.delta[box_index+3] = l.coord_scale*(sqrt(net.truth[tbox_index + 3]) - l.output[box_index + 3]); } *(l.cost) += pow(1-iou, 2); @@ -216,9 +216,9 @@ void forward_detection_layer(const detection_layer l, network_state state) } } -void backward_detection_layer(const detection_layer l, network_state state) +void backward_detection_layer(const detection_layer l, network net) { - axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1); + axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1); } void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness) @@ -252,36 +252,25 @@ void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box #ifdef GPU -void forward_detection_layer_gpu(const detection_layer 
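/* The delta/cost updates above follow darknet's squared-error convention:
   for target t and prediction p, a layer stores the negative gradient,
   delta = scale*(t - p), while the scalar cost accumulates scale*(t - p)^2.
   A minimal sketch of that convention as a helper (hypothetical, not part of
   the patch):

       static void sse_delta(float *delta, float *cost, float scale, float t, float p)
       {
           *delta  = scale*(t - p);           // negative gradient of 0.5*(t - p)^2
           *cost  += scale*(t - p)*(t - p);   // contribution to the reported loss
       }
*/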
l, network_state state) +void forward_detection_layer_gpu(const detection_layer l, network net) { - if(!state.train){ - copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1); + if(!net.train){ + copy_ongpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1); return; } float *in_cpu = calloc(l.batch*l.inputs, sizeof(float)); float *truth_cpu = 0; - if(state.truth){ - int num_truth = l.batch*l.side*l.side*(1+l.coords+l.classes); - truth_cpu = calloc(num_truth, sizeof(float)); - cuda_pull_array(state.truth, truth_cpu, num_truth); - } - cuda_pull_array(state.input, in_cpu, l.batch*l.inputs); - network_state cpu_state = state; - cpu_state.train = state.train; - cpu_state.truth = truth_cpu; - cpu_state.input = in_cpu; - forward_detection_layer(l, cpu_state); + + forward_detection_layer(l, net); cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs); cuda_push_array(l.delta_gpu, l.delta, l.batch*l.inputs); - free(cpu_state.input); - if(cpu_state.truth) free(cpu_state.truth); } -void backward_detection_layer_gpu(detection_layer l, network_state state) +void backward_detection_layer_gpu(detection_layer l, network net) { - axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1); - //copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, state.delta, 1); + axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); + //copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/detection_layer.h b/src/detection_layer.h index e847a094..fecfed04 100644 --- a/src/detection_layer.h +++ b/src/detection_layer.h @@ -7,13 +7,13 @@ typedef layer detection_layer; detection_layer make_detection_layer(int batch, int inputs, int n, int size, int classes, int coords, int rescore); -void forward_detection_layer(const detection_layer l, network_state state); -void backward_detection_layer(const detection_layer l, network_state state); +void forward_detection_layer(const detection_layer l, network net); +void backward_detection_layer(const detection_layer l, network net); void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness); #ifdef GPU -void forward_detection_layer_gpu(const detection_layer l, network_state state); -void backward_detection_layer_gpu(detection_layer l, network_state state); +void forward_detection_layer_gpu(const detection_layer l, network net); +void backward_detection_layer_gpu(detection_layer l, network net); #endif #endif diff --git a/src/detector.c b/src/detector.c index d5c3cfcc..8a429ed8 100644 --- a/src/detector.c +++ b/src/detector.c @@ -346,7 +346,7 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char network_predict(net, input.data); int w = val[t].w; int h = val[t].h; - get_region_boxes(l, w, h, thresh, probs, boxes, 0, map, .5); + get_region_boxes(l, w, h, thresh, probs, boxes, 0, map, .5, 0); if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, classes, nms); if (coco){ print_cocos(fp, path, boxes, probs, l.w*l.h*l.n, classes, w, h); @@ -477,7 +477,7 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out network_predict(net, X); int w = val[t].w; int h = val[t].h; - get_region_boxes(l, w, h, thresh, probs, boxes, 0, map, .5); + get_region_boxes(l, w, h, thresh, probs, boxes, 0, map, .5, 0); if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, classes, nms); if (coco){ print_cocos(fp, path, boxes, probs, l.w*l.h*l.n, classes, w, h); @@ -541,7 +541,7 @@ void validate_detector_recall(char *cfgfile, char *weightfile) image sized = 
resize_image(orig, net.w, net.h); char *id = basecfg(path); network_predict(net, sized.data); - get_region_boxes(l, 1, 1, thresh, probs, boxes, 1, 0, .5); + get_region_boxes(l, 1, 1, thresh, probs, boxes, 1, 0, .5, 0); if (nms) do_nms(boxes, probs, l.w*l.h*l.n, 1, nms); char labelpath[4096]; @@ -580,7 +580,7 @@ void validate_detector_recall(char *cfgfile, char *weightfile) } } -void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh) +void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, char *outfile) { list *options = read_data_cfg(datacfg); char *name_list = option_find_str(options, "names", "data/names.list"); @@ -624,21 +624,26 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam time=clock(); network_predict(net, X); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); - get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0, hier_thresh); + get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0, hier_thresh, 0); if (l.softmax_tree && nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms); else if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms); draw_detections(sized, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes); - save_image(sized, "predictions"); - show_image(sized, "predictions"); + if(outfile){ + save_image(sized, outfile); + } + else{ + save_image(sized, "predictions"); + show_image(sized, "predictions"); +#ifdef OPENCV + cvWaitKey(0); + cvDestroyAllWindows(); +#endif + } free_image(im); free_image(sized); free(boxes); free_ptrs((void **)probs, l.w*l.h*l.n); -#ifdef OPENCV - cvWaitKey(0); - cvDestroyAllWindows(); -#endif if (filename) break; } } @@ -684,7 +689,7 @@ void run_detector(int argc, char **argv) char *cfg = argv[4]; char *weights = (argc > 5) ? argv[5] : 0; char *filename = (argc > 6) ? 
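/* A note on the new trailing argument to get_region_boxes() in this file:
   demo.c passes 1 while every validation/test path here passes 0. Judging
   from the call sites this appears to be a relative-coordinates flag, used
   when the frame was letterboxed rather than resized; that reading is an
   inference from the patch, not documented in it. The two idioms side by
   side:

       get_region_boxes(l, in.w, in.h, demo_thresh, probs, boxes, 0, 0, demo_hier_thresh, 1); // demo
       get_region_boxes(l, w, h, thresh, probs, boxes, 0, map, .5, 0);                        // valid
*/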
argv[6]: 0; - if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh); + if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, outfile); else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear); else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile); else if(0==strcmp(argv[2], "valid2")) validate_detector_flip(datacfg, cfg, weights, outfile); diff --git a/src/dropout_layer.c b/src/dropout_layer.c index b1381e63..780554fb 100644 --- a/src/dropout_layer.c +++ b/src/dropout_layer.c @@ -35,26 +35,26 @@ void resize_dropout_layer(dropout_layer *l, int inputs) #endif } -void forward_dropout_layer(dropout_layer l, network_state state) +void forward_dropout_layer(dropout_layer l, network net) { int i; - if (!state.train) return; + if (!net.train) return; for(i = 0; i < l.batch * l.inputs; ++i){ float r = rand_uniform(0, 1); l.rand[i] = r; - if(r < l.probability) state.input[i] = 0; - else state.input[i] *= l.scale; + if(r < l.probability) net.input[i] = 0; + else net.input[i] *= l.scale; } } -void backward_dropout_layer(dropout_layer l, network_state state) +void backward_dropout_layer(dropout_layer l, network net) { int i; - if(!state.delta) return; + if(!net.delta) return; for(i = 0; i < l.batch * l.inputs; ++i){ float r = l.rand[i]; - if(r < l.probability) state.delta[i] = 0; - else state.delta[i] *= l.scale; + if(r < l.probability) net.delta[i] = 0; + else net.delta[i] *= l.scale; } } diff --git a/src/dropout_layer.h b/src/dropout_layer.h index 691cfc5b..01f94d4d 100644 --- a/src/dropout_layer.h +++ b/src/dropout_layer.h @@ -8,13 +8,13 @@ typedef layer dropout_layer; dropout_layer make_dropout_layer(int batch, int inputs, float probability); -void forward_dropout_layer(dropout_layer l, network_state state); -void backward_dropout_layer(dropout_layer l, network_state state); +void forward_dropout_layer(dropout_layer l, network net); +void backward_dropout_layer(dropout_layer l, network net); void resize_dropout_layer(dropout_layer *l, int inputs); #ifdef GPU -void forward_dropout_layer_gpu(dropout_layer l, network_state state); -void backward_dropout_layer_gpu(dropout_layer l, network_state state); +void forward_dropout_layer_gpu(dropout_layer l, network net); +void backward_dropout_layer_gpu(dropout_layer l, network net); #endif #endif diff --git a/src/dropout_layer_kernels.cu b/src/dropout_layer_kernels.cu index 7e51bd55..bd12b678 100644 --- a/src/dropout_layer_kernels.cu +++ b/src/dropout_layer_kernels.cu @@ -14,9 +14,9 @@ __global__ void yoloswag420blazeit360noscope(float *input, int size, float *rand if(id < size) input[id] = (rand[id] < prob) ? 
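/* Context for layer.scale in the expression being edited here: darknet
   constructs dropout with scale = 1./(1. - probability) ("inverted" dropout),
   so surviving activations are boosted during training and no rescaling is
   needed at test time. With drop probability p:

       E[out] = (1 - p) * x * 1/(1 - p) + p * 0 = x   // expectation preserved
*/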
0 : input[id]*scale; } -void forward_dropout_layer_gpu(dropout_layer layer, network_state state) +void forward_dropout_layer_gpu(dropout_layer layer, network net) { - if (!state.train) return; + if (!net.train) return; int size = layer.inputs*layer.batch; cuda_random(layer.rand_gpu, size); /* @@ -27,15 +27,15 @@ void forward_dropout_layer_gpu(dropout_layer layer, network_state state) cuda_push_array(layer.rand_gpu, layer.rand, size); */ - yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(state.input, size, layer.rand_gpu, layer.probability, layer.scale); + yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(net.input_gpu, size, layer.rand_gpu, layer.probability, layer.scale); check_error(cudaPeekAtLastError()); } -void backward_dropout_layer_gpu(dropout_layer layer, network_state state) +void backward_dropout_layer_gpu(dropout_layer layer, network net) { - if(!state.delta) return; + if(!net.delta_gpu) return; int size = layer.inputs*layer.batch; - yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(state.delta, size, layer.rand_gpu, layer.probability, layer.scale); + yoloswag420blazeit360noscope<<<cuda_gridsize(size), BLOCK>>>(net.delta_gpu, size, layer.rand_gpu, layer.probability, layer.scale); check_error(cudaPeekAtLastError()); } diff --git a/src/gru_layer.c b/src/gru_layer.c index 27fc3c31..7139f798 100644 --- a/src/gru_layer.c +++ b/src/gru_layer.c @@ -124,10 +124,10 @@ void update_gru_layer(layer l, int batch, float learning_rate, float momentum, f update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay); } -void forward_gru_layer(layer l, network_state state) +void forward_gru_layer(layer l, network net) { - network_state s = {0}; - s.train = state.train; + network s = net; + s.train = net.train; int i; layer input_z_layer = *(l.input_z_layer); layer input_r_layer = *(l.input_r_layer); @@ -144,7 +144,7 @@ void forward_gru_layer(layer l, network_state state) fill_cpu(l.outputs * l.batch * l.steps, 0, state_z_layer.delta, 1); fill_cpu(l.outputs * l.batch * l.steps, 0, state_r_layer.delta, 1); fill_cpu(l.outputs * l.batch * l.steps, 0, state_h_layer.delta, 1); - if(state.train) { + if(net.train) { fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1); copy_cpu(l.outputs*l.batch, l.state, 1, l.prev_state, 1); } @@ -154,7 +154,7 @@ void forward_gru_layer(layer l, network_state state) forward_connected_layer(state_z_layer, s); forward_connected_layer(state_r_layer, s); - s.input = state.input; + s.input = net.input; forward_connected_layer(input_z_layer, s); forward_connected_layer(input_r_layer, s); forward_connected_layer(input_h_layer, s); @@ -188,7 +188,7 @@ void forward_gru_layer(layer l, network_state state) copy_cpu(l.outputs*l.batch, l.output, 1, l.state, 1); - state.input += l.inputs*l.batch; + net.input += l.inputs*l.batch; l.output += l.outputs*l.batch; increment_layer(&input_z_layer, 1); increment_layer(&input_r_layer, 1); @@ -200,7 +200,7 @@ void forward_gru_layer(layer l, network_state state) } } -void backward_gru_layer(layer l, network_state state) +void backward_gru_layer(layer l, network net) { } @@ -224,10 +224,10 @@ void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentu update_connected_layer_gpu(*(l.state_h_layer), batch, learning_rate, momentum, decay); } -void forward_gru_layer_gpu(layer l, network_state state) +void forward_gru_layer_gpu(layer l, network net) { - network_state s = {0}; - s.train = state.train; + network s = net; + s.train = net.train; int i; layer input_z_layer = *(l.input_z_layer); layer input_r_layer = *(l.input_r_layer); @@ -244,17 +244,17 @@ void forward_gru_layer_gpu(layer
l, network_state state) fill_ongpu(l.outputs * l.batch * l.steps, 0, state_z_layer.delta_gpu, 1); fill_ongpu(l.outputs * l.batch * l.steps, 0, state_r_layer.delta_gpu, 1); fill_ongpu(l.outputs * l.batch * l.steps, 0, state_h_layer.delta_gpu, 1); - if(state.train) { + if(net.train) { fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1); copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1); } for (i = 0; i < l.steps; ++i) { - s.input = l.state_gpu; + s.input_gpu = l.state_gpu; forward_connected_layer_gpu(state_z_layer, s); forward_connected_layer_gpu(state_r_layer, s); - s.input = state.input; + s.input_gpu = net.input_gpu; forward_connected_layer_gpu(input_z_layer, s); forward_connected_layer_gpu(input_r_layer, s); forward_connected_layer_gpu(input_h_layer, s); @@ -272,7 +272,7 @@ void forward_gru_layer_gpu(layer l, network_state state) copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1); mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1); - s.input = l.forgot_state_gpu; + s.input_gpu = l.forgot_state_gpu; forward_connected_layer_gpu(state_h_layer, s); copy_ongpu(l.outputs*l.batch, input_h_layer.output_gpu, 1, l.h_gpu, 1); @@ -288,7 +288,7 @@ void forward_gru_layer_gpu(layer l, network_state state) copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1); - state.input += l.inputs*l.batch; + net.input_gpu += l.inputs*l.batch; l.output_gpu += l.outputs*l.batch; increment_layer(&input_z_layer, 1); increment_layer(&input_r_layer, 1); @@ -300,10 +300,10 @@ void forward_gru_layer_gpu(layer l, network_state state) } } -void backward_gru_layer_gpu(layer l, network_state state) +void backward_gru_layer_gpu(layer l, network net) { - network_state s = {0}; - s.train = state.train; + network s = net; + s.train = net.train; int i; layer input_z_layer = *(l.input_z_layer); layer input_r_layer = *(l.input_r_layer); @@ -321,8 +321,8 @@ void backward_gru_layer_gpu(layer l, network_state state) increment_layer(&state_r_layer, l.steps - 1); increment_layer(&state_h_layer, l.steps - 1); - state.input += l.inputs*l.batch*(l.steps-1); - if(state.delta) state.delta += l.inputs*l.batch*(l.steps-1); + net.input_gpu += l.inputs*l.batch*(l.steps-1); + if(net.delta_gpu) net.delta_gpu += l.inputs*l.batch*(l.steps-1); l.output_gpu += l.outputs*l.batch*(l.steps-1); l.delta_gpu += l.outputs*l.batch*(l.steps-1); for (i = l.steps-1; i >= 0; --i) { @@ -361,8 +361,8 @@ void backward_gru_layer_gpu(layer l, network_state state) mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1); fill_ongpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1); - s.input = l.forgot_state_gpu; - s.delta = l.forgot_delta_gpu; + s.input_gpu = l.forgot_state_gpu; + s.delta_gpu = l.forgot_delta_gpu; backward_connected_layer_gpu(state_h_layer, s); if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu); @@ -374,22 +374,22 @@ void backward_gru_layer_gpu(layer l, network_state state) gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, input_z_layer.delta_gpu); copy_ongpu(l.outputs*l.batch, input_z_layer.delta_gpu, 1, state_z_layer.delta_gpu, 1); - s.input = l.prev_state_gpu; - s.delta = prev_delta_gpu; + s.input_gpu = l.prev_state_gpu; + s.delta_gpu = prev_delta_gpu; backward_connected_layer_gpu(state_r_layer, s); backward_connected_layer_gpu(state_z_layer, s); - s.input = state.input; - s.delta = state.delta; + s.input_gpu = net.input_gpu; + s.delta_gpu = net.delta_gpu; backward_connected_layer_gpu(input_h_layer, s); 
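/* The refactoring idiom used throughout this file: instead of zero-
   initializing a fresh network_state for the inner connected layers, the new
   code takes a by-value copy, network s = net, and then only repoints
   s.input_gpu / s.delta_gpu per sub-layer and per time step. Since the struct
   is copied, bumping the copy's pointers can never disturb the caller's net:

       network s = net;                  // shallow copy, shares buffers
       s.input_gpu = l.state_gpu;        // sub-layer reads the recurrent state
       forward_connected_layer_gpu(state_z_layer, s);   // net itself untouched
*/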
backward_connected_layer_gpu(input_r_layer, s); backward_connected_layer_gpu(input_z_layer, s); - state.input -= l.inputs*l.batch; - if(state.delta) state.delta -= l.inputs*l.batch; + net.input_gpu -= l.inputs*l.batch; + if(net.delta_gpu) net.delta_gpu -= l.inputs*l.batch; l.output_gpu -= l.outputs*l.batch; l.delta_gpu -= l.outputs*l.batch; increment_layer(&input_z_layer, -1); diff --git a/src/gru_layer.h b/src/gru_layer.h index 9e19cee1..9dc456e0 100644 --- a/src/gru_layer.h +++ b/src/gru_layer.h @@ -8,13 +8,13 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize); -void forward_gru_layer(layer l, network_state state); -void backward_gru_layer(layer l, network_state state); +void forward_gru_layer(layer l, network net); +void backward_gru_layer(layer l, network net); void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay); #ifdef GPU -void forward_gru_layer_gpu(layer l, network_state state); -void backward_gru_layer_gpu(layer l, network_state state); +void forward_gru_layer_gpu(layer l, network net); +void backward_gru_layer_gpu(layer l, network net); void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); void push_gru_layer(layer l); void pull_gru_layer(layer l); diff --git a/src/image.c b/src/image.c index 133bf1fc..8b2fae90 100644 --- a/src/image.c +++ b/src/image.c @@ -288,6 +288,25 @@ image image_distance(image a, image b) return dist; } +void ghost_image(image source, image dest, int dx, int dy) +{ + int x,y,k; + float max_dist = sqrt((-source.w/2. + .5)*(-source.w/2. + .5)); + for(k = 0; k < source.c; ++k){ + for(y = 0; y < source.h; ++y){ + for(x = 0; x < source.w; ++x){ + float dist = sqrt((x - source.w/2. + .5)*(x - source.w/2. + .5) + (y - source.h/2. + .5)*(y - source.h/2. 
+ .5)); + float alpha = (1 - dist/max_dist); + if(alpha < 0) alpha = 0; + float v1 = get_pixel(source, x,y,k); + float v2 = get_pixel(dest, dx+x,dy+y,k); + float val = alpha*v1 + (1-alpha)*v2; + set_pixel(dest, dx+x, dy+y, k, val); + } + } + } +} + void embed_image(image source, image dest, int dx, int dy) { int x,y,k; diff --git a/src/image.h b/src/image.h index 12f12edb..3109094e 100644 --- a/src/image.h +++ b/src/image.h @@ -57,6 +57,7 @@ void normalize_image(image p); image rotate_image(image m, float rad); void rotate_image_cw(image im, int times); void embed_image(image source, image dest, int dx, int dy); +void ghost_image(image source, image dest, int dx, int dy); void place_image(image im, int w, int h, int dx, int dy, image canvas); void saturate_image(image im, float sat); void exposure_image(image im, float sat); diff --git a/src/layer.h b/src/layer.h index f9ac7247..2470d84c 100644 --- a/src/layer.h +++ b/src/layer.h @@ -5,7 +5,8 @@ #include "stddef.h" #include "tree.h" -struct network_state; +struct network; +typedef struct network network; struct layer; typedef struct layer layer; @@ -45,11 +46,11 @@ struct layer{ LAYER_TYPE type; ACTIVATION activation; COST_TYPE cost_type; - void (*forward) (struct layer, struct network_state); - void (*backward) (struct layer, struct network_state); + void (*forward) (struct layer, struct network); + void (*backward) (struct layer, struct network); void (*update) (struct layer, int, float, float, float); - void (*forward_gpu) (struct layer, struct network_state); - void (*backward_gpu) (struct layer, struct network_state); + void (*forward_gpu) (struct layer, struct network); + void (*backward_gpu) (struct layer, struct network); void (*update_gpu) (struct layer, int, float, float, float); int batch_normalize; int shortcut; @@ -58,6 +59,8 @@ struct layer{ int flipped; int inputs; int outputs; + int nweights; + int nbiases; int extra; int truths; int h,w,c; @@ -176,6 +179,11 @@ struct layer{ float * m; float * v; + + float * bias_m; + float * bias_v; + float * scale_m; + float * scale_v; float * z_cpu; float * r_cpu; @@ -216,6 +224,10 @@ struct layer{ float *m_gpu; float *v_gpu; + float *bias_m_gpu; + float *scale_m_gpu; + float *bias_v_gpu; + float *scale_v_gpu; float * prev_state_gpu; float * forgot_state_gpu; diff --git a/src/local_layer.c b/src/local_layer.c index 9f8a7ec7..aad036e0 100644 --- a/src/local_layer.c +++ b/src/local_layer.c @@ -88,7 +88,7 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in return l; } -void forward_local_layer(const local_layer l, network_state state) +void forward_local_layer(const local_layer l, network net) { int out_h = local_out_height(l); int out_w = local_out_width(l); @@ -100,13 +100,13 @@ void forward_local_layer(const local_layer l, network_state state) } for(i = 0; i < l.batch; ++i){ - float *input = state.input + i*l.w*l.h*l.c; + float *input = net.input + i*l.w*l.h*l.c; im2col_cpu(input, l.c, l.h, l.w, - l.size, l.stride, l.pad, state.workspace); + l.size, l.stride, l.pad, net.workspace); float *output = l.output + i*l.outputs; for(j = 0; j < locations; ++j){ float *a = l.weights + j*l.size*l.size*l.c*l.n; - float *b = state.workspace + j; + float *b = net.workspace + j; float *c = output + j; int m = l.n; @@ -119,7 +119,7 @@ void forward_local_layer(const local_layer l, network_state state) activate_array(l.output, l.outputs*l.batch, l.activation); } -void backward_local_layer(local_layer l, network_state state) +void backward_local_layer(local_layer l, network 
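/* The new bias_m/bias_v and scale_m/scale_v fields (and their _gpu twins)
   mirror the existing m/v pair: Adam needs one first-moment and one
   second-moment accumulator per parameter group, and this patch extends that
   to biases and batch-norm scales. As a reminder, the textbook update each
   pair supports is (standard Adam with bias correction omitted for brevity,
   not code from this patch):

       m = beta1*m + (1 - beta1)*g;          // running mean of gradient g
       v = beta2*v + (1 - beta2)*g*g;        // running mean of squared gradient
       w = w - rate * m / (sqrtf(v) + eps);  // per-parameter step size
*/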
net) { int i, j; int locations = l.out_w*l.out_h; @@ -131,13 +131,13 @@ void backward_local_layer(local_layer l, network_state state) } for(i = 0; i < l.batch; ++i){ - float *input = state.input + i*l.w*l.h*l.c; + float *input = net.input + i*l.w*l.h*l.c; im2col_cpu(input, l.c, l.h, l.w, - l.size, l.stride, l.pad, state.workspace); + l.size, l.stride, l.pad, net.workspace); for(j = 0; j < locations; ++j){ float *a = l.delta + i*l.outputs + j; - float *b = state.workspace + j; + float *b = net.workspace + j; float *c = l.weight_updates + j*l.size*l.size*l.c*l.n; int m = l.n; int n = l.size*l.size*l.c; @@ -146,11 +146,11 @@ void backward_local_layer(local_layer l, network_state state) gemm(0,1,m,n,k,1,a,locations,b,locations,1,c,n); } - if(state.delta){ + if(net.delta){ for(j = 0; j < locations; ++j){ float *a = l.weights + j*l.size*l.size*l.c*l.n; float *b = l.delta + i*l.outputs + j; - float *c = state.workspace + j; + float *c = net.workspace + j; int m = l.size*l.size*l.c; int n = 1; @@ -159,7 +159,7 @@ void backward_local_layer(local_layer l, network_state state) gemm(1,0,m,n,k,1,a,m,b,locations,0,c,locations); } - col2im_cpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w); + col2im_cpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta+i*l.c*l.h*l.w); } } } @@ -178,7 +178,7 @@ void update_local_layer(local_layer l, int batch, float learning_rate, float mom #ifdef GPU -void forward_local_layer_gpu(const local_layer l, network_state state) +void forward_local_layer_gpu(const local_layer l, network net) { int out_h = local_out_height(l); int out_w = local_out_width(l); @@ -190,13 +190,13 @@ void forward_local_layer_gpu(const local_layer l, network_state state) } for(i = 0; i < l.batch; ++i){ - float *input = state.input + i*l.w*l.h*l.c; + float *input = net.input_gpu + i*l.w*l.h*l.c; im2col_ongpu(input, l.c, l.h, l.w, - l.size, l.stride, l.pad, state.workspace); + l.size, l.stride, l.pad, net.workspace); float *output = l.output_gpu + i*l.outputs; for(j = 0; j < locations; ++j){ float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n; - float *b = state.workspace + j; + float *b = net.workspace + j; float *c = output + j; int m = l.n; @@ -209,7 +209,7 @@ void forward_local_layer_gpu(const local_layer l, network_state state) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); } -void backward_local_layer_gpu(local_layer l, network_state state) +void backward_local_layer_gpu(local_layer l, network net) { int i, j; int locations = l.out_w*l.out_h; @@ -220,13 +220,13 @@ void backward_local_layer_gpu(local_layer l, network_state state) } for(i = 0; i < l.batch; ++i){ - float *input = state.input + i*l.w*l.h*l.c; + float *input = net.input_gpu + i*l.w*l.h*l.c; im2col_ongpu(input, l.c, l.h, l.w, - l.size, l.stride, l.pad, state.workspace); + l.size, l.stride, l.pad, net.workspace); for(j = 0; j < locations; ++j){ float *a = l.delta_gpu + i*l.outputs + j; - float *b = state.workspace + j; + float *b = net.workspace + j; float *c = l.weight_updates_gpu + j*l.size*l.size*l.c*l.n; int m = l.n; int n = l.size*l.size*l.c; @@ -235,11 +235,11 @@ void backward_local_layer_gpu(local_layer l, network_state state) gemm_ongpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n); } - if(state.delta){ + if(net.delta_gpu){ for(j = 0; j < locations; ++j){ float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n; float *b = l.delta_gpu + i*l.outputs + j; - float *c = state.workspace + j; + float *c = net.workspace + j; int m = l.size*l.size*l.c; int n = 1; @@ -248,7 
+248,7 @@ void backward_local_layer_gpu(local_layer l, network_state state) gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations); } - col2im_ongpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w); + col2im_ongpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu+i*l.c*l.h*l.w); } } } diff --git a/src/local_layer.h b/src/local_layer.h index 28915d81..5e292f91 100644 --- a/src/local_layer.h +++ b/src/local_layer.h @@ -10,8 +10,8 @@ typedef layer local_layer; #ifdef GPU -void forward_local_layer_gpu(local_layer layer, network_state state); -void backward_local_layer_gpu(local_layer layer, network_state state); +void forward_local_layer_gpu(local_layer layer, network net); +void backward_local_layer_gpu(local_layer layer, network net); void update_local_layer_gpu(local_layer layer, int batch, float learning_rate, float momentum, float decay); void push_local_layer(local_layer layer); @@ -20,8 +20,8 @@ void pull_local_layer(local_layer layer); local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation); -void forward_local_layer(const local_layer layer, network_state state); -void backward_local_layer(local_layer layer, network_state state); +void forward_local_layer(const local_layer layer, network net); +void backward_local_layer(local_layer layer, network net); void update_local_layer(local_layer layer, int batch, float learning_rate, float momentum, float decay); void bias_output(float *output, float *biases, int batch, int n, int size); diff --git a/src/lsd.c b/src/lsd.c index 6a0e6d21..8801c70f 100644 --- a/src/lsd.c +++ b/src/lsd.c @@ -4,6 +4,7 @@ #include "parser.h" #include "blas.h" +/* void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg, char *aweight, int clear) { #ifdef GPU @@ -58,36 +59,21 @@ void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg float aloss_avg = -1; float floss_avg = -1; - network_state fstate = {0}; - fstate.index = 0; - fstate.net = fnet; - int x_size = get_network_input_size(fnet)*fnet.batch; - int y_size = get_network_output_size(fnet)*fnet.batch; - fstate.input = cuda_make_array(0, x_size); - fstate.truth = cuda_make_array(0, y_size); - fstate.delta = cuda_make_array(0, x_size); - fstate.train = 1; + fnet.train=1; + int x_size = fnet.inputs*fnet.batch; + int y_size = fnet.truths*fnet.batch; float *X = calloc(x_size, sizeof(float)); float *y = calloc(y_size, sizeof(float)); - float *ones = cuda_make_array(0, anet.batch); - fill_ongpu(anet.batch, .9, ones, 1); - network_state astate = {0}; - astate.index = 0; - astate.net = anet; - int ax_size = get_network_input_size(anet)*anet.batch; - int ay_size = get_network_output_size(anet)*anet.batch; - astate.input = 0; - astate.truth = ones; - astate.delta = cuda_make_array(0, ax_size); - astate.train = 1; + int ax_size = anet.inputs*anet.batch; + int ay_size = anet.truths*anet.batch; + fill_ongpu(ay_size, .9, anet.truth_gpu, 1); + anet.delta_gpu = cuda_make_array(0, ax_size); + anet.train = 1; - network_state gstate = {0}; - gstate.index = 0; - gstate.net = gnet; - int gx_size = get_network_input_size(gnet)*gnet.batch; - int gy_size = get_network_output_size(gnet)*gnet.batch; + int gx_size = gnet.inputs*gnet.batch; + int gy_size = gnet.truths*gnet.batch; gstate.input = cuda_make_array(0, gx_size); gstate.truth = 0; gstate.delta = 0; @@ -155,7 +141,7 @@ void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg floss += 
get_network_cost(fnet) /(fnet.subdivisions*fnet.batch); - cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size); + cuda_pull_array(imlayer.output_gpu, imlayer.output, imlayer.outputs*imlayer.batch); for(k = 0; k < gnet.batch; ++k){ int index = j*gnet.batch + k; copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1); @@ -164,11 +150,13 @@ void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg } } +*/ /* image sim = float_to_image(anet.w, anet.h, anet.c, style.X.vals[j]); show_image(sim, "style"); cvWaitKey(0); */ + /* harmless_update_network_gpu(anet); @@ -205,7 +193,9 @@ void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg } #endif } +*/ +/* void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear) { #ifdef GPU @@ -319,7 +309,8 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear get_next_batch(train, net.batch, j*net.batch, pixs, y); get_next_batch(gray, net.batch, j*net.batch, graypixs, y); cuda_push_array(gstate.input, graypixs, x_size); - cuda_push_array(gstate.truth, pixs, x_size); + cuda_push_array(gstate.truth, pixs, y_size); + */ /* image origi = float_to_image(net.w, net.h, 3, pixs); image grayi = float_to_image(net.w, net.h, 3, graypixs); @@ -327,6 +318,7 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear show_image(origi, "orig"); cvWaitKey(0); */ + /* *net.seen += net.batch; forward_network_gpu(net, gstate); @@ -350,7 +342,7 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear gloss += get_network_cost(net) /(net.subdivisions*net.batch); - cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size); + cuda_pull_array(imlayer.output_gpu, imlayer.output, imlayer.outputs*imlayer.batch); for(k = 0; k < net.batch; ++k){ int index = j*net.batch + k; copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1); @@ -393,6 +385,7 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear save_weights(net, buff); #endif } +*/ void test_dcgan(char *cfgfile, char *weightfile) { @@ -428,7 +421,7 @@ void test_dcgan(char *cfgfile, char *weightfile) network_predict(net, X); image out = get_network_image_layer(net, imlayer); //yuv_to_rgb(out); - constrain_image(out); + normalize_image(out); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); show_image(out, "out"); save_image(out, "out"); @@ -440,33 +433,43 @@ void test_dcgan(char *cfgfile, char *weightfile) } } +void dcgan_batch(network gnet, network anet) +{ + //float *input = calloc(x_size, sizeof(float)); +} -void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear) + +void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images) { #ifdef GPU //char *train_images = "/home/pjreddie/data/coco/train1.txt"; //char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; - char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list"; + //char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list"; + //char *train_images = "data/64.txt"; + //char *train_images = "data/alp.txt"; + //char *train_images = "data/cifar.txt"; char *backup_directory = "/home/pjreddie/backup/"; srand(time(0)); char *base = basecfg(cfg); char *abase = basecfg(acfg); printf("%s\n", base); - network net = load_network(cfg, weight, clear); + network gnet = load_network(cfg, weight, clear); 
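/* From here on the patch renames the generator net to gnet, so the
   generator/adversary pair reads as gnet/anet throughout. Note the target
   values used a few lines below: real batches get .95 and generated batches
   .05, i.e. one-sided label smoothing instead of hard 1/0 labels, a common
   trick to keep the discriminator from saturating:

       train.y.vals[j][0] = .95;   // real image, smoothed positive target
       gen.y.vals[j][0]   = .05;   // generated image, smoothed negative target
*/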
network anet = load_network(acfg, aweight, clear); + float orig_rate = anet.learning_rate; + int start = 0; int i, j, k; layer imlayer = {0}; - for (i = 0; i < net.n; ++i) { - if (net.layers[i].out_c == 3) { - imlayer = net.layers[i]; + for (i = 0; i < gnet.n; ++i) { + if (gnet.layers[i].out_c == 3) { + imlayer = gnet.layers[i]; break; } } - printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); - int imgs = net.batch*net.subdivisions; - i = *net.seen/imgs; + printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet.learning_rate, gnet.momentum, gnet.decay); + int imgs = gnet.batch*gnet.subdivisions; + i = *gnet.seen/imgs; data train, buffer; @@ -480,131 +483,144 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear) args.m = plist->size; args.d = &buffer; args.type = CLASSIFICATION_DATA; - args.classes = 2; + args.threads=16; + args.classes = 1; char *ls[2] = {"imagenet", "zzzzzzzz"}; args.labels = ls; pthread_t load_thread = load_data_in_thread(args); clock_t time; - network_state gstate = {0}; - gstate.index = 0; - gstate.net = net; - int x_size = get_network_input_size(net)*net.batch; - int y_size = get_network_output_size(net)*net.batch; - gstate.input = cuda_make_array(0, x_size); - gstate.truth = cuda_make_array(0, y_size); - gstate.train = 1; - float *input = calloc(x_size, sizeof(float)); - float *y = calloc(y_size, sizeof(float)); + gnet.train = 1; + anet.train = 1; + + int x_size = gnet.inputs*gnet.batch; + int y_size = gnet.truths*gnet.batch; float *imerror = cuda_make_array(0, y_size); - network_state astate = {0}; - astate.index = 0; - astate.net = anet; - int ay_size = get_network_output_size(anet)*anet.batch; - astate.input = 0; - astate.truth = 0; - astate.delta = 0; - astate.train = 1; - - float *ones_gpu = cuda_make_array(0, ay_size); - fill_ongpu(ay_size, .1, ones_gpu, 1); - fill_ongpu(ay_size/2, .9, ones_gpu, 2); + int ay_size = anet.truths*anet.batch; float aloss_avg = -1; //data generated = copy_data(train); - while (get_current_batch(net) < net.max_batches) { + while (get_current_batch(gnet) < gnet.max_batches) { + start += 1; i += 1; time=clock(); pthread_join(load_thread, 0); train = buffer; + + //translate_data_rows(train, -.5); + //scale_data_rows(train, 2); + load_thread = load_data_in_thread(args); printf("Loaded: %lf seconds\n", sec(clock()-time)); data gen = copy_data(train); - for(j = 0; j < imgs; ++j){ - train.y.vals[j][0] = .9; - train.y.vals[j][1] = .1; - gen.y.vals[j][0] = .1; - gen.y.vals[j][1] = .9; + for (j = 0; j < imgs; ++j) { + train.y.vals[j][0] = .95; + gen.y.vals[j][0] = .05; } time=clock(); - for(j = 0; j < net.subdivisions; ++j){ - get_next_batch(train, net.batch, j*net.batch, y, 0); + for(j = 0; j < gnet.subdivisions; ++j){ + get_next_batch(train, gnet.batch, j*gnet.batch, gnet.truth, 0); int z; for(z = 0; z < x_size; ++z){ - input[z] = rand_normal(); + gnet.input[z] = rand_normal(); } - cuda_push_array(gstate.input, input, x_size); - cuda_push_array(gstate.truth, y, y_size); - *net.seen += net.batch; - forward_network_gpu(net, gstate); + cuda_push_array(gnet.input_gpu, gnet.input, x_size); + cuda_push_array(gnet.truth_gpu, gnet.truth, y_size); + *gnet.seen += gnet.batch; + forward_network_gpu(gnet); fill_ongpu(imlayer.outputs*imlayer.batch, 0, imerror, 1); - astate.input = imlayer.output_gpu; - astate.delta = imerror; - astate.truth = ones_gpu; - forward_network_gpu(anet, astate); - backward_network_gpu(anet, astate); + fill_ongpu(anet.truths*anet.batch, .95, 
anet.truth_gpu, 1); + copy_ongpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1); + anet.delta_gpu = imerror; + forward_network_gpu(anet); + backward_network_gpu(anet); + + float genaloss = *anet.cost / anet.batch; + printf("%f\n", genaloss); scal_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1); - scal_ongpu(imlayer.outputs*imlayer.batch, .001, net.layers[net.n-1].delta_gpu, 1); + scal_ongpu(imlayer.outputs*imlayer.batch, .00, gnet.layers[gnet.n-1].delta_gpu, 1); printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch)); - printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs*imlayer.batch)); + printf("features %f\n", cuda_mag_array(gnet.layers[gnet.n-1].delta_gpu, imlayer.outputs*imlayer.batch)); - axpy_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net.layers[net.n-1].delta_gpu, 1); + axpy_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet.layers[gnet.n-1].delta_gpu, 1); - backward_network_gpu(net, gstate); + backward_network_gpu(gnet); - cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size); - for(k = 0; k < net.batch; ++k){ - int index = j*net.batch + k; - copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gen.X.vals[index], 1); - gen.y.vals[index][0] = .1; + for(k = 0; k < gnet.batch; ++k){ + int index = j*gnet.batch + k; + copy_cpu(gnet.outputs, gnet.output + k*gnet.outputs, 1, gen.X.vals[index], 1); } } harmless_update_network_gpu(anet); data merge = concat_data(train, gen); - randomize_data(merge); + //randomize_data(merge); float aloss = train_network(anet, merge); - update_network_gpu(net); + //translate_image(im, 1); + //scale_image(im, .5); + //translate_image(im2, 1); + //scale_image(im2, .5); + #ifdef OPENCV + if(display){ + image im = float_to_image(anet.w, anet.h, anet.c, gen.X.vals[0]); + image im2 = float_to_image(anet.w, anet.h, anet.c, train.X.vals[0]); + show_image(im, "gen"); + show_image(im2, "train"); + cvWaitKey(50); + } + #endif + +/* + if(aloss < .1){ + anet.learning_rate = 0; + } else if (aloss > .3){ + anet.learning_rate = orig_rate; + } + */ + + update_network_gpu(gnet); + free_data(merge); free_data(train); free_data(gen); if (aloss_avg < 0) aloss_avg = aloss; aloss_avg = aloss_avg*.9 + aloss*.1; - printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs); + printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs); if(i%1000==0){ char buff[256]; sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); - save_weights(net, buff); + save_weights(gnet, buff); sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i); save_weights(anet, buff); } if(i%100==0){ char buff[256]; sprintf(buff, "%s/%s.backup", backup_directory, base); - save_weights(net, buff); + save_weights(gnet, buff); sprintf(buff, "%s/%s.backup", backup_directory, abase); save_weights(anet, buff); } } char buff[256]; sprintf(buff, "%s/%s_final.weights", backup_directory, base); - save_weights(net, buff); + save_weights(gnet, buff); #endif } +/* void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int clear) { #ifdef GPU @@ -645,8 +661,8 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle args.d = &buffer; args.type = CLASSIFICATION_DATA; - args.classes = 2; - char *ls[2] = {"imagenet", "zzzzzzz"}; + args.classes = 1; + char *ls[2] = {"imagenet"}; args.labels = ls; 
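/* The gradient plumbing in train_dcgan above (echoed in this colorizer) is
   the core GAN trick: the adversary's delta pointer is aimed at a scratch
   buffer, so backpropagating through anet deposits d(adversary loss)/d(image)
   in imerror, which is then added onto the generator's final-layer delta
   (whose own feature loss is scaled to .00, i.e. disabled) before gnet is
   backpropagated. In outline:

       fill_ongpu(n, 0, imerror, 1);               // clear the image gradient
       anet.delta_gpu = imerror;                   // adversary writes grads here
       forward_network_gpu(anet);
       backward_network_gpu(anet);                 // imerror = d(loss)/d(image)
       axpy_ongpu(n, 1, imerror, 1, gnet.layers[gnet.n-1].delta_gpu, 1);
       backward_network_gpu(gnet);                 // generator learns realness
*/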
pthread_t load_thread = load_data_in_thread(args); @@ -676,8 +692,7 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle float *imerror = cuda_make_array(0, imlayer.outputs*imlayer.batch); float *ones_gpu = cuda_make_array(0, ay_size); - fill_ongpu(ay_size, .1, ones_gpu, 1); - fill_ongpu(ay_size/2, .9, ones_gpu, 2); + fill_ongpu(ay_size, 1, ones_gpu, 1); float aloss_avg = -1; float gloss_avg = -1; @@ -697,10 +712,8 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle for(j = 0; j < imgs; ++j){ image gim = float_to_image(net.w, net.h, net.c, gray.X.vals[j]); grayscale_image_3c(gim); - train.y.vals[j][0] = .9; - train.y.vals[j][1] = .1; - gray.y.vals[j][0] = .1; - gray.y.vals[j][1] = .9; + train.y.vals[j][0] = 1; + gray.y.vals[j][0] = 0; } time=clock(); float gloss = 0; @@ -709,14 +722,16 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle get_next_batch(train, net.batch, j*net.batch, pixs, 0); get_next_batch(gray, net.batch, j*net.batch, graypixs, 0); cuda_push_array(gstate.input, graypixs, x_size); - cuda_push_array(gstate.truth, pixs, x_size); - /* - image origi = float_to_image(net.w, net.h, 3, pixs); - image grayi = float_to_image(net.w, net.h, 3, graypixs); - show_image(grayi, "gray"); - show_image(origi, "orig"); - cvWaitKey(0); + cuda_push_array(gstate.truth, pixs, y_size); */ + /* + image origi = float_to_image(net.w, net.h, 3, pixs); + image grayi = float_to_image(net.w, net.h, 3, graypixs); + show_image(grayi, "gray"); + show_image(origi, "orig"); + cvWaitKey(0); + */ + /* *net.seen += net.batch; forward_network_gpu(net, gstate); @@ -727,7 +742,7 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle forward_network_gpu(anet, astate); backward_network_gpu(anet, astate); - scal_ongpu(imlayer.outputs*imlayer.batch, 1./1000., net.layers[net.n-1].delta_gpu, 1); + scal_ongpu(imlayer.outputs*imlayer.batch, 1./100., net.layers[net.n-1].delta_gpu, 1); scal_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1); @@ -741,7 +756,7 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle gloss += get_network_cost(net) /(net.subdivisions*net.batch); - cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size); + cuda_pull_array(imlayer.output_gpu, imlayer.output, imlayer.outputs*imlayer.batch); for(k = 0; k < net.batch; ++k){ int index = j*net.batch + k; copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1); @@ -750,7 +765,7 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle harmless_update_network_gpu(anet); data merge = concat_data(train, gray); - randomize_data(merge); + //randomize_data(merge); float aloss = train_network(anet, merge); update_network_gpu(net); @@ -782,7 +797,9 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle save_weights(net, buff); #endif } +*/ +/* void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfile, int clear) { #ifdef GPU @@ -913,7 +930,7 @@ void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfi gloss += get_network_cost(net) /(net.subdivisions*net.batch); - cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size); + cuda_pull_array(imlayer.output_gpu, imlayer.output, imlayer.outputs*imlayer.batch); for(k = 0; k < net.batch; ++k){ int index = j*net.batch + k; copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1); @@ -956,7 
+973,9 @@ void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfi save_weights(net, buff); #endif } +*/ +/* void train_lsd(char *cfgfile, char *weightfile, int clear) { char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; @@ -1035,6 +1054,7 @@ void train_lsd(char *cfgfile, char *weightfile, int clear) sprintf(buff, "%s/%s_final.weights", backup_directory, base); save_weights(net, buff); } +*/ void test_lsd(char *cfgfile, char *weightfile, char *filename, int gray) { @@ -1103,18 +1123,20 @@ void run_lsd(int argc, char **argv) } int clear = find_arg(argc, argv, "-clear"); + int display = find_arg(argc, argv, "-display"); + char *file = find_char_arg(argc, argv, "-file", "/home/pjreddie/data/imagenet/imagenet1k.train.list"); char *cfg = argv[3]; char *weights = (argc > 4) ? argv[4] : 0; char *filename = (argc > 5) ? argv[5] : 0; char *acfg = argv[5]; char *aweights = (argc > 6) ? argv[6] : 0; - if(0==strcmp(argv[2], "train")) train_lsd(cfg, weights, clear); - else if(0==strcmp(argv[2], "train2")) train_lsd2(cfg, weights, acfg, aweights, clear); - else if(0==strcmp(argv[2], "traincolor")) train_colorizer(cfg, weights, acfg, aweights, clear); - else if(0==strcmp(argv[2], "traingan")) train_dcgan(cfg, weights, acfg, aweights, clear); + //if(0==strcmp(argv[2], "train")) train_lsd(cfg, weights, clear); + //else if(0==strcmp(argv[2], "train2")) train_lsd2(cfg, weights, acfg, aweights, clear); + //else if(0==strcmp(argv[2], "traincolor")) train_colorizer(cfg, weights, acfg, aweights, clear); + //else if(0==strcmp(argv[2], "train3")) train_lsd3(argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], clear); + if(0==strcmp(argv[2], "traingan")) train_dcgan(cfg, weights, acfg, aweights, clear, display, file); else if(0==strcmp(argv[2], "gan")) test_dcgan(cfg, weights); - else if(0==strcmp(argv[2], "train3")) train_lsd3(argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], clear); else if(0==strcmp(argv[2], "test")) test_lsd(cfg, weights, filename, 0); else if(0==strcmp(argv[2], "color")) test_lsd(cfg, weights, filename, 1); /* diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index 031d116c..7b3a836b 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -76,7 +76,7 @@ void resize_maxpool_layer(maxpool_layer *l, int w, int h) #endif } -void forward_maxpool_layer(const maxpool_layer l, network_state state) +void forward_maxpool_layer(const maxpool_layer l, network net) { int b,i,j,k,m,n; int w_offset = -l.pad; @@ -100,7 +100,7 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state) int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c)); int valid = (cur_h >= 0 && cur_h < l.h && cur_w >= 0 && cur_w < l.w); - float val = (valid != 0) ? state.input[index] : -FLT_MAX; + float val = (valid != 0) ? net.input[index] : -FLT_MAX; max_i = (val > max) ? index : max_i; max = (val > max) ? 
val : max; } } @@ -113,7 +113,7 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state) } } -void backward_maxpool_layer(const maxpool_layer l, network_state state) +void backward_maxpool_layer(const maxpool_layer l, network net) { int i; int h = l.out_h; @@ -121,7 +121,7 @@ void backward_maxpool_layer(const maxpool_layer l, network_state state) int c = l.c; for(i = 0; i < h*w*c*l.batch; ++i){ int index = l.indexes[i]; - state.delta[index] += l.delta[i]; + net.delta[index] += l.delta[i]; } } diff --git a/src/maxpool_layer.h b/src/maxpool_layer.h index ce56dd88..ceb51907 100644 --- a/src/maxpool_layer.h +++ b/src/maxpool_layer.h @@ -11,12 +11,12 @@ typedef layer maxpool_layer; image get_maxpool_image(maxpool_layer l); maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride, int padding); void resize_maxpool_layer(maxpool_layer *l, int w, int h); -void forward_maxpool_layer(const maxpool_layer l, network_state state); -void backward_maxpool_layer(const maxpool_layer l, network_state state); +void forward_maxpool_layer(const maxpool_layer l, network net); +void backward_maxpool_layer(const maxpool_layer l, network net); #ifdef GPU -void forward_maxpool_layer_gpu(maxpool_layer l, network_state state); -void backward_maxpool_layer_gpu(maxpool_layer l, network_state state); +void forward_maxpool_layer_gpu(maxpool_layer l, network net); +void backward_maxpool_layer_gpu(maxpool_layer l, network net); #endif #endif diff --git a/src/maxpool_layer_kernels.cu b/src/maxpool_layer_kernels.cu index 6381cc1e..3202e84b 100644 --- a/src/maxpool_layer_kernels.cu +++ b/src/maxpool_layer_kernels.cu @@ -84,7 +84,7 @@ __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_ prev_delta[index] += d; } -extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state) +extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network net) { int h = layer.out_h; int w = layer.out_w; @@ -92,15 +92,15 @@ extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state sta size_t n = h*w*c*layer.batch; - forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu); + forward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, net.input_gpu, layer.output_gpu, layer.indexes_gpu); check_error(cudaPeekAtLastError()); } -extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state state) +extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network net) { size_t n = layer.h*layer.w*layer.c*layer.batch; - backward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, layer.delta_gpu, state.delta, layer.indexes_gpu); + backward_maxpool_layer_kernel<<<cuda_gridsize(n), BLOCK>>>(n, layer.h, layer.w, layer.c, layer.stride, layer.size, layer.pad, layer.delta_gpu, net.delta_gpu, layer.indexes_gpu); check_error(cudaPeekAtLastError()); } diff --git a/src/network.c b/src/network.c index 0d30dd8c..abf1b8aa 100644 --- a/src/network.c +++ b/src/network.c @@ -164,26 +164,26 @@ network make_network(int n) net.n = n; net.layers = calloc(net.n, sizeof(layer)); net.seen = calloc(1, sizeof(int)); - #ifdef GPU - net.input_gpu = calloc(1, sizeof(float *)); - net.truth_gpu = calloc(1, sizeof(float *)); - #endif + net.cost = calloc(1, sizeof(float)); return net; } -void forward_network(network net, network_state state) +void forward_network(network net) { - state.workspace =
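/* One behavior that survives the state->net rename in the maxpool code: the
   forward pass stores the argmax position of each pooling window in
   l.indexes, so the backward pass is a pure scatter; only the winning input
   of every window receives gradient:

       int index = l.indexes[i];        // argmax recorded during forward
       net.delta[index] += l.delta[i];  // route the whole delta to it
*/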
net.workspace; int i; for(i = 0; i < net.n; ++i){ - state.index = i; + net.index = i; layer l = net.layers[i]; if(l.delta){ fill_cpu(l.outputs * l.batch, 0, l.delta, 1); } - l.forward(l, state); - state.input = l.output; + l.forward(l, net); + net.input = l.output; + if(l.truth) { + net.truth = l.output; + } } + calc_network_cost(net); } void update_network(network net) @@ -199,17 +199,7 @@ void update_network(network net) } } -float *get_network_output(network net) -{ -#ifdef GPU - if (gpu_index >= 0) return get_network_output_gpu(net); -#endif - int i; - for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break; - return net.layers[i].output; -} - -float get_network_cost(network net) +void calc_network_cost(network net) { int i; float sum = 0; @@ -220,54 +210,43 @@ float get_network_cost(network net) ++count; } } - return sum/count; + *net.cost = sum/count; } int get_predicted_class_network(network net) { - float *out = get_network_output(net); - int k = get_network_output_size(net); - return max_index(out, k); + return max_index(net.output, net.outputs); } -void backward_network(network net, network_state state) +void backward_network(network net) { int i; - float *original_input = state.input; - float *original_delta = state.delta; - state.workspace = net.workspace; + network orig = net; for(i = net.n-1; i >= 0; --i){ - state.index = i; - if(i == 0){ - state.input = original_input; - state.delta = original_delta; - }else{ - layer prev = net.layers[i-1]; - state.input = prev.output; - state.delta = prev.delta; - } layer l = net.layers[i]; if(l.stopbackward) break; - l.backward(l, state); + if(i == 0){ + net = orig; + }else{ + layer prev = net.layers[i-1]; + net.input = prev.output; + net.delta = prev.delta; + } + net.index = i; + l.backward(l, net); } } -float train_network_datum(network net, float *x, float *y) +float train_network_datum(network net) { #ifdef GPU - if(gpu_index >= 0) return train_network_datum_gpu(net, x, y); + if(gpu_index >= 0) return train_network_datum_gpu(net); #endif - network_state state; *net.seen += net.batch; - state.index = 0; - state.net = net; - state.input = x; - state.delta = 0; - state.truth = y; - state.train = 1; - forward_network(net, state); - backward_network(net, state); - float error = get_network_cost(net); + net.train = 1; + forward_network(net); + backward_network(net); + float error = *net.cost; if(((*net.seen)/net.batch)%net.subdivisions == 0) update_network(net); return error; } @@ -275,18 +254,14 @@ float train_network_datum(network net, float *x, float *y) float train_network_sgd(network net, data d, int n) { int batch = net.batch; - float *X = calloc(batch*d.X.cols, sizeof(float)); - float *y = calloc(batch*d.y.cols, sizeof(float)); int i; float sum = 0; for(i = 0; i < n; ++i){ - get_random_batch(d, batch, X, y); - float err = train_network_datum(net, X, y); + get_random_batch(d, batch, net.input, net.truth); + float err = train_network_datum(net); sum += err; } - free(X); - free(y); return (float)sum/(n*batch); } @@ -295,43 +270,14 @@ float train_network(network net, data d) assert(d.X.rows % net.batch == 0); int batch = net.batch; int n = d.X.rows / batch; - float *X = calloc(batch*d.X.cols, sizeof(float)); - float *y = calloc(batch*d.y.cols, sizeof(float)); int i; float sum = 0; for(i = 0; i < n; ++i){ - get_next_batch(d, batch, i*batch, X, y); - float err = train_network_datum(net, X, y); + get_next_batch(d, batch, i*batch, net.input, net.truth); + float err = train_network_datum(net); sum += err; } - free(X); - free(y); - 
return (float)sum/(n*batch); -} - - -float train_network_batch(network net, data d, int n) -{ - int i,j; - network_state state; - state.index = 0; - state.net = net; - state.train = 1; - state.delta = 0; - float sum = 0; - int batch = 2; - for(i = 0; i < n; ++i){ - for(j = 0; j < batch; ++j){ - int index = rand()%d.X.rows; - state.input = d.X.vals[index]; - state.truth = d.y.vals[index]; - forward_network(net, state); - backward_network(net, state); - sum += get_network_cost(net); - } - update_network(net); - } return (float)sum/(n*batch); } @@ -353,9 +299,7 @@ int resize_network(network *net, int w, int h) { #ifdef GPU cuda_set_device(net->gpu_index); - if(gpu_index >= 0){ - cuda_free(net->workspace); - } + cuda_free(net->workspace); #endif int i; //if(w == net->w && h == net->h) return 0; @@ -395,14 +339,22 @@ int resize_network(network *net, int w, int h) h = l.out_h; if(l.type == AVGPOOL) break; } + layer out = get_network_output_layer(*net); + net->inputs = net->layers[0].inputs; + net->outputs = out.outputs; + net->truths = out.outputs; + if(net->layers[net->n-1].truths) net->truths = net->layers[net->n-1].truths; + net->output = out.output; + free(net->input); + free(net->truth); + net->input = calloc(net->inputs*net->batch, sizeof(float)); + net->truth = calloc(net->truths*net->batch, sizeof(float)); #ifdef GPU if(gpu_index >= 0){ - if(net->input_gpu) { - cuda_free(*net->input_gpu); - *net->input_gpu = 0; - cuda_free(*net->truth_gpu); - *net->truth_gpu = 0; - } + cuda_free(net->input_gpu); + cuda_free(net->truth_gpu); + net->input_gpu = cuda_make_array(net->input, net->inputs*net->batch); + net->truth_gpu = cuda_make_array(net->truth, net->truths*net->batch); net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1); }else { free(net->workspace); @@ -416,18 +368,6 @@ int resize_network(network *net, int w, int h) return 0; } -int get_network_output_size(network net) -{ - int i; - for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break; - return net.layers[i].outputs; -} - -int get_network_input_size(network net) -{ - return net.layers[0].inputs; -} - detection_layer get_network_detection_layer(network net) { int i; @@ -444,9 +384,9 @@ detection_layer get_network_detection_layer(network net) image get_network_image_layer(network net, int i) { layer l = net.layers[i]; - #ifdef GPU - cuda_pull_array(l.output_gpu, l.output, l.outputs); - #endif +#ifdef GPU + cuda_pull_array(l.output_gpu, l.output, l.outputs); +#endif if (l.out_w && l.out_h && l.out_c){ return float_to_image(l.out_w, l.out_h, l.out_c, l.output); } @@ -481,9 +421,7 @@ void visualize_network(network net) void top_predictions(network net, int k, int *index) { - int size = get_network_output_size(net); - float *out = get_network_output(net); - top_k(out, size, k, index); + top_k(net.output, net.outputs, k, index); } @@ -492,23 +430,18 @@ float *network_predict(network net, float *input) #ifdef GPU if(gpu_index >= 0) return network_predict_gpu(net, input); #endif - - network_state state; - state.net = net; - state.index = 0; - state.input = input; - state.truth = 0; - state.train = 0; - state.delta = 0; - forward_network(net, state); - float *out = get_network_output(net); - return out; + net.input = input; + net.truth = 0; + net.train = 0; + net.delta = 0; + forward_network(net); + return net.output; } matrix network_predict_data_multi(network net, data test, int n) { int i,j,b,m; - int k = get_network_output_size(net); + int k = net.outputs; matrix pred = make_matrix(test.X.rows, k); float *X = 
calloc(net.batch*test.X.rows, sizeof(float)); for(i = 0; i < test.X.rows; i += net.batch){ @@ -533,7 +466,7 @@ matrix network_predict_data_multi(network net, data test, int n) matrix network_predict_data(network net, data test) { int i,j,b; - int k = get_network_output_size(net); + int k = net.outputs; matrix pred = make_matrix(test.X.rows, k); float *X = calloc(net.batch*test.X.cols, sizeof(float)); for(i = 0; i < test.X.rows; i += net.batch){ @@ -613,6 +546,15 @@ float *network_accuracies(network net, data d, int n) return acc; } +layer get_network_output_layer(network net) +{ + int i; + for(i = net.n - 1; i >= 0; --i){ + if(net.layers[i].type != COST) break; + } + return net.layers[i]; +} + float network_accuracy_multi(network net, data d, int n) { matrix guess = network_predict_data_multi(net, d, n); @@ -628,10 +570,38 @@ void free_network(network net) free_layer(net.layers[i]); } free(net.layers); + if(net.input) free(net.input); + if(net.truth) free(net.truth); #ifdef GPU - if(*net.input_gpu) cuda_free(*net.input_gpu); - if(*net.truth_gpu) cuda_free(*net.truth_gpu); - if(net.input_gpu) free(net.input_gpu); - if(net.truth_gpu) free(net.truth_gpu); + if(net.input_gpu) cuda_free(net.input_gpu); + if(net.truth_gpu) cuda_free(net.truth_gpu); #endif } + +// Some day... + + +layer network_output_layer(network net) +{ + int i; + for(i = net.n - 1; i >= 0; --i){ + if(net.layers[i].type != COST) break; + } + return net.layers[i]; +} + +int network_inputs(network net) +{ + return net.layers[0].inputs; +} + +int network_outputs(network net) +{ + return network_output_layer(net).outputs; +} + +float *network_output(network net) +{ + return network_output_layer(net).output; +} + diff --git a/src/network.h b/src/network.h index 20c75b61..d27119ed 100644 --- a/src/network.h +++ b/src/network.h @@ -12,7 +12,6 @@ typedef enum { } learning_rate_policy; typedef struct network{ - float *workspace; int n; int batch; int *seen; @@ -21,7 +20,6 @@ typedef struct network{ float momentum; float decay; layer *layers; - int outputs; float *output; learning_rate_policy policy; @@ -43,6 +41,8 @@ typedef struct network{ float eps; int inputs; + int outputs; + int truths; int notruth; int h, w, c; int max_crop; @@ -56,32 +56,34 @@ typedef struct network{ int gpu_index; tree *hierarchy; - #ifdef GPU - float **input_gpu; - float **truth_gpu; - #endif -} network; -typedef struct network_state { - float *truth; + float *input; + float *truth; float *delta; float *workspace; int train; int index; - network net; -} network_state; + float *cost; + + #ifdef GPU + float *input_gpu; + float *truth_gpu; + float *delta_gpu; + float *output_gpu; + #endif + +} network; + #ifdef GPU float train_networks(network *nets, int n, data d, int interval); void sync_nets(network *nets, int n, int interval); -float train_network_datum_gpu(network net, float *x, float *y); +float train_network_datum_gpu(network net); float *network_predict_gpu(network net, float *input); -float * get_network_output_gpu_layer(network net, int i); -float * get_network_delta_gpu_layer(network net, int i); -float *get_network_output_gpu(network net); -void forward_network_gpu(network net, network_state state); -void backward_network_gpu(network net, network_state state); +void pull_network_output(network net); +void forward_network_gpu(network net); +void backward_network_gpu(network net); void update_network_gpu(network net); void harmless_update_network_gpu(network net); #endif @@ -93,14 +95,13 @@ void compare_networks(network n1, network n2, data d); char 
*get_layer_string(LAYER_TYPE a); network make_network(int n); -void forward_network(network net, network_state state); -void backward_network(network net, network_state state); +void forward_network(network net); +void backward_network(network net); void update_network(network net); float train_network(network net, data d); -float train_network_batch(network net, data d, int n); float train_network_sgd(network net, data d, int n); -float train_network_datum(network net, float *x, float *y); +float train_network_datum(network net); matrix network_predict_data(network net, data test); float *network_predict(network net, float *input); @@ -108,26 +109,17 @@ float network_accuracy(network net, data d); float *network_accuracies(network net, data d, int n); float network_accuracy_multi(network net, data d, int n); void top_predictions(network net, int n, int *index); -float *get_network_output(network net); -float *get_network_output_layer(network net, int i); -float *get_network_delta_layer(network net, int i); -float *get_network_delta(network net); -int get_network_output_size_layer(network net, int i); -int get_network_output_size(network net); image get_network_image(network net); image get_network_image_layer(network net, int i); +layer get_network_output_layer(network net); int get_predicted_class_network(network net); void print_network(network net); void visualize_network(network net); int resize_network(network *net, int w, int h); void set_batch_network(network *net, int b); -int get_network_input_size(network net); -float get_network_cost(network net); network load_network(char *cfg, char *weights, int clear); load_args get_base_args(network net); - -int get_network_nuisance(network net); -int get_network_background(network net); +void calc_network_cost(network net); #endif diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 4b9b499e..2fbc2dd4 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -35,46 +35,45 @@ extern "C" { #include "blas.h" } -float * get_network_output_gpu_layer(network net, int i); -float * get_network_delta_gpu_layer(network net, int i); -float * get_network_output_gpu(network net); - -void forward_network_gpu(network net, network_state state) +void forward_network_gpu(network net) { - state.workspace = net.workspace; int i; for(i = 0; i < net.n; ++i){ - state.index = i; + net.index = i; layer l = net.layers[i]; if(l.delta_gpu){ fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1); } - //if(l.c ==3 && i > 5) state.input = *net.input_gpu; - l.forward_gpu(l, state); - state.input = l.output_gpu; - if(l.truth) state.truth = l.output_gpu; + l.forward_gpu(l, net); + net.input_gpu = l.output_gpu; + net.input = l.output; + if(l.truth) { + net.truth_gpu = l.output_gpu; + net.truth = l.output; + } } + pull_network_output(net); + calc_network_cost(net); } -void backward_network_gpu(network net, network_state state) +void backward_network_gpu(network net) { - state.workspace = net.workspace; int i; - float * original_input = state.input; - float * original_delta = state.delta; + network orig = net; for(i = net.n-1; i >= 0; --i){ - state.index = i; layer l = net.layers[i]; if(l.stopbackward) break; if(i == 0){ - state.input = original_input; - state.delta = original_delta; + net = orig; }else{ layer prev = net.layers[i-1]; - state.input = prev.output_gpu; - state.delta = prev.delta_gpu; + net.input = prev.output; + net.delta = prev.delta; + net.input_gpu = prev.output_gpu; + net.delta_gpu = prev.delta_gpu; } - l.backward_gpu(l, state); + 
net.index = i; + l.backward_gpu(l, net); } } @@ -95,39 +94,30 @@ void update_network_gpu(network net) void harmless_update_network_gpu(network net) { - net.learning_rate = 0; - net.momentum = 1; - update_network_gpu(net); -} - -void forward_backward_network_gpu(network net, float *x, float *y) -{ - network_state state; - state.index = 0; - state.net = net; - int x_size = get_network_input_size(net)*net.batch; - int y_size = get_network_output_size(net)*net.batch; - if(net.layers[net.n-1].truths) y_size = net.layers[net.n-1].truths*net.batch; - if(!*net.input_gpu){ - *net.input_gpu = cuda_make_array(x, x_size); - if(!net.notruth) *net.truth_gpu = cuda_make_array(y, y_size); - }else{ - cuda_push_array(*net.input_gpu, x, x_size); - if(!net.notruth) cuda_push_array(*net.truth_gpu, y, y_size); + cuda_set_device(net.gpu_index); + int i; + for(i = 0; i < net.n; ++i){ + layer l = net.layers[i]; + if(l.weight_updates_gpu) fill_ongpu(l.nweights, 0, l.weight_updates_gpu, 1); + if(l.bias_updates_gpu) fill_ongpu(l.nbiases, 0, l.bias_updates_gpu, 1); + if(l.scale_updates_gpu) fill_ongpu(l.nbiases, 0, l.scale_updates_gpu, 1); } - state.input = *net.input_gpu; - state.delta = 0; - state.truth = *net.truth_gpu; - state.train = 1; - forward_network_gpu(net, state); - backward_network_gpu(net, state); } -float train_network_datum_gpu(network net, float *x, float *y) +float train_network_datum_gpu(network net) { *net.seen += net.batch; - forward_backward_network_gpu(net, x, y); - float error = get_network_cost(net); + + int x_size = net.inputs*net.batch; + int y_size = net.truths*net.batch; + cuda_push_array(net.input_gpu, net.input, x_size); + cuda_push_array(net.truth_gpu, net.truth, y_size); + + net.train = 1; + forward_network_gpu(net); + backward_network_gpu(net); + + float error = *net.cost; if (((*net.seen) / net.batch) % net.subdivisions == 0) update_network_gpu(net); return error; @@ -384,34 +374,19 @@ float train_networks(network *nets, int n, data d, int interval) return (float)sum/(n); } -float *get_network_output_layer_gpu(network net, int i) +void pull_network_output(network net) { - layer l = net.layers[i]; - if(l.type != REGION) cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch); - return l.output; -} - -float *get_network_output_gpu(network net) -{ - int i; - for(i = net.n-1; i > 0; --i) if(net.layers[i].type != COST) break; - return get_network_output_layer_gpu(net, i); + layer l = get_network_output_layer(net); + cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch); } float *network_predict_gpu(network net, float *input) { cuda_set_device(net.gpu_index); - int size = get_network_input_size(net) * net.batch; - network_state state; - state.index = 0; - state.net = net; - state.input = cuda_make_array(input, size); - state.truth = 0; - state.train = 0; - state.delta = 0; - forward_network_gpu(net, state); - float *out = get_network_output_gpu(net); - cuda_free(state.input); - return out; + cuda_push_array(net.input_gpu, input, net.inputs*net.batch); + net.truth = 0; + net.train = 0; + forward_network_gpu(net); + return net.output; } diff --git a/src/nightmare.c b/src/nightmare.c index 5d95416c..4bcf1877 100644 --- a/src/nightmare.c +++ b/src/nightmare.c @@ -46,32 +46,29 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa image delta = make_image(im.w, im.h, im.c); - network_state state = {0}; - state.net = *net; - #ifdef GPU - state.input = cuda_make_array(im.data, im.w*im.h*im.c); - state.delta = cuda_make_array(im.data, im.w*im.h*im.c); + 
net->delta_gpu = cuda_make_array(delta.data, im.w*im.h*im.c); + cuda_push_array(net->input_gpu, im.data, net->inputs); - forward_network_gpu(*net, state); + forward_network_gpu(*net); copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1); cuda_pull_array(last.delta_gpu, last.delta, last.outputs); calculate_loss(last.delta, last.delta, last.outputs, thresh); cuda_push_array(last.delta_gpu, last.delta, last.outputs); - backward_network_gpu(*net, state); + backward_network_gpu(*net); - cuda_pull_array(state.delta, delta.data, im.w*im.h*im.c); - cuda_free(state.input); - cuda_free(state.delta); + cuda_pull_array(net->delta_gpu, delta.data, im.w*im.h*im.c); + cuda_free(net->delta_gpu); + net->delta_gpu = 0; #else - state.input = im.data; - state.delta = delta.data; - forward_network(*net, state); + net->input = im.data; + net->delta = delta.data; + forward_network(*net); copy_cpu(last.outputs, last.output, 1, last.delta, 1); calculate_loss(last.output, last.delta, last.outputs, thresh); - backward_network(*net, state); + backward_network(*net); #endif if(flip) flip_image(delta); @@ -134,31 +131,30 @@ void smooth(image recon, image update, float lambda, int num) void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters) { int iter = 0; + layer l = get_network_output_layer(net); for (iter = 0; iter < iters; ++iter) { image delta = make_image(recon.w, recon.h, recon.c); - network_state state = {0}; - state.net = net; #ifdef GPU - state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c); - state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c); - state.truth = cuda_make_array(features, get_network_output_size(net)); + cuda_push_array(net.input_gpu, recon.data, recon.w*recon.h*recon.c); + cuda_push_array(net.truth_gpu, features, net.truths); + net.delta_gpu = cuda_make_array(delta.data, delta.w*delta.h*delta.c); - forward_network_gpu(net, state); - backward_network_gpu(net, state); + forward_network_gpu(net); + copy_ongpu(l.outputs, net.truth_gpu, 1, l.delta_gpu, 1); + axpy_ongpu(l.outputs, -1, l.output_gpu, 1, l.delta_gpu, 1); + backward_network_gpu(net); - cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c); + cuda_pull_array(net.delta_gpu, delta.data, delta.w*delta.h*delta.c); - cuda_free(state.input); - cuda_free(state.delta); - cuda_free(state.truth); + cuda_free(net.delta_gpu); #else - state.input = recon.data; - state.delta = delta.data; - state.truth = features; + net.input = recon.data; + net.delta = delta.data; + net.truth = features; - forward_network(net, state); - backward_network(net, state); + forward_network(net); + backward_network(net); #endif axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1); @@ -328,11 +324,12 @@ void run_nightmare(int argc, char **argv) free_image(im); im = resized; } - im = letterbox_image(im, net.w, net.h); + //im = letterbox_image(im, net.w, net.h); float *features = 0; image update; if (reconstruct){ + net.n = max_layer; resize_network(&net, im.w, im.h); int zz = 0; diff --git a/src/normalization_layer.c b/src/normalization_layer.c index 069a0792..2be0d069 100644 --- a/src/normalization_layer.c +++ b/src/normalization_layer.c @@ -62,7 +62,7 @@ void resize_normalization_layer(layer *layer, int w, int h) #endif } -void forward_normalization_layer(const layer layer, network_state state) +void forward_normalization_layer(const layer layer, network net) { int k,b; int w = layer.w; @@ -73,7 +73,7 @@ void 
forward_normalization_layer(const layer layer, network_state state) for(b = 0; b < layer.batch; ++b){ float *squared = layer.squared + w*h*c*b; float *norms = layer.norms + w*h*c*b; - float *input = state.input + w*h*c*b; + float *input = net.input + w*h*c*b; pow_cpu(w*h*c, 2, input, 1, squared, 1); const_cpu(w*h, layer.kappa, norms, 1); @@ -90,10 +90,10 @@ void forward_normalization_layer(const layer layer, network_state state) } } pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, layer.output, 1); - mul_cpu(w*h*c*layer.batch, state.input, 1, layer.output, 1); + mul_cpu(w*h*c*layer.batch, net.input, 1, layer.output, 1); } -void backward_normalization_layer(const layer layer, network_state state) +void backward_normalization_layer(const layer layer, network net) { // TODO This is approximate ;-) // Also this should add in to delta instead of overwritting. @@ -101,12 +101,12 @@ void backward_normalization_layer(const layer layer, network_state state) int w = layer.w; int h = layer.h; int c = layer.c; - pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, state.delta, 1); - mul_cpu(w*h*c*layer.batch, layer.delta, 1, state.delta, 1); + pow_cpu(w*h*c*layer.batch, -layer.beta, layer.norms, 1, net.delta, 1); + mul_cpu(w*h*c*layer.batch, layer.delta, 1, net.delta, 1); } #ifdef GPU -void forward_normalization_layer_gpu(const layer layer, network_state state) +void forward_normalization_layer_gpu(const layer layer, network net) { int k,b; int w = layer.w; @@ -117,7 +117,7 @@ void forward_normalization_layer_gpu(const layer layer, network_state state) for(b = 0; b < layer.batch; ++b){ float *squared = layer.squared_gpu + w*h*c*b; float *norms = layer.norms_gpu + w*h*c*b; - float *input = state.input + w*h*c*b; + float *input = net.input_gpu + w*h*c*b; pow_ongpu(w*h*c, 2, input, 1, squared, 1); const_ongpu(w*h, layer.kappa, norms, 1); @@ -134,17 +134,17 @@ void forward_normalization_layer_gpu(const layer layer, network_state state) } } pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1); - mul_ongpu(w*h*c*layer.batch, state.input, 1, layer.output_gpu, 1); + mul_ongpu(w*h*c*layer.batch, net.input_gpu, 1, layer.output_gpu, 1); } -void backward_normalization_layer_gpu(const layer layer, network_state state) +void backward_normalization_layer_gpu(const layer layer, network net) { // TODO This is approximate ;-) int w = layer.w; int h = layer.h; int c = layer.c; - pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, state.delta, 1); - mul_ongpu(w*h*c*layer.batch, layer.delta_gpu, 1, state.delta, 1); + pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, net.delta_gpu, 1); + mul_ongpu(w*h*c*layer.batch, layer.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/normalization_layer.h b/src/normalization_layer.h index ab327764..665baa50 100644 --- a/src/normalization_layer.h +++ b/src/normalization_layer.h @@ -7,13 +7,13 @@ layer make_normalization_layer(int batch, int w, int h, int c, int size, float alpha, float beta, float kappa); void resize_normalization_layer(layer *layer, int h, int w); -void forward_normalization_layer(const layer layer, network_state state); -void backward_normalization_layer(const layer layer, network_state state); +void forward_normalization_layer(const layer layer, network net); +void backward_normalization_layer(const layer layer, network net); void visualize_normalization_layer(layer layer, char *window); #ifdef GPU -void forward_normalization_layer_gpu(const layer layer, network_state state); -void 
backward_normalization_layer_gpu(const layer layer, network_state state); +void forward_normalization_layer_gpu(const layer layer, network net); +void backward_normalization_layer_gpu(const layer layer, network net); #endif #endif diff --git a/src/parser.c b/src/parser.c index c89d98de..2c03c7f5 100644 --- a/src/parser.c +++ b/src/parser.c @@ -154,8 +154,11 @@ layer parse_deconvolutional(list *options, size_params params) batch=params.batch; if(!(h && w && c)) error("Layer before deconvolutional layer must output image."); int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); + int pad = option_find_int_quiet(options, "pad",0); + int padding = option_find_int_quiet(options, "padding",0); + if(pad) padding = size/2; - layer l = make_deconvolutional_layer(batch,h,w,c,n,size,stride,activation, batch_normalize); + layer l = make_deconvolutional_layer(batch,h,w,c,n,size,stride,padding, activation, batch_normalize, params.net.adam); return l; } @@ -546,7 +549,7 @@ void parse_net_options(list *options, network *net) if(net->adam){ net->B1 = option_find_float(options, "B1", .9); net->B2 = option_find_float(options, "B2", .999); - net->eps = option_find_float(options, "eps", .000001); + net->eps = option_find_float(options, "eps", .00000001); } net->h = option_find_int_quiet(options, "height",0); @@ -718,8 +721,18 @@ network parse_network_cfg(char *filename) } } free_list(sections); - net.outputs = get_network_output_size(net); - net.output = get_network_output(net); + layer out = get_network_output_layer(net); + net.outputs = out.outputs; + net.truths = out.outputs; + if(net.layers[net.n-1].truths) net.truths = net.layers[net.n-1].truths; + net.output = out.output; + net.input = calloc(net.inputs*net.batch, sizeof(float)); + net.truth = calloc(net.truths*net.batch, sizeof(float)); +#ifdef GPU + net.output_gpu = out.output_gpu; + net.input_gpu = cuda_make_array(net.input, net.inputs*net.batch); + net.truth_gpu = cuda_make_array(net.truth, net.truths*net.batch); +#endif if(workspace_size){ //printf("%ld\n", workspace_size); #ifdef GPU diff --git a/src/region_layer.c b/src/region_layer.c index 7b57e609..99c54a95 100644 --- a/src/region_layer.c +++ b/src/region_layer.c @@ -142,10 +142,10 @@ int entry_index(layer l, int batch, int location, int entry) } void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output); -void forward_region_layer(const layer l, network_state state) +void forward_region_layer(const layer l, network net) { int i,j,b,t,n; - memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float)); + memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float)); #ifndef GPU for (b = 0; b < l.batch; ++b){ @@ -161,17 +161,17 @@ void forward_region_layer(const layer l, network_state state) int count = 5; for (i = 0; i < l.softmax_tree->groups; ++i) { int group_size = l.softmax_tree->group_size[i]; - softmax_cpu(state.input + count, group_size, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + count); + softmax_cpu(net.input + count, group_size, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + count); count += group_size; } } else if (l.softmax){ int index = entry_index(l, 0, 0, 5); - softmax_cpu(state.input + index, l.classes, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output + index); + softmax_cpu(net.input + index, l.classes, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output + index); } #endif memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); - 
if(!state.train) return; + if(!net.train) return; float avg_iou = 0; float recall = 0; float avg_cat = 0; @@ -184,9 +184,9 @@ void forward_region_layer(const layer l, network_state state) if(l.softmax_tree){ int onlyclass = 0; for(t = 0; t < 30; ++t){ - box truth = float_to_box(state.truth + t*5 + b*l.truths, 1); + box truth = float_to_box(net.truth + t*5 + b*l.truths, 1); if(!truth.x) break; - int class = state.truth[t*5 + b*l.truths + 4]; + int class = net.truth[t*5 + b*l.truths + 4]; float maxp = 0; int maxi = 0; if(truth.x > 100000 && truth.y > 100000){ @@ -220,7 +220,7 @@ void forward_region_layer(const layer l, network_state state) box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h); float best_iou = 0; for(t = 0; t < 30; ++t){ - box truth = float_to_box(state.truth + t*5 + b*l.truths, 1); + box truth = float_to_box(net.truth + t*5 + b*l.truths, 1); if(!truth.x) break; float iou = box_iou(pred, truth); if (iou > best_iou) { @@ -234,7 +234,7 @@ void forward_region_layer(const layer l, network_state state) l.delta[obj_index] = 0; } - if(*(state.net.seen) < 12800){ + if(*(net.seen) < 12800){ box truth = {0}; truth.x = (i + .5)/l.w; truth.y = (j + .5)/l.h; @@ -246,7 +246,7 @@ void forward_region_layer(const layer l, network_state state) } } for(t = 0; t < 30; ++t){ - box truth = float_to_box(state.truth + t*5 + b*l.truths, 1); + box truth = float_to_box(net.truth + t*5 + b*l.truths, 1); if(!truth.x) break; float best_iou = 0; @@ -289,7 +289,7 @@ void forward_region_layer(const layer l, network_state state) l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]); } - int class = state.truth[t*5 + b*l.truths + 4]; + int class = net.truth[t*5 + b*l.truths + 4]; if (l.map) class = l.map[class]; int class_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 5); delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat); @@ -302,7 +302,7 @@ void forward_region_layer(const layer l, network_state state) printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f, count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count); } -void backward_region_layer(const layer l, network_state state) +void backward_region_layer(const layer l, network net) { /* int b; @@ -311,11 +311,11 @@ void backward_region_layer(const layer l, network_state state) int index = (b*size + 4)*l.w*l.h; gradient_array(l.output + index, l.w*l.h, LOGISTIC, l.delta + index); } - axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1); + axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1); */ } -void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh) +void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh, int nomult) { int i,j,n,z; float *predictions = l.output; @@ -358,10 +358,12 @@ void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *b boxes[index].w *= (float)max/w; boxes[index].h *= (float)max/h; } - boxes[index].x *= w; - boxes[index].y *= h; - boxes[index].w *= w; - boxes[index].h *= h; + if(!nomult){ + boxes[index].x *= w; + boxes[index].y *= h; + boxes[index].w *= w; + boxes[index].h *= h; + } int class_index = entry_index(l, 0, n*l.w*l.h + i, 5); if(l.softmax_tree){ @@ -383,6 +385,9 @@ void get_region_boxes(layer l, int w, int h, float thresh, float **probs, 
box *b int class_index = entry_index(l, 0, n*l.w*l.h + i, 5 + j); float prob = scale*predictions[class_index]; probs[index][j] = (prob > thresh) ? prob : 0; + // TODO REMOVE + // if (j != 15 && j != 16) probs[index][j] = 0; + // if (j != 0) probs[index][j] = 0; } } if(only_objectness){ @@ -394,9 +399,9 @@ void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *b #ifdef GPU -void forward_region_layer_gpu(const layer l, network_state state) +void forward_region_layer_gpu(const layer l, network net) { - copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1); + copy_ongpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1); int b, n; for (b = 0; b < l.batch; ++b){ for(n = 0; n < l.n; ++n){ @@ -412,40 +417,33 @@ void forward_region_layer_gpu(const layer l, network_state state) for (i = 0; i < l.softmax_tree->groups; ++i) { int group_size = l.softmax_tree->group_size[i]; int index = entry_index(l, 0, 0, count); - softmax_gpu(state.input + index, group_size, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu + index); + softmax_gpu(net.input_gpu + index, group_size, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu + index); count += group_size; } } else if (l.softmax) { int index = entry_index(l, 0, 0, 5); //printf("%d\n", index); - softmax_gpu(state.input + index, l.classes, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu + index); + softmax_gpu(net.input_gpu + index, l.classes, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu + index); } - if(!state.train || l.onlyforward){ + if(!net.train || l.onlyforward){ cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs); return; } - float *in_cpu = calloc(l.batch*l.inputs, sizeof(float)); float *truth_cpu = 0; - if(state.truth){ + if(net.truth_gpu){ int num_truth = l.batch*l.truths; truth_cpu = calloc(num_truth, sizeof(float)); - cuda_pull_array(state.truth, truth_cpu, num_truth); + cuda_pull_array(net.truth_gpu, truth_cpu, num_truth); } - cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs); - network_state cpu_state = state; - cpu_state.train = state.train; - cpu_state.truth = truth_cpu; - cpu_state.input = in_cpu; - forward_region_layer(l, cpu_state); + cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs); + forward_region_layer(l, net); //cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs); - free(cpu_state.input); - if(!state.train) return; + if(!net.train) return; cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs); - if(cpu_state.truth) free(cpu_state.truth); } -void backward_region_layer_gpu(const layer l, network_state state) +void backward_region_layer_gpu(const layer l, network net) { int b, n; for (b = 0; b < l.batch; ++b){ @@ -456,7 +454,7 @@ void backward_region_layer_gpu(const layer l, network_state state) gradient_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index); } } - axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1); + axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/region_layer.h b/src/region_layer.h index 9a3b7cd3..6375445f 100644 --- a/src/region_layer.h +++ b/src/region_layer.h @@ -5,14 +5,14 @@ #include "network.h" layer make_region_layer(int batch, int h, int w, int n, int classes, int coords); -void forward_region_layer(const layer l, network_state state); -void backward_region_layer(const layer l, network_state state); -void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int 
*map, float tree_thresh); +void forward_region_layer(const layer l, network net); +void backward_region_layer(const layer l, network net); +void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh, int nomult); void resize_region_layer(layer *l, int w, int h); #ifdef GPU -void forward_region_layer_gpu(const layer l, network_state state); -void backward_region_layer_gpu(layer l, network_state state); +void forward_region_layer_gpu(const layer l, network net); +void backward_region_layer_gpu(layer l, network net); #endif #endif diff --git a/src/reorg_layer.c b/src/reorg_layer.c index 29ccc0e5..405a4266 100644 --- a/src/reorg_layer.c +++ b/src/reorg_layer.c @@ -87,11 +87,11 @@ void resize_reorg_layer(layer *l, int w, int h) #endif } -void forward_reorg_layer(const layer l, network_state state) +void forward_reorg_layer(const layer l, network net) { int i; if(l.flatten){ - memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float)); + memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float)); if(l.reverse){ flatten(l.output, l.w*l.h, l.c, l.batch, 0); }else{ @@ -99,74 +99,74 @@ void forward_reorg_layer(const layer l, network_state state) } } else if (l.extra) { for(i = 0; i < l.batch; ++i){ - copy_cpu(l.inputs, state.input + i*l.inputs, 1, l.output + i*l.outputs, 1); + copy_cpu(l.inputs, net.input + i*l.inputs, 1, l.output + i*l.outputs, 1); } } else if (l.reverse){ - reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output); + reorg_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output); } else { - reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output); + reorg_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output); } } -void backward_reorg_layer(const layer l, network_state state) +void backward_reorg_layer(const layer l, network net) { int i; if(l.flatten){ - memcpy(state.delta, l.delta, l.outputs*l.batch*sizeof(float)); + memcpy(net.delta, l.delta, l.outputs*l.batch*sizeof(float)); if(l.reverse){ - flatten(state.delta, l.w*l.h, l.c, l.batch, 1); + flatten(net.delta, l.w*l.h, l.c, l.batch, 1); }else{ - flatten(state.delta, l.w*l.h, l.c, l.batch, 0); + flatten(net.delta, l.w*l.h, l.c, l.batch, 0); } } else if(l.reverse){ - reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 0, state.delta); + reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta); } else if (l.extra) { for(i = 0; i < l.batch; ++i){ - copy_cpu(l.inputs, l.delta + i*l.outputs, 1, state.delta + i*l.inputs, 1); + copy_cpu(l.inputs, l.delta + i*l.outputs, 1, net.delta + i*l.inputs, 1); } }else{ - reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 1, state.delta); + reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta); } } #ifdef GPU -void forward_reorg_layer_gpu(layer l, network_state state) +void forward_reorg_layer_gpu(layer l, network net) { int i; if(l.flatten){ if(l.reverse){ - flatten_ongpu(state.input, l.w*l.h, l.c, l.batch, 0, l.output_gpu); + flatten_ongpu(net.input_gpu, l.w*l.h, l.c, l.batch, 0, l.output_gpu); }else{ - flatten_ongpu(state.input, l.w*l.h, l.c, l.batch, 1, l.output_gpu); + flatten_ongpu(net.input_gpu, l.w*l.h, l.c, l.batch, 1, l.output_gpu); } } else if (l.extra) { for(i = 0; i < l.batch; ++i){ - copy_ongpu(l.inputs, state.input + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1); + copy_ongpu(l.inputs, net.input_gpu + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1); } } else if (l.reverse) { - reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, 
l.output_gpu); + reorg_ongpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu); }else { - reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu); + reorg_ongpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu); } } -void backward_reorg_layer_gpu(layer l, network_state state) +void backward_reorg_layer_gpu(layer l, network net) { if(l.flatten){ if(l.reverse){ - flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, state.delta); + flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, net.delta_gpu); }else{ - flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, state.delta); + flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, net.delta_gpu); } } else if (l.extra) { int i; for(i = 0; i < l.batch; ++i){ - copy_ongpu(l.inputs, l.delta_gpu + i*l.outputs, 1, state.delta + i*l.inputs, 1); + copy_ongpu(l.inputs, l.delta_gpu + i*l.outputs, 1, net.delta_gpu + i*l.inputs, 1); } } else if(l.reverse){ - reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, state.delta); + reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta_gpu); } else { - reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, state.delta); + reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta_gpu); } } #endif diff --git a/src/reorg_layer.h b/src/reorg_layer.h index 6b9c3040..e6513a5f 100644 --- a/src/reorg_layer.h +++ b/src/reorg_layer.h @@ -8,12 +8,12 @@ layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse, int flatten, int extra); void resize_reorg_layer(layer *l, int w, int h); -void forward_reorg_layer(const layer l, network_state state); -void backward_reorg_layer(const layer l, network_state state); +void forward_reorg_layer(const layer l, network net); +void backward_reorg_layer(const layer l, network net); #ifdef GPU -void forward_reorg_layer_gpu(layer l, network_state state); -void backward_reorg_layer_gpu(layer l, network_state state); +void forward_reorg_layer_gpu(layer l, network net); +void backward_reorg_layer_gpu(layer l, network net); #endif #endif diff --git a/src/rnn.c b/src/rnn.c index ccfdc55d..62d7d8d1 100644 --- a/src/rnn.c +++ b/src/rnn.c @@ -151,7 +151,7 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear, load_weights(&net, weightfile); } - int inputs = get_network_input_size(net); + int inputs = net.inputs; fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); int batch = net.batch; int steps = net.time_steps; @@ -176,7 +176,9 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear, p = get_rnn_data(text, offsets, inputs, size, streams, steps); } - float loss = train_network_datum(net, p.x, p.y) / (batch); + memcpy(net.input, p.x, net.inputs*net.batch*sizeof(float)); + memcpy(net.truth, p.y, net.truths*net.batch*sizeof(float)); + float loss = train_network_datum(net) / (batch); free(p.x); free(p.y); if (avg_loss < 0) avg_loss = loss; @@ -234,7 +236,7 @@ void test_char_rnn(char *cfgfile, char *weightfile, int num, char *seed, float t if(weightfile){ load_weights(&net, weightfile); } - int inputs = get_network_input_size(net); + int inputs = net.inputs; int i, j; for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp; @@ -291,7 +293,7 @@ void test_tactic_rnn(char *cfgfile, char *weightfile, int num, float temp, int r if(weightfile){ load_weights(&net, weightfile); } - int inputs = get_network_input_size(net); + int inputs = net.inputs; int i, j; for(i = 0; i < net.n; ++i) 
net.layers[i].temperature = temp; @@ -329,7 +331,7 @@ void valid_tactic_rnn(char *cfgfile, char *weightfile, char *seed) if(weightfile){ load_weights(&net, weightfile); } - int inputs = get_network_input_size(net); + int inputs = net.inputs; int count = 0; int words = 1; @@ -381,7 +383,7 @@ void valid_char_rnn(char *cfgfile, char *weightfile, char *seed) if(weightfile){ load_weights(&net, weightfile); } - int inputs = get_network_input_size(net); + int inputs = net.inputs; int count = 0; int words = 1; @@ -422,7 +424,7 @@ void vec_char_rnn(char *cfgfile, char *weightfile, char *seed) if(weightfile){ load_weights(&net, weightfile); } - int inputs = get_network_input_size(net); + int inputs = net.inputs; int c; int seed_len = strlen(seed); diff --git a/src/rnn_layer.c b/src/rnn_layer.c index 83fda13e..fb4f1084 100644 --- a/src/rnn_layer.c +++ b/src/rnn_layer.c @@ -80,10 +80,10 @@ void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, f update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay); } -void forward_rnn_layer(layer l, network_state state) +void forward_rnn_layer(layer l, network net) { - network_state s = {0}; - s.train = state.train; + network s = net; + s.train = net.train; int i; layer input_layer = *(l.input_layer); layer self_layer = *(l.self_layer); @@ -92,17 +92,17 @@ void forward_rnn_layer(layer l, network_state state) fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1); fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1); fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1); - if(state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1); + if(net.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1); for (i = 0; i < l.steps; ++i) { - s.input = state.input; + s.input = net.input; forward_connected_layer(input_layer, s); s.input = l.state; forward_connected_layer(self_layer, s); float *old_state = l.state; - if(state.train) l.state += l.hidden*l.batch; + if(net.train) l.state += l.hidden*l.batch; if(l.shortcut){ copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1); }else{ @@ -114,17 +114,17 @@ void forward_rnn_layer(layer l, network_state state) s.input = l.state; forward_connected_layer(output_layer, s); - state.input += l.inputs*l.batch; + net.input += l.inputs*l.batch; increment_layer(&input_layer, 1); increment_layer(&self_layer, 1); increment_layer(&output_layer, 1); } } -void backward_rnn_layer(layer l, network_state state) +void backward_rnn_layer(layer l, network net) { - network_state s = {0}; - s.train = state.train; + network s = net; + s.train = net.train; int i; layer input_layer = *(l.input_layer); layer self_layer = *(l.self_layer); @@ -160,8 +160,8 @@ void backward_rnn_layer(layer l, network_state state) copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1); if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1); - s.input = state.input + i*l.inputs*l.batch; - if(state.delta) s.delta = state.delta + i*l.inputs*l.batch; + s.input = net.input + i*l.inputs*l.batch; + if(net.delta) s.delta = net.delta + i*l.inputs*l.batch; else s.delta = 0; backward_connected_layer(input_layer, s); @@ -194,10 +194,10 @@ void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentu update_connected_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay); } -void forward_rnn_layer_gpu(layer l, network_state state) +void forward_rnn_layer_gpu(layer l, network net) { - network_state s = {0}; - 
s.train = state.train; + network s = net; + s.train = net.train; int i; layer input_layer = *(l.input_layer); layer self_layer = *(l.self_layer); @@ -206,17 +206,17 @@ void forward_rnn_layer_gpu(layer l, network_state state) fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1); fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1); fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1); - if(state.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1); + if(net.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1); for (i = 0; i < l.steps; ++i) { - s.input = state.input; + s.input_gpu = net.input_gpu; forward_connected_layer_gpu(input_layer, s); - s.input = l.state_gpu; + s.input_gpu = l.state_gpu; forward_connected_layer_gpu(self_layer, s); float *old_state = l.state_gpu; - if(state.train) l.state_gpu += l.hidden*l.batch; + if(net.train) l.state_gpu += l.hidden*l.batch; if(l.shortcut){ copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1); }else{ @@ -225,20 +225,20 @@ void forward_rnn_layer_gpu(layer l, network_state state) axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1); axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); - s.input = l.state_gpu; + s.input_gpu = l.state_gpu; forward_connected_layer_gpu(output_layer, s); - state.input += l.inputs*l.batch; + net.input_gpu += l.inputs*l.batch; increment_layer(&input_layer, 1); increment_layer(&self_layer, 1); increment_layer(&output_layer, 1); } } -void backward_rnn_layer_gpu(layer l, network_state state) +void backward_rnn_layer_gpu(layer l, network net) { - network_state s = {0}; - s.train = state.train; + network s = net; + s.train = net.train; int i; layer input_layer = *(l.input_layer); layer self_layer = *(l.self_layer); @@ -249,24 +249,24 @@ void backward_rnn_layer_gpu(layer l, network_state state) l.state_gpu += l.hidden*l.batch*l.steps; for (i = l.steps-1; i >= 0; --i) { - s.input = l.state_gpu; - s.delta = self_layer.delta_gpu; + s.input_gpu = l.state_gpu; + s.delta_gpu = self_layer.delta_gpu; backward_connected_layer_gpu(output_layer, s); l.state_gpu -= l.hidden*l.batch; copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1); - s.input = l.state_gpu; - s.delta = self_layer.delta_gpu - l.hidden*l.batch; - if (i == 0) s.delta = 0; + s.input_gpu = l.state_gpu; + s.delta_gpu = self_layer.delta_gpu - l.hidden*l.batch; + if (i == 0) s.delta_gpu = 0; backward_connected_layer_gpu(self_layer, s); //copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1); if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1); - s.input = state.input + i*l.inputs*l.batch; - if(state.delta) s.delta = state.delta + i*l.inputs*l.batch; - else s.delta = 0; + s.input_gpu = net.input_gpu + i*l.inputs*l.batch; + if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch; + else s.delta_gpu = 0; backward_connected_layer_gpu(input_layer, s); increment_layer(&input_layer, -1); diff --git a/src/rnn_layer.h b/src/rnn_layer.h index bb9478b9..782a90ef 100644 --- a/src/rnn_layer.h +++ b/src/rnn_layer.h @@ -9,13 +9,13 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log); -void forward_rnn_layer(layer l, network_state state); -void backward_rnn_layer(layer l, network_state state); +void forward_rnn_layer(layer l, network net); +void 
backward_rnn_layer(layer l, network net); void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay); #ifdef GPU -void forward_rnn_layer_gpu(layer l, network_state state); -void backward_rnn_layer_gpu(layer l, network_state state); +void forward_rnn_layer_gpu(layer l, network net); +void backward_rnn_layer_gpu(layer l, network net); void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); void push_rnn_layer(layer l); void pull_rnn_layer(layer l); diff --git a/src/rnn_vid.c b/src/rnn_vid.c index 1e2264e0..3042173f 100644 --- a/src/rnn_vid.c +++ b/src/rnn_vid.c @@ -103,7 +103,9 @@ void train_vid_rnn(char *cfgfile, char *weightfile) time=clock(); float_pair p = get_rnn_vid_data(extractor, paths, N, batch, steps); - float loss = train_network_datum(net, p.x, p.y) / (net.batch); + memcpy(net.input, p.x, net.inputs*net.batch*sizeof(float)); + memcpy(net.truth, p.y, net.truths*net.batch*sizeof(float)); + float loss = train_network_datum(net) / (net.batch); free(p.x); diff --git a/src/route_layer.c b/src/route_layer.c index dce71180..72267689 100644 --- a/src/route_layer.c +++ b/src/route_layer.c @@ -70,13 +70,13 @@ void resize_route_layer(route_layer *l, network *net) } -void forward_route_layer(const route_layer l, network_state state) +void forward_route_layer(const route_layer l, network net) { int i, j; int offset = 0; for(i = 0; i < l.n; ++i){ int index = l.input_layers[i]; - float *input = state.net.layers[index].output; + float *input = net.layers[index].output; int input_size = l.input_sizes[i]; for(j = 0; j < l.batch; ++j){ copy_cpu(input_size, input + j*input_size, 1, l.output + offset + j*l.outputs, 1); @@ -85,13 +85,13 @@ void forward_route_layer(const route_layer l, network_state state) } } -void backward_route_layer(const route_layer l, network_state state) +void backward_route_layer(const route_layer l, network net) { int i, j; int offset = 0; for(i = 0; i < l.n; ++i){ int index = l.input_layers[i]; - float *delta = state.net.layers[index].delta; + float *delta = net.layers[index].delta; int input_size = l.input_sizes[i]; for(j = 0; j < l.batch; ++j){ axpy_cpu(input_size, 1, l.delta + offset + j*l.outputs, 1, delta + j*input_size, 1); @@ -101,13 +101,13 @@ void backward_route_layer(const route_layer l, network_state state) } #ifdef GPU -void forward_route_layer_gpu(const route_layer l, network_state state) +void forward_route_layer_gpu(const route_layer l, network net) { int i, j; int offset = 0; for(i = 0; i < l.n; ++i){ int index = l.input_layers[i]; - float *input = state.net.layers[index].output_gpu; + float *input = net.layers[index].output_gpu; int input_size = l.input_sizes[i]; for(j = 0; j < l.batch; ++j){ copy_ongpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1); @@ -116,13 +116,13 @@ void forward_route_layer_gpu(const route_layer l, network_state state) } } -void backward_route_layer_gpu(const route_layer l, network_state state) +void backward_route_layer_gpu(const route_layer l, network net) { int i, j; int offset = 0; for(i = 0; i < l.n; ++i){ int index = l.input_layers[i]; - float *delta = state.net.layers[index].delta_gpu; + float *delta = net.layers[index].delta_gpu; int input_size = l.input_sizes[i]; for(j = 0; j < l.batch; ++j){ axpy_ongpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1); diff --git a/src/route_layer.h b/src/route_layer.h index 45467d95..1d40330f 100644 --- a/src/route_layer.h +++ b/src/route_layer.h @@ -6,13 +6,13 @@ typedef layer route_layer; 
route_layer make_route_layer(int batch, int n, int *input_layers, int *input_size); -void forward_route_layer(const route_layer l, network_state state); -void backward_route_layer(const route_layer l, network_state state); +void forward_route_layer(const route_layer l, network net); +void backward_route_layer(const route_layer l, network net); void resize_route_layer(route_layer *l, network *net); #ifdef GPU -void forward_route_layer_gpu(const route_layer l, network_state state); -void backward_route_layer_gpu(const route_layer l, network_state state); +void forward_route_layer_gpu(const route_layer l, network net); +void backward_route_layer_gpu(const route_layer l, network net); #endif #endif diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c index 8bca50fb..530d9980 100644 --- a/src/shortcut_layer.c +++ b/src/shortcut_layer.c @@ -36,32 +36,32 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int return l; } -void forward_shortcut_layer(const layer l, network_state state) +void forward_shortcut_layer(const layer l, network net) { - copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1); - shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output); + copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1); + shortcut_cpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output); activate_array(l.output, l.outputs*l.batch, l.activation); } -void backward_shortcut_layer(const layer l, network_state state) +void backward_shortcut_layer(const layer l, network net) { gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); - axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, state.delta, 1); - shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, state.net.layers[l.index].delta); + axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, net.delta, 1); + shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, net.layers[l.index].delta); } #ifdef GPU -void forward_shortcut_layer_gpu(const layer l, network_state state) +void forward_shortcut_layer_gpu(const layer l, network net) { - copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1); - shortcut_gpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu); + copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); + shortcut_gpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu); activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); } -void backward_shortcut_layer_gpu(const layer l, network_state state) +void backward_shortcut_layer_gpu(const layer l, network net) { gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); - axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, state.delta, 1); - shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, state.net.layers[l.index].delta_gpu); + axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, net.delta_gpu, 1); + shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, net.layers[l.index].delta_gpu); } #endif diff --git a/src/shortcut_layer.h b/src/shortcut_layer.h index c09a8097..32e4ebdc 100644 --- a/src/shortcut_layer.h +++ b/src/shortcut_layer.h @@ -5,12 +5,12 @@ #include "network.h" layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2); -void forward_shortcut_layer(const layer l, network_state state); -void 
backward_shortcut_layer(const layer l, network_state state); +void forward_shortcut_layer(const layer l, network net); +void backward_shortcut_layer(const layer l, network net); #ifdef GPU -void forward_shortcut_layer_gpu(const layer l, network_state state); -void backward_shortcut_layer_gpu(const layer l, network_state state); +void forward_shortcut_layer_gpu(const layer l, network net); +void backward_shortcut_layer_gpu(const layer l, network net); #endif #endif diff --git a/src/softmax_layer.c b/src/softmax_layer.c index 88f032fc..1eb6e972 100644 --- a/src/softmax_layer.c +++ b/src/softmax_layer.c @@ -32,24 +32,24 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups) return l; } -void forward_softmax_layer(const softmax_layer l, network_state state) +void forward_softmax_layer(const softmax_layer l, network net) { if(l.softmax_tree){ int i; int count = 0; for (i = 0; i < l.softmax_tree->groups; ++i) { int group_size = l.softmax_tree->group_size[i]; - softmax_cpu(state.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output + count); + softmax_cpu(net.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output + count); count += group_size; } } else { - softmax_cpu(state.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output); + softmax_cpu(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output); } } -void backward_softmax_layer(const softmax_layer l, network_state state) +void backward_softmax_layer(const softmax_layer l, network net) { - axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, state.delta, 1); + axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1); } #ifdef GPU @@ -59,24 +59,24 @@ void pull_softmax_layer_output(const softmax_layer layer) cuda_pull_array(layer.output_gpu, layer.output, layer.inputs*layer.batch); } -void forward_softmax_layer_gpu(const softmax_layer l, network_state state) +void forward_softmax_layer_gpu(const softmax_layer l, network net) { if(l.softmax_tree){ int i; int count = 0; for (i = 0; i < l.softmax_tree->groups; ++i) { int group_size = l.softmax_tree->group_size[i]; - softmax_gpu(state.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count); + softmax_gpu(net.input_gpu + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count); count += group_size; } } else { - softmax_gpu(state.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu); + softmax_gpu(net.input_gpu, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu); } } -void backward_softmax_layer_gpu(const softmax_layer layer, network_state state) +void backward_softmax_layer_gpu(const softmax_layer layer, network net) { - axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, state.delta, 1); + axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/softmax_layer.h b/src/softmax_layer.h index 821a8dd7..2e3ffe01 100644 --- a/src/softmax_layer.h +++ b/src/softmax_layer.h @@ -7,13 +7,13 @@ typedef layer softmax_layer; void softmax_array(float *input, int n, float temp, float *output); softmax_layer make_softmax_layer(int batch, int inputs, int groups); -void forward_softmax_layer(const softmax_layer l, network_state state); -void backward_softmax_layer(const softmax_layer l, network_state state); +void forward_softmax_layer(const softmax_layer 
l, network net); +void backward_softmax_layer(const softmax_layer l, network net); #ifdef GPU void pull_softmax_layer_output(const softmax_layer l); -void forward_softmax_layer_gpu(const softmax_layer l, network_state state); -void backward_softmax_layer_gpu(const softmax_layer l, network_state state); +void forward_softmax_layer_gpu(const softmax_layer l, network net); +void backward_softmax_layer_gpu(const softmax_layer l, network net); #endif #endif
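
Taken together, these changes fold the old per-call network_state (input, truth, delta, workspace, train, index) into the network struct itself, so every layer's forward/backward receives the network directly and callers stage data into net.input/net.truth instead of threading a state struct through each call. Below is a minimal sketch of the new calling convention, built only from the signatures declared in the patched network.h above (load_network, set_batch_network, network_predict, train_network_datum, free_network); the cfg/weights paths and the zeroed input/label buffers are placeholders, not part of the commit.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "network.h"
#include "utils.h"

int main()
{
    /* Placeholder files -- substitute a real cfg/weights pair. */
    network net = load_network("cfg/yolo.cfg", "yolo.weights", 0);
    set_batch_network(&net, 1);

    /* Inference: network_predict() now just sets net.input and runs
       forward_network(net); the net.output/net.outputs fields replace
       the removed get_network_output()/get_network_output_size(). */
    float *X = calloc(net.inputs*net.batch, sizeof(float));
    float *out = network_predict(net, X);
    printf("top class: %d\n", max_index(out, net.outputs));

    /* Training: copy one batch into the buffers the network now owns
       (memcpy counts bytes, hence the *sizeof(float)), then call the
       argument-free train_network_datum(net). */
    float *y = calloc(net.truths*net.batch, sizeof(float));
    memcpy(net.input, X, net.inputs*net.batch*sizeof(float));
    memcpy(net.truth, y, net.truths*net.batch*sizeof(float));
    float err = train_network_datum(net);
    printf("loss: %f\n", err);

    free(X);
    free(y);
    free_network(net);
    return 0;
}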