diff --git a/Makefile b/Makefile index 2f724f55..a0393b53 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ -GPU=1 -CUDNN=1 -OPENCV=1 +GPU=0 +CUDNN=0 +OPENCV=0 DEBUG=0 ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \ diff --git a/cfg/gru.cfg b/cfg/gru.cfg index ea0e22d9..1b5363ca 100644 --- a/cfg/gru.cfg +++ b/cfg/gru.cfg @@ -1,23 +1,24 @@ [net] -inputs=256 - -# Test -batch = 1 -time_steps=1 - -# Train -# batch = 512 -# time_steps=64 - subdivisions=1 +batch = 256 +inputs=256 momentum=0.9 -decay=0.001 -learning_rate=0.1 +decay=0.0 +time_steps=128 +learning_rate=.002 +adam=1 -burn_in=100 -policy=poly +policy=constant power=4 -max_batches=10000 +max_batches=400000 + +[gru] +batch_normalize=1 +output = 1024 + +[gru] +batch_normalize=1 +output = 1024 [gru] batch_normalize=1 diff --git a/examples/darknet.c b/examples/darknet.c index 31652b12..92f42bca 100644 --- a/examples/darknet.c +++ b/examples/darknet.c @@ -112,6 +112,26 @@ void operations(char *cfgfile) ops += 2l * l.n * l.size*l.size*l.c * l.out_h*l.out_w; } else if(l.type == CONNECTED){ ops += 2l * l.inputs * l.outputs; + } else if (l.type == RNN){ + ops += 2l * l.input_layer->inputs * l.input_layer->outputs; + ops += 2l * l.self_layer->inputs * l.self_layer->outputs; + ops += 2l * l.output_layer->inputs * l.output_layer->outputs; + } else if (l.type == GRU){ + ops += 2l * l.uz->inputs * l.uz->outputs; + ops += 2l * l.uh->inputs * l.uh->outputs; + ops += 2l * l.ur->inputs * l.ur->outputs; + ops += 2l * l.wz->inputs * l.wz->outputs; + ops += 2l * l.wh->inputs * l.wh->outputs; + ops += 2l * l.wr->inputs * l.wr->outputs; + } else if (l.type == LSTM){ + ops += 2l * l.uf->inputs * l.uf->outputs; + ops += 2l * l.ui->inputs * l.ui->outputs; + ops += 2l * l.ug->inputs * l.ug->outputs; + ops += 2l * l.uo->inputs * l.uo->outputs; + ops += 2l * l.wf->inputs * l.wf->outputs; + ops += 2l * l.wi->inputs * l.wi->outputs; + ops += 2l * l.wg->inputs * l.wg->outputs; + ops += 2l * l.wo->inputs * l.wo->outputs; } } printf("Floating Point Operations: %ld\n", ops); diff --git a/examples/lsd.c b/examples/lsd.c index 1b2b0b5f..8f5a1a1c 100644 --- a/examples/lsd.c +++ b/examples/lsd.c @@ -64,7 +64,7 @@ void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg int ax_size = anet.inputs*anet.batch; int ay_size = anet.truths*anet.batch; - fill_ongpu(ay_size, .9, anet.truth_gpu, 1); + fill_gpu(ay_size, .9, anet.truth_gpu, 1); anet.delta_gpu = cuda_make_array(0, ax_size); anet.train = 1; @@ -102,36 +102,36 @@ void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg forward_network_gpu(fnet, fstate); float *feats = fnet.layers[fnet.n - 2].output_gpu; - copy_ongpu(y_size, feats, 1, fstate.truth, 1); + copy_gpu(y_size, feats, 1, fstate.truth, 1); forward_network_gpu(gnet, gstate); float *gen = gnet.layers[gnet.n-1].output_gpu; - copy_ongpu(x_size, gen, 1, fstate.input, 1); + copy_gpu(x_size, gen, 1, fstate.input, 1); - fill_ongpu(x_size, 0, fstate.delta, 1); + fill_gpu(x_size, 0, fstate.delta, 1); forward_network_gpu(fnet, fstate); backward_network_gpu(fnet, fstate); //HERE astate.input = gen; - fill_ongpu(ax_size, 0, astate.delta, 1); + fill_gpu(ax_size, 0, astate.delta, 1); forward_network_gpu(anet, astate); backward_network_gpu(anet, astate); float *delta = imlayer.delta_gpu; - fill_ongpu(x_size, 0, delta, 1); - scal_ongpu(x_size, 100, astate.delta, 1); - scal_ongpu(x_size, .001, fstate.delta, 1); - axpy_ongpu(x_size, 1, fstate.delta, 1, delta, 1); - axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1); + fill_gpu(x_size, 0, delta, 1); + scal_gpu(x_size, 100, astate.delta, 1); + scal_gpu(x_size, .001, fstate.delta, 1); + axpy_gpu(x_size, 1, fstate.delta, 1, delta, 1); + axpy_gpu(x_size, 1, astate.delta, 1, delta, 1); - //fill_ongpu(x_size, 0, delta, 1); + //fill_gpu(x_size, 0, delta, 1); //cuda_push_array(delta, X, x_size); - //axpy_ongpu(x_size, -1, imlayer.output_gpu, 1, delta, 1); + //axpy_gpu(x_size, -1, imlayer.output_gpu, 1, delta, 1); //printf("pix error: %f\n", cuda_mag_array(delta, x_size)); printf("fea error: %f\n", cuda_mag_array(fstate.delta, x_size)); printf("adv error: %f\n", cuda_mag_array(astate.delta, x_size)); - //axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1); + //axpy_gpu(x_size, 1, astate.delta, 1, delta, 1); backward_network_gpu(gnet, gstate); @@ -273,7 +273,7 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear float *imerror = cuda_make_array(0, imlayer.outputs); float *ones_gpu = cuda_make_array(0, ay_size); - fill_ongpu(ay_size, .9, ones_gpu, 1); + fill_gpu(ay_size, .9, ones_gpu, 1); float aloss_avg = -1; float gloss_avg = -1; @@ -318,23 +318,23 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear *net.seen += net.batch; forward_network_gpu(net, gstate); - fill_ongpu(imlayer.outputs, 0, imerror, 1); + fill_gpu(imlayer.outputs, 0, imerror, 1); astate.input = imlayer.output_gpu; astate.delta = imerror; astate.truth = ones_gpu; forward_network_gpu(anet, astate); backward_network_gpu(anet, astate); - scal_ongpu(imlayer.outputs, .1, net.layers[net.n-1].delta_gpu, 1); + scal_gpu(imlayer.outputs, .1, net.layers[net.n-1].delta_gpu, 1); backward_network_gpu(net, gstate); - scal_ongpu(imlayer.outputs, 1000, imerror, 1); + scal_gpu(imlayer.outputs, 1000, imerror, 1); printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs)); printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs)); - axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1); + axpy_gpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1); gloss += get_network_cost(net) /(net.subdivisions*net.batch); @@ -533,9 +533,9 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, *gnet.seen += gnet.batch; forward_network_gpu(gnet); - fill_ongpu(imlayer.outputs*imlayer.batch, 0, imerror, 1); - fill_ongpu(anet.truths*anet.batch, .95, anet.truth_gpu, 1); - copy_ongpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1); + fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1); + fill_gpu(anet.truths*anet.batch, .95, anet.truth_gpu, 1); + copy_gpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1); anet.delta_gpu = imerror; forward_network_gpu(anet); backward_network_gpu(anet); @@ -543,13 +543,13 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, float genaloss = *anet.cost / anet.batch; printf("%f\n", genaloss); - scal_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1); - scal_ongpu(imlayer.outputs*imlayer.batch, .00, gnet.layers[gnet.n-1].delta_gpu, 1); + scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1); + scal_gpu(imlayer.outputs*imlayer.batch, .00, gnet.layers[gnet.n-1].delta_gpu, 1); printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch)); printf("features %f\n", cuda_mag_array(gnet.layers[gnet.n-1].delta_gpu, imlayer.outputs*imlayer.batch)); - axpy_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet.layers[gnet.n-1].delta_gpu, 1); + axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet.layers[gnet.n-1].delta_gpu, 1); backward_network_gpu(gnet); @@ -716,21 +716,21 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle *net.seen += net.batch; forward_network_gpu(net); - fill_ongpu(imlayer.outputs*imlayer.batch, 0, imerror, 1); - copy_ongpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1); - fill_ongpu(anet.inputs*anet.batch, .95, anet.truth_gpu, 1); + fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1); + copy_gpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1); + fill_gpu(anet.inputs*anet.batch, .95, anet.truth_gpu, 1); anet.delta_gpu = imerror; forward_network_gpu(anet); backward_network_gpu(anet); - scal_ongpu(imlayer.outputs*imlayer.batch, 1./100., net.layers[net.n-1].delta_gpu, 1); + scal_gpu(imlayer.outputs*imlayer.batch, 1./100., net.layers[net.n-1].delta_gpu, 1); - scal_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1); + scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1); printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch)); printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs*imlayer.batch)); - axpy_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net.layers[net.n-1].delta_gpu, 1); + axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net.layers[net.n-1].delta_gpu, 1); backward_network_gpu(net); @@ -876,7 +876,7 @@ void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfi float *imerror = cuda_make_array(0, imlayer.outputs); float *ones_gpu = cuda_make_array(0, ay_size); - fill_ongpu(ay_size, 1, ones_gpu, 1); + fill_gpu(ay_size, 1, ones_gpu, 1); float aloss_avg = -1; float gloss_avg = -1; @@ -902,15 +902,15 @@ void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfi *net.seen += net.batch; forward_network_gpu(net, gstate); - fill_ongpu(imlayer.outputs, 0, imerror, 1); + fill_gpu(imlayer.outputs, 0, imerror, 1); astate.input = imlayer.output_gpu; astate.delta = imerror; astate.truth = ones_gpu; forward_network_gpu(anet, astate); backward_network_gpu(anet, astate); - scal_ongpu(imlayer.outputs, 1, imerror, 1); - axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1); + scal_gpu(imlayer.outputs, 1, imerror, 1); + axpy_gpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1); backward_network_gpu(net, gstate); diff --git a/examples/nightmare.c b/examples/nightmare.c index fe7c5f7f..6763f116 100644 --- a/examples/nightmare.c +++ b/examples/nightmare.c @@ -50,7 +50,7 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa cuda_push_array(net->input_gpu, im.data, net->inputs); forward_network_gpu(*net); - copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1); + copy_gpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1); cuda_pull_array(last.delta_gpu, last.delta, last.outputs); calculate_loss(last.delta, last.delta, last.outputs, thresh); @@ -141,7 +141,7 @@ void reconstruct_picture(network net, float *features, image recon, image update forward_network_gpu(net); cuda_push_array(l.delta_gpu, features, l.outputs); - axpy_ongpu(l.outputs, -1, l.output_gpu, 1, l.delta_gpu, 1); + axpy_gpu(l.outputs, -1, l.output_gpu, 1, l.delta_gpu, 1); backward_network_gpu(net); cuda_pull_array(net.delta_gpu, delta.data, delta.w*delta.h*delta.c); diff --git a/examples/rnn.c b/examples/rnn.c index 1c45711d..8d1fa242 100644 --- a/examples/rnn.c +++ b/examples/rnn.c @@ -114,7 +114,10 @@ void reset_rnn_state(network net, int b) #ifdef GPU layer l = net.layers[i]; if(l.state_gpu){ - fill_ongpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1); + fill_gpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1); + } + if(l.h_gpu){ + fill_gpu(l.outputs, 0, l.h_gpu + l.outputs*b, 1); } #endif } diff --git a/examples/segmenter.c b/examples/segmenter.c index 325593ed..e3804d37 100644 --- a/examples/segmenter.c +++ b/examples/segmenter.c @@ -27,6 +27,11 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, } srand(time(0)); network net = nets[0]; + image pred = get_network_image(net); + + int div = net.w/pred.w; + assert(pred.w * div == net.w); + assert(pred.h * div == net.h); int imgs = net.batch * net.subdivisions * ngpus; @@ -46,6 +51,7 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, args.w = net.w; args.h = net.h; args.threads = 32; + args.scale = div; args.min = net.min_crop; args.max = net.max_crop; @@ -75,15 +81,6 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, pthread_join(load_thread, 0); train = buffer; load_thread = load_data(args); - image tr = float_to_image(net.w, net.h, 81, train.y.vals[0]); - image im = float_to_image(net.w, net.h, net.c, train.X.vals[0]); - image mask = mask_to_rgb(tr); - show_image(im, "input"); - show_image(mask, "truth"); -#ifdef OPENCV - cvWaitKey(100); -#endif - free_image(mask); printf("Loaded: %lf seconds\n", sec(clock()-time)); time=clock(); @@ -98,6 +95,20 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, #else loss = train_network(net, train); #endif + if(1){ + image tr = float_to_image(net.w/div, net.h/div, 80, train.y.vals[net.batch]); + image im = float_to_image(net.w, net.h, net.c, train.X.vals[net.batch]); + image mask = mask_to_rgb(tr); + image prmask = mask_to_rgb(pred); + show_image(im, "input"); + show_image(prmask, "pred"); + show_image(mask, "truth"); +#ifdef OPENCV + cvWaitKey(100); +#endif + free_image(mask); + free_image(prmask); + } if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); diff --git a/include/darknet.h b/include/darknet.h index 8d4366b6..8ae68eb4 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -84,7 +84,7 @@ typedef enum { } LAYER_TYPE; typedef enum{ - SSE, MASKED, L1, SMOOTH + SSE, MASKED, L1, SEG, SMOOTH } COST_TYPE; typedef struct{ @@ -203,6 +203,8 @@ struct layer{ float * forgot_state; float * forgot_delta; float * state_delta; + float * combine_cpu; + float * combine_delta_cpu; float * concat; float * concat_delta; @@ -271,6 +273,10 @@ struct layer{ struct layer *self_layer; struct layer *output_layer; + struct layer *reset_layer; + struct layer *update_layer; + struct layer *state_layer; + struct layer *input_gate_layer; struct layer *state_gate_layer; struct layer *input_save_layer; @@ -335,6 +341,9 @@ struct layer{ float *bias_v_gpu; float *scale_v_gpu; + float * combine_gpu; + float * combine_delta_gpu; + float * prev_state_gpu; float * forgot_state_gpu; float * forgot_delta_gpu; @@ -575,10 +584,10 @@ void normalize_cpu(float *x, float *mean, float *variance, int batch, int filter int best_3d_shift_r(image a, image b, int min, int max); #ifdef GPU -void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY); -void fill_ongpu(int N, float ALPHA, float * X, int INCX); -void scal_ongpu(int N, float ALPHA, float * X, int INCX); -void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY); +void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY); +void fill_gpu(int N, float ALPHA, float * X, int INCX); +void scal_gpu(int N, float ALPHA, float * X, int INCX); +void copy_gpu(int N, float * X, int INCX, float * Y, int INCY); void cuda_set_device(int n); void cuda_free(float *x_gpu); diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index 994e2068..73530056 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -152,13 +152,13 @@ __global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delt if(i < n) delta[i] *= gradient_kernel(x[i], a); } -extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a) +extern "C" void activate_array_gpu(float *x, int n, ACTIVATION a) { activate_array_kernel<<>>(x, n, a); check_error(cudaPeekAtLastError()); } -extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta) +extern "C" void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta) { gradient_array_kernel<<>>(x, n, a, delta); check_error(cudaPeekAtLastError()); diff --git a/src/activation_layer.c b/src/activation_layer.c index de4e4b7d..b4ba9539 100644 --- a/src/activation_layer.c +++ b/src/activation_layer.c @@ -51,13 +51,13 @@ void backward_activation_layer(layer l, network net) void forward_activation_layer_gpu(layer l, network net) { - copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); - activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); + activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation); } void backward_activation_layer_gpu(layer l, network net) { - gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); - copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1); + gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/activations.h b/src/activations.h index 54ab3c22..d456dbe3 100644 --- a/src/activations.h +++ b/src/activations.h @@ -12,8 +12,8 @@ float gradient(float x, ACTIVATION a); void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta); void activate_array(float *x, const int n, const ACTIVATION a); #ifdef GPU -void activate_array_ongpu(float *x, int n, ACTIVATION a); -void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta); +void activate_array_gpu(float *x, int n, ACTIVATION a); +void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta); #endif static inline float stair_activate(float x) diff --git a/src/batchnorm_layer.c b/src/batchnorm_layer.c index 4862531d..ebff387c 100644 --- a/src/batchnorm_layer.c +++ b/src/batchnorm_layer.c @@ -188,8 +188,8 @@ void push_batchnorm_layer(layer l) void forward_batchnorm_layer_gpu(layer l, network net) { - if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); + if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); + copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); if (net.train) { #ifdef CUDNN float one = 1; @@ -215,14 +215,14 @@ void forward_batchnorm_layer_gpu(layer l, network net) fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu); fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu); - scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1); - axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1); - scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1); - axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1); + scal_gpu(l.out_c, .99, l.rolling_mean_gpu, 1); + axpy_gpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1); + scal_gpu(l.out_c, .99, l.rolling_variance_gpu, 1); + axpy_gpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); + copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w); - copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1); + copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1); scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h); @@ -263,7 +263,7 @@ void backward_batchnorm_layer_gpu(layer l, network net) .00001, l.mean_gpu, l.variance_gpu); - copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1); #else backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h); backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu); @@ -274,6 +274,6 @@ void backward_batchnorm_layer_gpu(layer l, network net) fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu); normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu); #endif - if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1); + if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/blas.c b/src/blas.c index 668b12c3..d25c1969 100644 --- a/src/blas.c +++ b/src/blas.c @@ -55,6 +55,16 @@ void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c) } } +void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc) +{ + int i; + for(i = 0; i < n; ++i){ + if(da) da[i] += dc[i] * s[i]; + if(db) db[i] += dc[i] * (1-s[i]); + ds[i] += dc[i] * (a[i] - b[i]); + } +} + void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out) { int stride = w1/w2; @@ -162,12 +172,48 @@ void fill_cpu(int N, float ALPHA, float *X, int INCX) for(i = 0; i < N; ++i) X[i*INCX] = ALPHA; } +void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT) +{ + int i, j; + int index = 0; + for(j = 0; j < B; ++j) { + for(i = 0; i < NX; ++i){ + if(X) X[j*NX + i] += OUT[index]; + ++index; + } + for(i = 0; i < NY; ++i){ + if(Y) Y[j*NY + i] += OUT[index]; + ++index; + } + } +} + +void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT) +{ + int i, j; + int index = 0; + for(j = 0; j < B; ++j) { + for(i = 0; i < NX; ++i){ + OUT[index++] = X[j*NX + i]; + } + for(i = 0; i < NY; ++i){ + OUT[index++] = Y[j*NY + i]; + } + } +} + void copy_cpu(int N, float *X, int INCX, float *Y, int INCY) { int i; for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX]; } +void mult_add_into_cpu(int N, float *X, float *Y, float *Z) +{ + int i; + for(i = 0; i < N; ++i) Z[i] += X[i]*Y[i]; +} + void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error) { int i; diff --git a/src/blas.h b/src/blas.h index 6291746f..a8408f32 100644 --- a/src/blas.h +++ b/src/blas.h @@ -10,8 +10,12 @@ void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward void test_blas(); +void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT); +void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT); +void mult_add_into_cpu(int N, float *X, float *Y, float *Z); + void const_cpu(int N, float ALPHA, float *X, int INCX); -void constrain_ongpu(int N, float ALPHA, float * X, int INCX); +void constrain_gpu(int N, float ALPHA, float * X, int INCX); void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); void mul_cpu(int N, float *X, int INCX, float *Y, int INCY); @@ -33,6 +37,7 @@ void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error) void l2_cpu(int n, float *pred, float *truth, float *delta, float *error); void l1_cpu(int n, float *pred, float *truth, float *delta, float *error); void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c); +void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc); void softmax(float *input, int n, float temp, int stride, float *output); void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output); @@ -41,16 +46,17 @@ void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, i #include "cuda.h" #include "tree.h" -void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY); -void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); -void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY); -void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); -void add_ongpu(int N, float ALPHA, float * X, int INCX); -void supp_ongpu(int N, float ALPHA, float * X, int INCX); -void mask_ongpu(int N, float * X, float mask_num, float * mask); -void const_ongpu(int N, float ALPHA, float *X, int INCX); -void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); -void mul_ongpu(int N, float *X, int INCX, float *Y, int INCY); +void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY); +void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); +void copy_gpu(int N, float * X, int INCX, float * Y, int INCY); +void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); +void add_gpu(int N, float ALPHA, float * X, int INCX); +void supp_gpu(int N, float ALPHA, float * X, int INCX); +void mask_gpu(int N, float * X, float mask_num, float * mask); +void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale); +void const_gpu(int N, float ALPHA, float *X, int INCX); +void pow_gpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); +void mul_gpu(int N, float *X, int INCX, float *Y, int INCY); void mean_gpu(float *x, int batch, int filters, int spatial, float *mean); void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance); @@ -76,14 +82,16 @@ void l1_gpu(int n, float *pred, float *truth, float *delta, float *error); void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc); void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c); void mult_add_into_gpu(int num, float *a, float *b, float *c); +void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT); +void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT); -void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out); +void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out); void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output); void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t); void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t); -void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out); +void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out); void softmax_tree(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier); #endif diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index 6fdfd3fa..484d469e 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -176,16 +176,16 @@ extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2 extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t) { - scal_ongpu(n, B1, m, 1); - scal_ongpu(n, B2, v, 1); - axpy_ongpu(n, -decay*batch, w, 1, d, 1); + scal_gpu(n, B1, m, 1); + scal_gpu(n, B2, v, 1); + axpy_gpu(n, -decay*batch, w, 1, d, 1); - axpy_ongpu(n, (1-B1), d, 1, m, 1); - mul_ongpu(n, d, 1, d, 1); - axpy_ongpu(n, (1-B2), d, 1, v, 1); + axpy_gpu(n, (1-B1), d, 1, m, 1); + mul_gpu(n, d, 1, d, 1); + axpy_gpu(n, (1-B2), d, 1, v, 1); - adam_gpu(n, w, m, v, B1, B2, rate/batch, eps, t); - fill_ongpu(n, 0, d, 1); + adam_gpu(n, w, m, v, B1, B2, rate, eps, t); + fill_gpu(n, 0, d, 1); } __global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial) @@ -556,35 +556,35 @@ extern "C" void variance_gpu(float *x, float *mean, int batch, int filters, int check_error(cudaPeekAtLastError()); } -extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) +extern "C" void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) { - axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY); + axpy_gpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY); } -extern "C" void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) +extern "C" void pow_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) { pow_kernel<<>>(N, ALPHA, X, INCX, Y, INCY); check_error(cudaPeekAtLastError()); } -extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) +extern "C" void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) { axpy_kernel<<>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY); check_error(cudaPeekAtLastError()); } -extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY) +extern "C" void copy_gpu(int N, float * X, int INCX, float * Y, int INCY) { - copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY); + copy_gpu_offset(N, X, 0, INCX, Y, 0, INCY); } -extern "C" void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY) +extern "C" void mul_gpu(int N, float * X, int INCX, float * Y, int INCY) { mul_kernel<<>>(N, X, INCX, Y, INCY); check_error(cudaPeekAtLastError()); } -extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) +extern "C" void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) { copy_kernel<<>>(N, X, OFFX, INCX, Y, OFFY, INCY); check_error(cudaPeekAtLastError()); @@ -607,58 +607,70 @@ __global__ void flatten_kernel(int N, float *x, int spatial, int layers, int bat else out[i1] = x[i2]; } -extern "C" void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out) +extern "C" void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out) { int size = spatial*batch*layers; flatten_kernel<<>>(size, x, spatial, layers, batch, forward, out); check_error(cudaPeekAtLastError()); } -extern "C" void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out) +extern "C" void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out) { int size = w*h*c*batch; reorg_kernel<<>>(size, x, w, h, c, batch, stride, forward, out); check_error(cudaPeekAtLastError()); } -extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask) +__global__ void scale_mask_kernel(int n, float *x, float mask_num, float *mask, float scale) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < n && mask[i] == mask_num) x[i] *= scale; +} + +extern "C" void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale) +{ + scale_mask_kernel<<>>(N, X, mask_num, mask, scale); + check_error(cudaPeekAtLastError()); +} + +extern "C" void mask_gpu(int N, float * X, float mask_num, float * mask) { mask_kernel<<>>(N, X, mask_num, mask); check_error(cudaPeekAtLastError()); } -extern "C" void const_ongpu(int N, float ALPHA, float * X, int INCX) +extern "C" void const_gpu(int N, float ALPHA, float * X, int INCX) { const_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } -extern "C" void constrain_ongpu(int N, float ALPHA, float * X, int INCX) +extern "C" void constrain_gpu(int N, float ALPHA, float * X, int INCX) { constrain_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } -extern "C" void add_ongpu(int N, float ALPHA, float * X, int INCX) +extern "C" void add_gpu(int N, float ALPHA, float * X, int INCX) { add_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } -extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX) +extern "C" void scal_gpu(int N, float ALPHA, float * X, int INCX) { scal_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } -extern "C" void supp_ongpu(int N, float ALPHA, float * X, int INCX) +extern "C" void supp_gpu(int N, float ALPHA, float * X, int INCX) { supp_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } -extern "C" void fill_ongpu(int N, float ALPHA, float * X, int INCX) +extern "C" void fill_gpu(int N, float ALPHA, float * X, int INCX) { fill_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); @@ -686,6 +698,9 @@ extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int int minw = (w1 < w2) ? w1 : w2; int minh = (h1 < h2) ? h1 : h2; int minc = (c1 < c2) ? c1 : c2; + assert(w1 == w2); + assert(h1 == h2); + assert(c1 == c2); int stride = w1/w2; int sample = w2/w1; @@ -765,6 +780,46 @@ __global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float * } } +__global__ void deinter_kernel(int NX, float *X, int NY, float *Y, int B, float *OUT) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < (NX+NY)*B){ + int b = i / (NX+NY); + int j = i % (NX+NY); + if (j < NX){ + if(X) X[b*NX + j] += OUT[i]; + } else { + if(Y) Y[b*NY + j - NX] += OUT[i]; + } + } +} + +extern "C" void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT) +{ + deinter_kernel<<>>(NX, X, NY, Y, B, OUT); + check_error(cudaPeekAtLastError()); +} + +__global__ void inter_kernel(int NX, float *X, int NY, float *Y, int B, float *OUT) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < (NX+NY)*B){ + int b = i / (NX+NY); + int j = i % (NX+NY); + if (j < NX){ + OUT[i] = X[b*NX + j]; + } else { + OUT[i] = Y[b*NY + j - NX]; + } + } +} + +extern "C" void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT) +{ + inter_kernel<<>>(NX, X, NY, Y, B, OUT); + check_error(cudaPeekAtLastError()); +} + extern "C" void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c) { weighted_sum_kernel<<>>(num, a, b, s, c); @@ -776,8 +831,8 @@ __global__ void weighted_delta_kernel(int n, float *a, float *b, float *s, float int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if(i < n){ if(da) da[i] += dc[i] * s[i]; - db[i] += dc[i] * (1-s[i]); - ds[i] += dc[i] * a[i] + dc[i] * -b[i]; + if(db) db[i] += dc[i] * (1-s[i]); + ds[i] += dc[i] * (a[i] - b[i]); } } diff --git a/src/col2im.h b/src/col2im.h index 02374972..3fbe0530 100644 --- a/src/col2im.h +++ b/src/col2im.h @@ -6,7 +6,7 @@ void col2im_cpu(float* data_col, int ksize, int stride, int pad, float* data_im); #ifdef GPU -void col2im_ongpu(float *data_col, +void col2im_gpu(float *data_col, int channels, int height, int width, int ksize, int stride, int pad, float *data_im); #endif diff --git a/src/col2im_kernels.cu b/src/col2im_kernels.cu index aed2df9b..ba45e0fd 100644 --- a/src/col2im_kernels.cu +++ b/src/col2im_kernels.cu @@ -41,7 +41,7 @@ __global__ void col2im_gpu_kernel(const int n, const float* data_col, } } -void col2im_ongpu(float *data_col, +void col2im_gpu(float *data_col, int channels, int height, int width, int ksize, int stride, int pad, float *data_im){ // We are going to launch channels * height_col * width_col kernels, each diff --git a/src/connected_layer.c b/src/connected_layer.c index 5037e748..353f4e56 100644 --- a/src/connected_layer.c +++ b/src/connected_layer.c @@ -271,23 +271,23 @@ void update_connected_layer_gpu(layer l, update_args a) adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t); } }else{ - axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); - scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1); + axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); + scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1); if(l.batch_normalize){ - axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); - scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1); + axpy_gpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); + scal_gpu(l.outputs, momentum, l.scale_updates_gpu, 1); } - axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1); + axpy_gpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_gpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_gpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1); } } void forward_connected_layer_gpu(layer l, network net) { - fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); + fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1); int m = l.batch; int k = l.inputs; @@ -295,20 +295,20 @@ void forward_connected_layer_gpu(layer l, network net) float * a = net.input_gpu; float * b = l.weights_gpu; float * c = l.output_gpu; - gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n); + gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n); if (l.batch_normalize) { forward_batchnorm_layer_gpu(l, net); } else { add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.outputs, 1); } - activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation); } void backward_connected_layer_gpu(layer l, network net) { - constrain_ongpu(l.outputs*l.batch, 5, l.delta_gpu, 1); - gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1); + gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); if(l.batch_normalize){ backward_batchnorm_layer_gpu(l, net); } else { @@ -321,7 +321,7 @@ void backward_connected_layer_gpu(layer l, network net) float * a = l.delta_gpu; float * b = net.input_gpu; float * c = l.weight_updates_gpu; - gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n); + gemm_gpu(1,0,m,n,k,1,a,m,b,n,1,c,n); m = l.batch; k = l.outputs; @@ -331,6 +331,6 @@ void backward_connected_layer_gpu(layer l, network net) b = l.weights_gpu; c = net.delta_gpu; - if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n); + if(c) gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n); } #endif diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index cc002182..b9b6f455 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -72,7 +72,7 @@ void binarize_weights_gpu(float *weights, int n, int size, float *binary) void forward_convolutional_layer_gpu(convolutional_layer l, network net) { - fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); + fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1); if(l.binary){ binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu); swap_binary(&l); @@ -107,11 +107,11 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net) int k = l.size*l.size*l.c; int n = l.out_w*l.out_h; for(i = 0; i < l.batch; ++i){ - im2col_ongpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace); + im2col_gpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace); float * a = l.weights_gpu; float * b = net.workspace; float * c = l.output_gpu; - gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n); + gemm_gpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n); } #endif @@ -121,7 +121,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net) add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); } - activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation); //if(l.dot > 0) dot_error_gpu(l); if(l.binary || l.xnor) swap_binary(&l); } @@ -173,8 +173,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net) if(l.smooth){ smooth_layer(l, 5, l.smooth); } - constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); - gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1); + gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); if(l.batch_normalize){ @@ -217,7 +217,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net) l.dsrcTensorDesc, net.delta_gpu); if(l.binary || l.xnor) swap_binary(&l); - if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu); + if(l.xnor) gradient_array_gpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu); } #else @@ -231,8 +231,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net) float * b = net.workspace; float * c = l.weight_updates_gpu; - im2col_ongpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace); - gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n); + im2col_gpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace); + gemm_gpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n); if(net.delta_gpu){ if(l.binary || l.xnor) swap_binary(&l); @@ -240,13 +240,13 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net) float * b = l.delta_gpu; float * c = net.workspace; - gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k); + gemm_gpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k); - col2im_ongpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu + i*l.c*l.h*l.w); + col2im_gpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu + i*l.c*l.h*l.w); if(l.binary || l.xnor) { swap_binary(&l); } - if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w); + if(l.xnor) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w); } } #endif @@ -294,16 +294,16 @@ void update_convolutional_layer_gpu(layer l, update_args a) adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t); } }else{ - axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_ongpu(size, momentum, l.weight_updates_gpu, 1); + axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_gpu(size, momentum, l.weight_updates_gpu, 1); - axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); - scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1); + axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); + scal_gpu(l.n, momentum, l.bias_updates_gpu, 1); if(l.scales_gpu){ - axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); - scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1); + axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); + scal_gpu(l.n, momentum, l.scale_updates_gpu, 1); } } } diff --git a/src/cost_layer.c b/src/cost_layer.c index 83e258b9..7ef1094b 100644 --- a/src/cost_layer.c +++ b/src/cost_layer.c @@ -9,6 +9,7 @@ COST_TYPE get_cost_type(char *s) { + if (strcmp(s, "seg")==0) return SEG; if (strcmp(s, "sse")==0) return SSE; if (strcmp(s, "masked")==0) return MASKED; if (strcmp(s, "smooth")==0) return SMOOTH; @@ -20,6 +21,8 @@ COST_TYPE get_cost_type(char *s) char *get_cost_string(COST_TYPE a) { switch(a){ + case SEG: + return "seg"; case SSE: return "sse"; case MASKED: @@ -122,11 +125,11 @@ void forward_cost_layer_gpu(cost_layer l, network net) { if (!net.truth_gpu) return; if(l.smooth){ - scal_ongpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1); - add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1); + scal_gpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1); + add_gpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1); } if (l.cost_type == MASKED) { - mask_ongpu(l.batch*l.inputs, net.input_gpu, SECRET_NUM, net.truth_gpu); + mask_gpu(l.batch*l.inputs, net.input_gpu, SECRET_NUM, net.truth_gpu); } if(l.cost_type == SMOOTH){ @@ -137,6 +140,11 @@ void forward_cost_layer_gpu(cost_layer l, network net) l2_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu); } + if (l.cost_type == SEG && l.noobject_scale != 1) { + scale_mask_gpu(l.batch*l.inputs, l.delta_gpu, 0, net.truth_gpu, l.noobject_scale); + scale_mask_gpu(l.batch*l.inputs, l.output_gpu, 0, net.truth_gpu, l.noobject_scale); + } + if(l.ratio){ cuda_pull_array(l.delta_gpu, l.delta, l.batch*l.inputs); qsort(l.delta, l.batch*l.inputs, sizeof(float), float_abs_compare); @@ -144,11 +152,11 @@ void forward_cost_layer_gpu(cost_layer l, network net) float thresh = l.delta[n]; thresh = 0; printf("%f\n", thresh); - supp_ongpu(l.batch*l.inputs, thresh, l.delta_gpu, 1); + supp_gpu(l.batch*l.inputs, thresh, l.delta_gpu, 1); } if(l.thresh){ - supp_ongpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1); + supp_gpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1); } cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs); @@ -157,7 +165,7 @@ void forward_cost_layer_gpu(cost_layer l, network net) void backward_cost_layer_gpu(const cost_layer l, network net) { - axpy_ongpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, net.delta_gpu, 1); + axpy_gpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 2554cf9c..e0ff78a2 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -209,10 +209,10 @@ void forward_crnn_layer_gpu(layer l, network net) layer self_layer = *(l.self_layer); layer output_layer = *(l.output_layer); - fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1); - fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1); - fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1); - if(net.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1); + fill_gpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1); + fill_gpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1); + if(net.train) fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1); for (i = 0; i < l.steps; ++i) { s.input_gpu = net.input_gpu; @@ -224,12 +224,12 @@ void forward_crnn_layer_gpu(layer l, network net) float *old_state = l.state_gpu; if(net.train) l.state_gpu += l.hidden*l.batch; if(l.shortcut){ - copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1); + copy_gpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1); }else{ - fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1); + fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1); } - axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1); - axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); + axpy_gpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1); + axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); s.input_gpu = l.state_gpu; forward_convolutional_layer_gpu(output_layer, s); @@ -254,8 +254,8 @@ void backward_crnn_layer_gpu(layer l, network net) increment_layer(&output_layer, l.steps - 1); l.state_gpu += l.hidden*l.batch*l.steps; for (i = l.steps-1; i >= 0; --i) { - copy_ongpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1); - axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); + copy_gpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1); + axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); s.input_gpu = l.state_gpu; s.delta_gpu = self_layer.delta_gpu; @@ -268,8 +268,8 @@ void backward_crnn_layer_gpu(layer l, network net) if (i == 0) s.delta_gpu = 0; backward_convolutional_layer_gpu(self_layer, s); - copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1); - if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1); + copy_gpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1); + if (i > 0 && l.shortcut) axpy_gpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1); s.input_gpu = net.input_gpu + i*l.inputs*l.batch; if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch; else s.delta_gpu = 0; diff --git a/src/cuda.c b/src/cuda.c index e5114037..b5c0c329 100644 --- a/src/cuda.c +++ b/src/cuda.c @@ -97,7 +97,7 @@ float *cuda_make_array(float *x, size_t n) status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice); check_error(status); } else { - fill_ongpu(n, 0, x_gpu, 1); + fill_gpu(n, 0, x_gpu, 1); } if(!x_gpu) error("Cuda malloc failed\n"); return x_gpu; diff --git a/src/data.c b/src/data.c index c227b6f8..80ca1313 100644 --- a/src/data.c +++ b/src/data.c @@ -551,6 +551,33 @@ void exclusive_image(image src) } image get_segmentation_image(char *path, int w, int h, int classes) +{ + char labelpath[4096]; + find_replace(path, "images", "mask", labelpath); + find_replace(labelpath, "JPEGImages", "mask", labelpath); + find_replace(labelpath, ".jpg", ".txt", labelpath); + find_replace(labelpath, ".JPG", ".txt", labelpath); + find_replace(labelpath, ".JPEG", ".txt", labelpath); + image mask = make_image(w, h, classes); + FILE *file = fopen(labelpath, "r"); + if(!file) file_error(labelpath); + char buff[32788]; + int id; + image part = make_image(w, h, 1); + while(fscanf(file, "%d %s", &id, buff) == 2){ + int n = 0; + int *rle = read_intlist(buff, &n, 0); + load_rle(part, rle, n); + or_image(part, mask, id); + free(rle); + } + //exclusive_image(mask); + fclose(file); + free_image(part); + return mask; +} + +image get_segmentation_image2(char *path, int w, int h, int classes) { char labelpath[4096]; find_replace(path, "images", "mask", labelpath); @@ -584,7 +611,7 @@ image get_segmentation_image(char *path, int w, int h, int classes) return mask; } -data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int min, int max, float angle, float aspect, float hue, float saturation, float exposure) +data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int min, int max, float angle, float aspect, float hue, float saturation, float exposure, int div) { char **random_paths = get_random_paths(paths, n, m); int i; @@ -597,12 +624,14 @@ data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int mi d.y.rows = n; - d.y.cols = h*w*(classes+1); + d.y.cols = h*w*classes/div/div; d.y.vals = calloc(d.X.rows, sizeof(float*)); for(i = 0; i < n; ++i){ image orig = load_image_color(random_paths[i], 0, 0); augment_args a = random_augment_args(orig, angle, aspect, min, max, w, h); + a.dx = 0; + a.dy = 0; image sized = rotate_crop_image(orig, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect); int flip = rand()%2; @@ -612,7 +641,7 @@ data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int mi image mask = get_segmentation_image(random_paths[i], orig.w, orig.h, classes); //image mask = make_image(orig.w, orig.h, classes+1); - image sized_m = rotate_crop_image(mask, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect); + image sized_m = rotate_crop_image(mask, a.rad, a.scale/div, a.w/div, a.h/div, a.dx, a.dy, a.aspect); if(flip) flip_image(sized_m); d.y.vals[i] = sized_m.data; @@ -874,7 +903,7 @@ void *load_thread(void *ptr) } else if (a.type == WRITING_DATA){ *a.d = load_data_writing(a.paths, a.n, a.m, a.w, a.h, a.out_w, a.out_h); } else if (a.type == SEGMENTATION_DATA){ - *a.d = load_data_seg(a.n, a.paths, a.m, a.w, a.h, a.classes, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure); + *a.d = load_data_seg(a.n, a.paths, a.m, a.w, a.h, a.classes, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure, a.scale); } else if (a.type == REGION_DATA){ *a.d = load_data_region(a.n, a.paths, a.m, a.w, a.h, a.num_boxes, a.classes, a.jitter, a.hue, a.saturation, a.exposure); } else if (a.type == DETECTION_DATA){ diff --git a/src/deconvolutional_kernels.cu b/src/deconvolutional_kernels.cu index 25026375..d7e29462 100644 --- a/src/deconvolutional_kernels.cu +++ b/src/deconvolutional_kernels.cu @@ -22,31 +22,31 @@ extern "C" void forward_deconvolutional_layer_gpu(layer l, network net) int n = l.h*l.w; int k = l.c; - fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); + fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1); for(i = 0; i < l.batch; ++i){ float *a = l.weights_gpu; float *b = net.input_gpu + i*l.c*l.h*l.w; float *c = net.workspace; - gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n); + gemm_gpu(1,0,m,n,k,1,a,m,b,n,0,c,n); - col2im_ongpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs); + col2im_gpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs); } if (l.batch_normalize) { forward_batchnorm_layer_gpu(l, net); } else { add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); } - activate_array_ongpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation); + activate_array_gpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation); } extern "C" void backward_deconvolutional_layer_gpu(layer l, network net) { int i; - constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); - gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1); + gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); if(l.batch_normalize){ backward_batchnorm_layer_gpu(l, net); @@ -65,9 +65,9 @@ extern "C" void backward_deconvolutional_layer_gpu(layer l, network net) float *b = net.workspace; float *c = l.weight_updates_gpu; - im2col_ongpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w, + im2col_gpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, b); - gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n); + gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n); if(net.delta_gpu){ int m = l.c; @@ -78,7 +78,7 @@ extern "C" void backward_deconvolutional_layer_gpu(layer l, network net) float *b = net.workspace; float *c = net.delta_gpu + i*n*m; - gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n); + gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n); } } } @@ -125,16 +125,16 @@ void update_deconvolutional_layer_gpu(layer l, update_args a) adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t); } }else{ - axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_ongpu(size, momentum, l.weight_updates_gpu, 1); + axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_gpu(size, momentum, l.weight_updates_gpu, 1); - axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); - scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1); + axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); + scal_gpu(l.n, momentum, l.bias_updates_gpu, 1); if(l.scales_gpu){ - axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); - scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1); + axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); + scal_gpu(l.n, momentum, l.scale_updates_gpu, 1); } } } diff --git a/src/detection_layer.c b/src/detection_layer.c index 100694ba..5c8a1cea 100644 --- a/src/detection_layer.c +++ b/src/detection_layer.c @@ -256,7 +256,7 @@ void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box void forward_detection_layer_gpu(const detection_layer l, network net) { if(!net.train){ - copy_ongpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1); + copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1); return; } @@ -270,8 +270,8 @@ void forward_detection_layer_gpu(const detection_layer l, network net) void backward_detection_layer_gpu(detection_layer l, network net) { - axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); - //copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1); + axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); + //copy_gpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/gemm.c b/src/gemm.c index 3003be0e..53ab1404 100644 --- a/src/gemm.c +++ b/src/gemm.c @@ -165,7 +165,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, #include -void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, +void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, float *A_gpu, int lda, float *B_gpu, int ldb, float BETA, @@ -177,24 +177,6 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, check_error(status); } -void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, - float *A, int lda, - float *B, int ldb, - float BETA, - float *C, int ldc) -{ - float *A_gpu = cuda_make_array(A, (TA ? lda*K:lda*M)); - float *B_gpu = cuda_make_array(B, (TB ? ldb*N : ldb*K)); - float *C_gpu = cuda_make_array(C, ldc*M); - - gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc); - - cuda_pull_array(C_gpu, C, ldc*M); - cuda_free(A_gpu); - cuda_free(B_gpu); - cuda_free(C_gpu); -} - #include #include #include @@ -224,7 +206,7 @@ void time_gpu_random_matrix(int TA, int TB, int m, int k, int n) free(c); } -void time_ongpu(int TA, int TB, int m, int k, int n) +void time_gpu(int TA, int TB, int m, int k, int n) { int iter = 10; float *a = random_matrix(m,k); @@ -242,7 +224,7 @@ void time_ongpu(int TA, int TB, int m, int k, int n) int i; clock_t start = clock(), end; for(i = 0; i= 0; --i) { - if(i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1); + if(i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1); + else copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1); float *prev_delta_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch; - copy_ongpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1); + copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1); - copy_ongpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1); + copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1); - activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC); - activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC); - copy_ongpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1); + copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1); if(l.tanh){ - activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH); + activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH); } else { - activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC); } - weighted_delta_gpu(l.prev_state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, uh.delta_gpu, uz.delta_gpu, l.outputs*l.batch, l.delta_gpu); + weighted_delta_gpu(l.state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, uh.delta_gpu, uz.delta_gpu, l.outputs*l.batch, l.delta_gpu); if(l.tanh){ - gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH, uh.delta_gpu); + gradient_array_gpu(l.h_gpu, l.outputs*l.batch, TANH, uh.delta_gpu); } else { - gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, uh.delta_gpu); + gradient_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, uh.delta_gpu); } - copy_ongpu(l.outputs*l.batch, uh.delta_gpu, 1, wh.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, uh.delta_gpu, 1, wh.delta_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.forgot_state_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1); - fill_ongpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1); + mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1); + fill_gpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1); s.input_gpu = l.forgot_state_gpu; s.delta_gpu = l.forgot_delta_gpu; backward_connected_layer_gpu(wh, s); if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu); - mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.prev_state_gpu, ur.delta_gpu); + mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.state_gpu, ur.delta_gpu); - gradient_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, ur.delta_gpu); - copy_ongpu(l.outputs*l.batch, ur.delta_gpu, 1, wr.delta_gpu, 1); + gradient_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, ur.delta_gpu); + copy_gpu(l.outputs*l.batch, ur.delta_gpu, 1, wr.delta_gpu, 1); - gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, uz.delta_gpu); - copy_ongpu(l.outputs*l.batch, uz.delta_gpu, 1, wz.delta_gpu, 1); + gradient_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, uz.delta_gpu); + copy_gpu(l.outputs*l.batch, uz.delta_gpu, 1, wz.delta_gpu, 1); - s.input_gpu = l.prev_state_gpu; + s.input_gpu = l.state_gpu; s.delta_gpu = prev_delta_gpu; backward_connected_layer_gpu(wr, s); @@ -399,5 +401,6 @@ void backward_gru_layer_gpu(layer l, network net) increment_layer(&wr, -1); increment_layer(&wh, -1); } + copy_gpu(l.outputs*l.batch, end_state, 1, l.state_gpu, 1); } #endif diff --git a/src/im2col.h b/src/im2col.h index f0ddeeeb..02c4247f 100644 --- a/src/im2col.h +++ b/src/im2col.h @@ -7,7 +7,7 @@ void im2col_cpu(float* data_im, #ifdef GPU -void im2col_ongpu(float *im, +void im2col_gpu(float *im, int channels, int height, int width, int ksize, int stride, int pad,float *data_col); diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu index d42d600b..07b5e679 100644 --- a/src/im2col_kernels.cu +++ b/src/im2col_kernels.cu @@ -45,7 +45,7 @@ __global__ void im2col_gpu_kernel(const int n, const float* data_im, } } -void im2col_ongpu(float *im, +void im2col_gpu(float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col){ // We are going to launch channels * height_col * width_col kernels, each diff --git a/src/local_layer.c b/src/local_layer.c index 170ba9b6..74f6910a 100644 --- a/src/local_layer.c +++ b/src/local_layer.c @@ -191,12 +191,12 @@ void forward_local_layer_gpu(const local_layer l, network net) int locations = out_h * out_w; for(i = 0; i < l.batch; ++i){ - copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1); + copy_gpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1); } for(i = 0; i < l.batch; ++i){ float *input = net.input_gpu + i*l.w*l.h*l.c; - im2col_ongpu(input, l.c, l.h, l.w, + im2col_gpu(input, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace); float *output = l.output_gpu + i*l.outputs; for(j = 0; j < locations; ++j){ @@ -208,10 +208,10 @@ void forward_local_layer_gpu(const local_layer l, network net) int n = 1; int k = l.size*l.size*l.c; - gemm_ongpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations); + gemm_gpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations); } } - activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation); } void backward_local_layer_gpu(local_layer l, network net) @@ -219,14 +219,14 @@ void backward_local_layer_gpu(local_layer l, network net) int i, j; int locations = l.out_w*l.out_h; - gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); for(i = 0; i < l.batch; ++i){ - axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1); + axpy_gpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1); } for(i = 0; i < l.batch; ++i){ float *input = net.input_gpu + i*l.w*l.h*l.c; - im2col_ongpu(input, l.c, l.h, l.w, + im2col_gpu(input, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace); for(j = 0; j < locations; ++j){ @@ -237,7 +237,7 @@ void backward_local_layer_gpu(local_layer l, network net) int n = l.size*l.size*l.c; int k = 1; - gemm_ongpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n); + gemm_gpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n); } if(net.delta_gpu){ @@ -250,10 +250,10 @@ void backward_local_layer_gpu(local_layer l, network net) int n = 1; int k = l.n; - gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations); + gemm_gpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations); } - col2im_ongpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu+i*l.c*l.h*l.w); + col2im_gpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu+i*l.c*l.h*l.w); } } } @@ -267,12 +267,12 @@ void update_local_layer_gpu(local_layer l, update_args a) int locations = l.out_w*l.out_h; int size = l.size*l.size*l.c*l.n*locations; - axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); - scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1); + axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); + scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1); - axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_ongpu(size, momentum, l.weight_updates_gpu, 1); + axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_gpu(size, momentum, l.weight_updates_gpu, 1); } void pull_local_layer(local_layer l) diff --git a/src/lstm_layer.c b/src/lstm_layer.c index a0cd99b6..fb07de20 100644 --- a/src/lstm_layer.c +++ b/src/lstm_layer.c @@ -41,36 +41,36 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n *(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.uf->batch = batch; - l.wf = malloc(sizeof(layer)); - fprintf(stderr, "\t\t"); - *(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); - l.wf->batch = batch; - l.ui = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.ui) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.ui->batch = batch; - l.wi = malloc(sizeof(layer)); - fprintf(stderr, "\t\t"); - *(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); - l.wi->batch = batch; - l.ug = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.ug) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.ug->batch = batch; - l.wg = malloc(sizeof(layer)); - fprintf(stderr, "\t\t"); - *(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); - l.wg->batch = batch; - l.uo = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.uo) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.uo->batch = batch; + l.wf = malloc(sizeof(layer)); + fprintf(stderr, "\t\t"); + *(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); + l.wf->batch = batch; + + l.wi = malloc(sizeof(layer)); + fprintf(stderr, "\t\t"); + *(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); + l.wi->batch = batch; + + l.wg = malloc(sizeof(layer)); + fprintf(stderr, "\t\t"); + *(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); + l.wg->batch = batch; + l.wo = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); *(l.wo) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); @@ -410,17 +410,17 @@ void forward_lstm_layer_gpu(layer l, network state) layer ug = *(l.ug); layer uo = *(l.uo); - fill_ongpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1); - fill_ongpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1); - fill_ongpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1); - fill_ongpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1); - fill_ongpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1); - fill_ongpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1); - fill_ongpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1); - fill_ongpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1); if (state.train) { - fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1); } for (i = 0; i < l.steps; ++i) { @@ -436,34 +436,34 @@ void forward_lstm_layer_gpu(layer l, network state) forward_connected_layer_gpu(ug, s); forward_connected_layer_gpu(uo, s); - copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1); + copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1); - copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1); + copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1); - copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1); + copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1); - copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1); + copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1); - activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC); - activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC); - activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH); - activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH); + activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC); - copy_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1); + copy_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1); + mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1); + mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1); - activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH); - mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1); + copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1); + activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH); + mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1); + copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1); + copy_gpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1); state.input_gpu += l.inputs*l.batch; l.output_gpu += l.outputs*l.batch; @@ -514,97 +514,97 @@ void backward_lstm_layer_gpu(layer l, network state) l.delta_gpu += l.outputs*l.batch*(l.steps - 1); for (i = l.steps - 1; i >= 0; --i) { - if (i != 0) copy_ongpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1); - if (i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1); + if (i != 0) copy_gpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1); + copy_gpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1); + if (i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1); + copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1); l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch; - copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1); + copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1); - copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1); + copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1); - copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1); + copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1); - copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1); + copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1); + axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1); - activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC); - activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC); - activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH); - activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH); + activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC); - copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1); + copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1); - activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH); + copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1); + activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH); - copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1); + copy_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1); + mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1); - gradient_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu); - axpy_ongpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1); + gradient_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu); + axpy_gpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1); - activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH); - mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1); - gradient_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu); - copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1); + activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH); + mul_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1); + gradient_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu); + copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1); s.input_gpu = l.prev_state_gpu; s.delta_gpu = l.dh_gpu; backward_connected_layer_gpu(wo, s); - copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1); s.input_gpu = state.input_gpu; s.delta_gpu = state.delta_gpu; backward_connected_layer_gpu(uo, s); - copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1); - gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu); - copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); + mul_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1); + gradient_array_gpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu); + copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1); s.input_gpu = l.prev_state_gpu; s.delta_gpu = l.dh_gpu; backward_connected_layer_gpu(wg, s); - copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1); s.input_gpu = state.input_gpu; s.delta_gpu = state.delta_gpu; backward_connected_layer_gpu(ug, s); - copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1); - gradient_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu); - copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); + mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1); + gradient_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu); + copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1); s.input_gpu = l.prev_state_gpu; s.delta_gpu = l.dh_gpu; backward_connected_layer_gpu(wi, s); - copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1); s.input_gpu = state.input_gpu; s.delta_gpu = state.delta_gpu; backward_connected_layer_gpu(ui, s); - copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1); - gradient_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu); - copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); + mul_gpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1); + gradient_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu); + copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1); s.input_gpu = l.prev_state_gpu; s.delta_gpu = l.dh_gpu; backward_connected_layer_gpu(wf, s); - copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1); s.input_gpu = state.input_gpu; s.delta_gpu = state.delta_gpu; backward_connected_layer_gpu(uf, s); - copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1); + copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); + mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1); + copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1); state.input_gpu -= l.inputs*l.batch; if (state.delta_gpu) state.delta_gpu -= l.inputs*l.batch; diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 7c955309..5af37608 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -42,7 +42,7 @@ void forward_network_gpu(network net) net.index = i; layer l = net.layers[i]; if(l.delta_gpu){ - fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1); + fill_gpu(l.outputs * l.batch, 0, l.delta_gpu, 1); } l.forward_gpu(l, net); net.input_gpu = l.output_gpu; @@ -107,9 +107,9 @@ void harmless_update_network_gpu(network net) int i; for(i = 0; i < net.n; ++i){ layer l = net.layers[i]; - if(l.weight_updates_gpu) fill_ongpu(l.nweights, 0, l.weight_updates_gpu, 1); - if(l.bias_updates_gpu) fill_ongpu(l.nbiases, 0, l.bias_updates_gpu, 1); - if(l.scale_updates_gpu) fill_ongpu(l.nbiases, 0, l.scale_updates_gpu, 1); + if(l.weight_updates_gpu) fill_gpu(l.nweights, 0, l.weight_updates_gpu, 1); + if(l.bias_updates_gpu) fill_gpu(l.nbiases, 0, l.bias_updates_gpu, 1); + if(l.scale_updates_gpu) fill_gpu(l.nbiases, 0, l.scale_updates_gpu, 1); } } @@ -383,6 +383,7 @@ float train_networks(network *nets, int n, data d, int interval) float sum = 0; for(i = 0; i < n; ++i){ + nets[i].learning_rate *= n; data p = get_data_part(d, i, n); threads[i] = train_network_in_thread(nets[i], p, errors + i); } diff --git a/src/normalization_layer.c b/src/normalization_layer.c index 9f959a4f..424714fe 100644 --- a/src/normalization_layer.c +++ b/src/normalization_layer.c @@ -113,29 +113,29 @@ void forward_normalization_layer_gpu(const layer layer, network net) int w = layer.w; int h = layer.h; int c = layer.c; - scal_ongpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1); + scal_gpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1); for(b = 0; b < layer.batch; ++b){ float *squared = layer.squared_gpu + w*h*c*b; float *norms = layer.norms_gpu + w*h*c*b; float *input = net.input_gpu + w*h*c*b; - pow_ongpu(w*h*c, 2, input, 1, squared, 1); + pow_gpu(w*h*c, 2, input, 1, squared, 1); - const_ongpu(w*h, layer.kappa, norms, 1); + const_gpu(w*h, layer.kappa, norms, 1); for(k = 0; k < layer.size/2; ++k){ - axpy_ongpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1); + axpy_gpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1); } for(k = 1; k < layer.c; ++k){ - copy_ongpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1); + copy_gpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1); int prev = k - ((layer.size-1)/2) - 1; int next = k + (layer.size/2); - if(prev >= 0) axpy_ongpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1); - if(next < layer.c) axpy_ongpu(w*h, layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1); + if(prev >= 0) axpy_gpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1); + if(next < layer.c) axpy_gpu(w*h, layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1); } } - pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1); - mul_ongpu(w*h*c*layer.batch, net.input_gpu, 1, layer.output_gpu, 1); + pow_gpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1); + mul_gpu(w*h*c*layer.batch, net.input_gpu, 1, layer.output_gpu, 1); } void backward_normalization_layer_gpu(const layer layer, network net) @@ -145,7 +145,7 @@ void backward_normalization_layer_gpu(const layer layer, network net) int w = layer.w; int h = layer.h; int c = layer.c; - pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, net.delta_gpu, 1); - mul_ongpu(w*h*c*layer.batch, layer.delta_gpu, 1, net.delta_gpu, 1); + pow_gpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, net.delta_gpu, 1); + mul_gpu(w*h*c*layer.batch, layer.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/parser.c b/src/parser.c index 70390416..da025367 100644 --- a/src/parser.c +++ b/src/parser.c @@ -213,13 +213,11 @@ layer parse_crnn(list *options, size_params params) layer parse_rnn(list *options, size_params params) { int output = option_find_int(options, "output",1); - int hidden = option_find_int(options, "hidden",1); char *activation_s = option_find_str(options, "activation", "logistic"); ACTIVATION activation = get_activation(activation_s); int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); - int logistic = option_find_int_quiet(options, "logistic", 0); - layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize, logistic, params.net.adam); + layer l = make_rnn_layer(params.batch, params.inputs, output, params.time_steps, activation, batch_normalize, params.net.adam); l.shortcut = option_find_int_quiet(options, "shortcut", 0); @@ -353,6 +351,7 @@ cost_layer parse_cost(list *options, size_params params) float scale = option_find_float_quiet(options, "scale",1); cost_layer layer = make_cost_layer(params.batch, params.inputs, type, scale); layer.ratio = option_find_float_quiet(options, "ratio",0); + layer.noobject_scale = option_find_float_quiet(options, "noobj", 1); layer.thresh = option_find_float_quiet(options, "thresh",0); return layer; } @@ -921,12 +920,18 @@ void save_weights_upto(network net, char *filename, int cutoff) save_connected_weights(*(l.uo), fp); save_connected_weights(*(l.ug), fp); } if (l.type == GRU) { - save_connected_weights(*(l.wz), fp); - save_connected_weights(*(l.wr), fp); - save_connected_weights(*(l.wh), fp); - save_connected_weights(*(l.uz), fp); - save_connected_weights(*(l.ur), fp); - save_connected_weights(*(l.uh), fp); + if(1){ + save_connected_weights(*(l.wz), fp); + save_connected_weights(*(l.wr), fp); + save_connected_weights(*(l.wh), fp); + save_connected_weights(*(l.uz), fp); + save_connected_weights(*(l.ur), fp); + save_connected_weights(*(l.uh), fp); + }else{ + save_connected_weights(*(l.reset_layer), fp); + save_connected_weights(*(l.update_layer), fp); + save_connected_weights(*(l.state_layer), fp); + } } if(l.type == CRNN){ save_convolutional_weights(*(l.input_layer), fp); save_convolutional_weights(*(l.self_layer), fp); @@ -1132,12 +1137,18 @@ void load_weights_upto(network *net, char *filename, int start, int cutoff) load_connected_weights(*(l.ug), fp, transpose); } if (l.type == GRU) { - load_connected_weights(*(l.wz), fp, transpose); - load_connected_weights(*(l.wr), fp, transpose); - load_connected_weights(*(l.wh), fp, transpose); - load_connected_weights(*(l.uz), fp, transpose); - load_connected_weights(*(l.ur), fp, transpose); - load_connected_weights(*(l.uh), fp, transpose); + if(1){ + load_connected_weights(*(l.wz), fp, transpose); + load_connected_weights(*(l.wr), fp, transpose); + load_connected_weights(*(l.wh), fp, transpose); + load_connected_weights(*(l.uz), fp, transpose); + load_connected_weights(*(l.ur), fp, transpose); + load_connected_weights(*(l.uh), fp, transpose); + }else{ + load_connected_weights(*(l.reset_layer), fp, transpose); + load_connected_weights(*(l.update_layer), fp, transpose); + load_connected_weights(*(l.state_layer), fp, transpose); + } } if(l.type == LOCAL){ int locations = l.out_w*l.out_h; diff --git a/src/region_layer.c b/src/region_layer.c index d2f73029..c090075e 100644 --- a/src/region_layer.c +++ b/src/region_layer.c @@ -434,14 +434,14 @@ void get_region_boxes(layer l, int w, int h, int netw, int neth, float thresh, f void forward_region_layer_gpu(const layer l, network net) { - copy_ongpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1); + copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1); int b, n; for (b = 0; b < l.batch; ++b){ for(n = 0; n < l.n; ++n){ int index = entry_index(l, b, n*l.w*l.h, 0); - activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); + activate_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); index = entry_index(l, b, n*l.w*l.h, l.coords); - if(!l.background) activate_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC); + if(!l.background) activate_array_gpu(l.output_gpu + index, l.w*l.h, LOGISTIC); } } if (l.softmax_tree){ @@ -545,12 +545,12 @@ void backward_region_layer_gpu(const layer l, network net) for (b = 0; b < l.batch; ++b){ for(n = 0; n < l.n; ++n){ int index = entry_index(l, b, n*l.w*l.h, 0); - gradient_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index); + gradient_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index); index = entry_index(l, b, n*l.w*l.h, l.coords); - if(!l.background) gradient_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index); + if(!l.background) gradient_array_gpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index); } } - axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); + axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); } #endif diff --git a/src/reorg_layer.c b/src/reorg_layer.c index d7fc0010..31d6b843 100644 --- a/src/reorg_layer.c +++ b/src/reorg_layer.c @@ -136,18 +136,18 @@ void forward_reorg_layer_gpu(layer l, network net) int i; if(l.flatten){ if(l.reverse){ - flatten_ongpu(net.input_gpu, l.w*l.h, l.c, l.batch, 0, l.output_gpu); + flatten_gpu(net.input_gpu, l.w*l.h, l.c, l.batch, 0, l.output_gpu); }else{ - flatten_ongpu(net.input_gpu, l.w*l.h, l.c, l.batch, 1, l.output_gpu); + flatten_gpu(net.input_gpu, l.w*l.h, l.c, l.batch, 1, l.output_gpu); } } else if (l.extra) { for(i = 0; i < l.batch; ++i){ - copy_ongpu(l.inputs, net.input_gpu + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1); + copy_gpu(l.inputs, net.input_gpu + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1); } } else if (l.reverse) { - reorg_ongpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu); + reorg_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu); }else { - reorg_ongpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu); + reorg_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu); } } @@ -155,19 +155,19 @@ void backward_reorg_layer_gpu(layer l, network net) { if(l.flatten){ if(l.reverse){ - flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, net.delta_gpu); + flatten_gpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, net.delta_gpu); }else{ - flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, net.delta_gpu); + flatten_gpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, net.delta_gpu); } } else if (l.extra) { int i; for(i = 0; i < l.batch; ++i){ - copy_ongpu(l.inputs, l.delta_gpu + i*l.outputs, 1, net.delta_gpu + i*l.inputs, 1); + copy_gpu(l.inputs, l.delta_gpu + i*l.outputs, 1, net.delta_gpu + i*l.inputs, 1); } } else if(l.reverse){ - reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta_gpu); + reorg_gpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta_gpu); } else { - reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta_gpu); + reorg_gpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta_gpu); } } #endif diff --git a/src/rnn_layer.c b/src/rnn_layer.c index 6ff9a0f8..8c9b457e 100644 --- a/src/rnn_layer.c +++ b/src/rnn_layer.c @@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps) #endif } -layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log, int adam) +layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam) { fprintf(stderr, "RNN Layer: %d inputs, %d outputs\n", inputs, outputs); batch = batch / steps; @@ -34,24 +34,24 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, l.batch = batch; l.type = RNN; l.steps = steps; - l.hidden = hidden; l.inputs = inputs; - l.state = calloc(batch*hidden*(steps+1), sizeof(float)); + l.state = calloc(batch*outputs, sizeof(float)); + l.prev_state = calloc(batch*outputs, sizeof(float)); l.input_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize, adam); + *(l.input_layer) = make_connected_layer(batch*steps, inputs, outputs, activation, batch_normalize, adam); l.input_layer->batch = batch; l.self_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize, adam); + *(l.self_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam); l.self_layer->batch = batch; l.output_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize, adam); + *(l.output_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam); l.output_layer->batch = batch; l.outputs = outputs; @@ -65,9 +65,15 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, l.forward_gpu = forward_rnn_layer_gpu; l.backward_gpu = backward_rnn_layer_gpu; l.update_gpu = update_rnn_layer_gpu; - l.state_gpu = cuda_make_array(l.state, batch*hidden*(steps+1)); + l.state_gpu = cuda_make_array(0, batch*outputs); + l.prev_state_gpu = cuda_make_array(0, batch*outputs); l.output_gpu = l.output_layer->output_gpu; l.delta_gpu = l.output_layer->delta_gpu; +#ifdef CUDNN + cudnnSetTensor4dDescriptor(l.input_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.input_layer->out_c, l.input_layer->out_h, l.input_layer->out_w); + cudnnSetTensor4dDescriptor(l.self_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.self_layer->out_c, l.self_layer->out_h, l.self_layer->out_w); + cudnnSetTensor4dDescriptor(l.output_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.output_layer->out_c, l.output_layer->out_h, l.output_layer->out_w); +#endif #endif return l; @@ -90,9 +96,9 @@ void forward_rnn_layer(layer l, network net) layer output_layer = *(l.output_layer); fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1); - fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1); - fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1); - if(net.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1); + fill_cpu(l.outputs * l.batch * l.steps, 0, self_layer.delta, 1); + fill_cpu(l.outputs * l.batch * l.steps, 0, input_layer.delta, 1); + if(net.train) fill_cpu(l.outputs * l.batch, 0, l.state, 1); for (i = 0; i < l.steps; ++i) { s.input = net.input; @@ -102,14 +108,14 @@ void forward_rnn_layer(layer l, network net) forward_connected_layer(self_layer, s); float *old_state = l.state; - if(net.train) l.state += l.hidden*l.batch; + if(net.train) l.state += l.outputs*l.batch; if(l.shortcut){ - copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1); + copy_cpu(l.outputs * l.batch, old_state, 1, l.state, 1); }else{ - fill_cpu(l.hidden * l.batch, 0, l.state, 1); + fill_cpu(l.outputs * l.batch, 0, l.state, 1); } - axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1); - axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1); + axpy_cpu(l.outputs * l.batch, 1, input_layer.output, 1, l.state, 1); + axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1); s.input = l.state; forward_connected_layer(output_layer, s); @@ -134,32 +140,32 @@ void backward_rnn_layer(layer l, network net) increment_layer(&self_layer, l.steps-1); increment_layer(&output_layer, l.steps-1); - l.state += l.hidden*l.batch*l.steps; + l.state += l.outputs*l.batch*l.steps; for (i = l.steps-1; i >= 0; --i) { - copy_cpu(l.hidden * l.batch, input_layer.output, 1, l.state, 1); - axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1); + copy_cpu(l.outputs * l.batch, input_layer.output, 1, l.state, 1); + axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1); s.input = l.state; s.delta = self_layer.delta; backward_connected_layer(output_layer, s); - l.state -= l.hidden*l.batch; + l.state -= l.outputs*l.batch; /* if(i > 0){ - copy_cpu(l.hidden * l.batch, input_layer.output - l.hidden*l.batch, 1, l.state, 1); - axpy_cpu(l.hidden * l.batch, 1, self_layer.output - l.hidden*l.batch, 1, l.state, 1); + copy_cpu(l.outputs * l.batch, input_layer.output - l.outputs*l.batch, 1, l.state, 1); + axpy_cpu(l.outputs * l.batch, 1, self_layer.output - l.outputs*l.batch, 1, l.state, 1); }else{ - fill_cpu(l.hidden * l.batch, 0, l.state, 1); + fill_cpu(l.outputs * l.batch, 0, l.state, 1); } */ s.input = l.state; - s.delta = self_layer.delta - l.hidden*l.batch; + s.delta = self_layer.delta - l.outputs*l.batch; if (i == 0) s.delta = 0; backward_connected_layer(self_layer, s); - copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1); - if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1); + copy_cpu(l.outputs*l.batch, self_layer.delta, 1, input_layer.delta, 1); + if (i > 0 && l.shortcut) axpy_cpu(l.outputs*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.outputs*l.batch, 1); s.input = net.input + i*l.inputs*l.batch; if(net.delta) s.delta = net.delta + i*l.inputs*l.batch; else s.delta = 0; @@ -196,17 +202,21 @@ void update_rnn_layer_gpu(layer l, update_args a) void forward_rnn_layer_gpu(layer l, network net) { - network s = net; + network s = {0}; s.train = net.train; int i; layer input_layer = *(l.input_layer); layer self_layer = *(l.self_layer); layer output_layer = *(l.output_layer); - fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1); - fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1); - fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1); - if(net.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, self_layer.delta_gpu, 1); + fill_gpu(l.outputs * l.batch * l.steps, 0, input_layer.delta_gpu, 1); + + if(net.train) { + fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1); + } for (i = 0; i < l.steps; ++i) { s.input_gpu = net.input_gpu; @@ -215,15 +225,9 @@ void forward_rnn_layer_gpu(layer l, network net) s.input_gpu = l.state_gpu; forward_connected_layer_gpu(self_layer, s); - float *old_state = l.state_gpu; - if(net.train) l.state_gpu += l.hidden*l.batch; - if(l.shortcut){ - copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1); - }else{ - fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1); - } - axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1); - axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); + fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1); + axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1); + axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); s.input_gpu = l.state_gpu; forward_connected_layer_gpu(output_layer, s); @@ -237,7 +241,7 @@ void forward_rnn_layer_gpu(layer l, network net) void backward_rnn_layer_gpu(layer l, network net) { - network s = net; + network s = {0}; s.train = net.train; int i; layer input_layer = *(l.input_layer); @@ -246,24 +250,32 @@ void backward_rnn_layer_gpu(layer l, network net) increment_layer(&input_layer, l.steps - 1); increment_layer(&self_layer, l.steps - 1); increment_layer(&output_layer, l.steps - 1); - l.state_gpu += l.hidden*l.batch*l.steps; + float *last_input = input_layer.output_gpu; + float *last_self = self_layer.output_gpu; for (i = l.steps-1; i >= 0; --i) { + fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1); + axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1); + axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1); s.input_gpu = l.state_gpu; s.delta_gpu = self_layer.delta_gpu; backward_connected_layer_gpu(output_layer, s); - l.state_gpu -= l.hidden*l.batch; + if(i != 0) { + fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1); + axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1); + axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1); + }else { + copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1); + } - copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1); + copy_gpu(l.outputs*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1); s.input_gpu = l.state_gpu; - s.delta_gpu = self_layer.delta_gpu - l.hidden*l.batch; + s.delta_gpu = (i > 0) ? self_layer.delta_gpu - l.outputs*l.batch : 0; if (i == 0) s.delta_gpu = 0; backward_connected_layer_gpu(self_layer, s); - //copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1); - if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1); s.input_gpu = net.input_gpu + i*l.inputs*l.batch; if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch; else s.delta_gpu = 0; @@ -273,5 +285,8 @@ void backward_rnn_layer_gpu(layer l, network net) increment_layer(&self_layer, -1); increment_layer(&output_layer, -1); } + fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1); + axpy_gpu(l.outputs * l.batch, 1, last_input, 1, l.state_gpu, 1); + axpy_gpu(l.outputs * l.batch, 1, last_self, 1, l.state_gpu, 1); } #endif diff --git a/src/rnn_layer.h b/src/rnn_layer.h index 37e1f1ca..270a63ff 100644 --- a/src/rnn_layer.h +++ b/src/rnn_layer.h @@ -7,7 +7,7 @@ #include "network.h" #define USET -layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log, int adam); +layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam); void forward_rnn_layer(layer l, network net); void backward_rnn_layer(layer l, network net); diff --git a/src/route_layer.c b/src/route_layer.c index bce66fa4..a8970a46 100644 --- a/src/route_layer.c +++ b/src/route_layer.c @@ -111,7 +111,7 @@ void forward_route_layer_gpu(const route_layer l, network net) float *input = net.layers[index].output_gpu; int input_size = l.input_sizes[i]; for(j = 0; j < l.batch; ++j){ - copy_ongpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1); + copy_gpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1); } offset += input_size; } @@ -126,7 +126,7 @@ void backward_route_layer_gpu(const route_layer l, network net) float *delta = net.layers[index].delta_gpu; int input_size = l.input_sizes[i]; for(j = 0; j < l.batch; ++j){ - axpy_ongpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1); + axpy_gpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1); } offset += input_size; } diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c index b39ba8e8..0818ca7e 100644 --- a/src/shortcut_layer.c +++ b/src/shortcut_layer.c @@ -55,15 +55,15 @@ void backward_shortcut_layer(const layer l, network net) #ifdef GPU void forward_shortcut_layer_gpu(const layer l, network net) { - copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); + copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); shortcut_gpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu); - activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation); } void backward_shortcut_layer_gpu(const layer l, network net) { - gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); - axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, net.delta_gpu, 1); + gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); + axpy_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1, net.delta_gpu, 1); shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, net.layers[l.index].delta_gpu); } #endif diff --git a/src/softmax_layer.c b/src/softmax_layer.c index 7f204802..372b037c 100644 --- a/src/softmax_layer.c +++ b/src/softmax_layer.c @@ -81,7 +81,7 @@ void forward_softmax_layer_gpu(const softmax_layer l, network net) void backward_softmax_layer_gpu(const softmax_layer layer, network net) { - axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta_gpu, 1); + axpy_gpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta_gpu, 1); } #endif