mirror of
https://github.com/pjreddie/darknet.git
synced 2023-08-10 21:13:14 +03:00
🔥 🐛 🔥
This commit is contained in:
parent
1467621453
commit
8215a8864d
6
Makefile
6
Makefile
@ -1,6 +1,6 @@
|
||||
GPU=1
|
||||
CUDNN=1
|
||||
OPENCV=1
|
||||
GPU=0
|
||||
CUDNN=0
|
||||
OPENCV=0
|
||||
DEBUG=0
|
||||
|
||||
ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \
|
||||
|
31
cfg/gru.cfg
31
cfg/gru.cfg
@ -1,23 +1,24 @@
|
||||
[net]
|
||||
inputs=256
|
||||
|
||||
# Test
|
||||
batch = 1
|
||||
time_steps=1
|
||||
|
||||
# Train
|
||||
# batch = 512
|
||||
# time_steps=64
|
||||
|
||||
subdivisions=1
|
||||
batch = 256
|
||||
inputs=256
|
||||
momentum=0.9
|
||||
decay=0.001
|
||||
learning_rate=0.1
|
||||
decay=0.0
|
||||
time_steps=128
|
||||
learning_rate=.002
|
||||
adam=1
|
||||
|
||||
burn_in=100
|
||||
policy=poly
|
||||
policy=constant
|
||||
power=4
|
||||
max_batches=10000
|
||||
max_batches=400000
|
||||
|
||||
[gru]
|
||||
batch_normalize=1
|
||||
output = 1024
|
||||
|
||||
[gru]
|
||||
batch_normalize=1
|
||||
output = 1024
|
||||
|
||||
[gru]
|
||||
batch_normalize=1
|
||||
|
@ -112,6 +112,26 @@ void operations(char *cfgfile)
|
||||
ops += 2l * l.n * l.size*l.size*l.c * l.out_h*l.out_w;
|
||||
} else if(l.type == CONNECTED){
|
||||
ops += 2l * l.inputs * l.outputs;
|
||||
} else if (l.type == RNN){
|
||||
ops += 2l * l.input_layer->inputs * l.input_layer->outputs;
|
||||
ops += 2l * l.self_layer->inputs * l.self_layer->outputs;
|
||||
ops += 2l * l.output_layer->inputs * l.output_layer->outputs;
|
||||
} else if (l.type == GRU){
|
||||
ops += 2l * l.uz->inputs * l.uz->outputs;
|
||||
ops += 2l * l.uh->inputs * l.uh->outputs;
|
||||
ops += 2l * l.ur->inputs * l.ur->outputs;
|
||||
ops += 2l * l.wz->inputs * l.wz->outputs;
|
||||
ops += 2l * l.wh->inputs * l.wh->outputs;
|
||||
ops += 2l * l.wr->inputs * l.wr->outputs;
|
||||
} else if (l.type == LSTM){
|
||||
ops += 2l * l.uf->inputs * l.uf->outputs;
|
||||
ops += 2l * l.ui->inputs * l.ui->outputs;
|
||||
ops += 2l * l.ug->inputs * l.ug->outputs;
|
||||
ops += 2l * l.uo->inputs * l.uo->outputs;
|
||||
ops += 2l * l.wf->inputs * l.wf->outputs;
|
||||
ops += 2l * l.wi->inputs * l.wi->outputs;
|
||||
ops += 2l * l.wg->inputs * l.wg->outputs;
|
||||
ops += 2l * l.wo->inputs * l.wo->outputs;
|
||||
}
|
||||
}
|
||||
printf("Floating Point Operations: %ld\n", ops);
|
||||
|
@ -64,7 +64,7 @@ void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg
|
||||
|
||||
int ax_size = anet.inputs*anet.batch;
|
||||
int ay_size = anet.truths*anet.batch;
|
||||
fill_ongpu(ay_size, .9, anet.truth_gpu, 1);
|
||||
fill_gpu(ay_size, .9, anet.truth_gpu, 1);
|
||||
anet.delta_gpu = cuda_make_array(0, ax_size);
|
||||
anet.train = 1;
|
||||
|
||||
@ -102,36 +102,36 @@ void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg
|
||||
|
||||
forward_network_gpu(fnet, fstate);
|
||||
float *feats = fnet.layers[fnet.n - 2].output_gpu;
|
||||
copy_ongpu(y_size, feats, 1, fstate.truth, 1);
|
||||
copy_gpu(y_size, feats, 1, fstate.truth, 1);
|
||||
|
||||
forward_network_gpu(gnet, gstate);
|
||||
float *gen = gnet.layers[gnet.n-1].output_gpu;
|
||||
copy_ongpu(x_size, gen, 1, fstate.input, 1);
|
||||
copy_gpu(x_size, gen, 1, fstate.input, 1);
|
||||
|
||||
fill_ongpu(x_size, 0, fstate.delta, 1);
|
||||
fill_gpu(x_size, 0, fstate.delta, 1);
|
||||
forward_network_gpu(fnet, fstate);
|
||||
backward_network_gpu(fnet, fstate);
|
||||
//HERE
|
||||
|
||||
astate.input = gen;
|
||||
fill_ongpu(ax_size, 0, astate.delta, 1);
|
||||
fill_gpu(ax_size, 0, astate.delta, 1);
|
||||
forward_network_gpu(anet, astate);
|
||||
backward_network_gpu(anet, astate);
|
||||
|
||||
float *delta = imlayer.delta_gpu;
|
||||
fill_ongpu(x_size, 0, delta, 1);
|
||||
scal_ongpu(x_size, 100, astate.delta, 1);
|
||||
scal_ongpu(x_size, .001, fstate.delta, 1);
|
||||
axpy_ongpu(x_size, 1, fstate.delta, 1, delta, 1);
|
||||
axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1);
|
||||
fill_gpu(x_size, 0, delta, 1);
|
||||
scal_gpu(x_size, 100, astate.delta, 1);
|
||||
scal_gpu(x_size, .001, fstate.delta, 1);
|
||||
axpy_gpu(x_size, 1, fstate.delta, 1, delta, 1);
|
||||
axpy_gpu(x_size, 1, astate.delta, 1, delta, 1);
|
||||
|
||||
//fill_ongpu(x_size, 0, delta, 1);
|
||||
//fill_gpu(x_size, 0, delta, 1);
|
||||
//cuda_push_array(delta, X, x_size);
|
||||
//axpy_ongpu(x_size, -1, imlayer.output_gpu, 1, delta, 1);
|
||||
//axpy_gpu(x_size, -1, imlayer.output_gpu, 1, delta, 1);
|
||||
//printf("pix error: %f\n", cuda_mag_array(delta, x_size));
|
||||
printf("fea error: %f\n", cuda_mag_array(fstate.delta, x_size));
|
||||
printf("adv error: %f\n", cuda_mag_array(astate.delta, x_size));
|
||||
//axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1);
|
||||
//axpy_gpu(x_size, 1, astate.delta, 1, delta, 1);
|
||||
|
||||
backward_network_gpu(gnet, gstate);
|
||||
|
||||
@ -273,7 +273,7 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear
|
||||
|
||||
float *imerror = cuda_make_array(0, imlayer.outputs);
|
||||
float *ones_gpu = cuda_make_array(0, ay_size);
|
||||
fill_ongpu(ay_size, .9, ones_gpu, 1);
|
||||
fill_gpu(ay_size, .9, ones_gpu, 1);
|
||||
|
||||
float aloss_avg = -1;
|
||||
float gloss_avg = -1;
|
||||
@ -318,23 +318,23 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear
|
||||
*net.seen += net.batch;
|
||||
forward_network_gpu(net, gstate);
|
||||
|
||||
fill_ongpu(imlayer.outputs, 0, imerror, 1);
|
||||
fill_gpu(imlayer.outputs, 0, imerror, 1);
|
||||
astate.input = imlayer.output_gpu;
|
||||
astate.delta = imerror;
|
||||
astate.truth = ones_gpu;
|
||||
forward_network_gpu(anet, astate);
|
||||
backward_network_gpu(anet, astate);
|
||||
|
||||
scal_ongpu(imlayer.outputs, .1, net.layers[net.n-1].delta_gpu, 1);
|
||||
scal_gpu(imlayer.outputs, .1, net.layers[net.n-1].delta_gpu, 1);
|
||||
|
||||
backward_network_gpu(net, gstate);
|
||||
|
||||
scal_ongpu(imlayer.outputs, 1000, imerror, 1);
|
||||
scal_gpu(imlayer.outputs, 1000, imerror, 1);
|
||||
|
||||
printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));
|
||||
printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs));
|
||||
|
||||
axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
|
||||
axpy_gpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
|
||||
|
||||
gloss += get_network_cost(net) /(net.subdivisions*net.batch);
|
||||
|
||||
@ -533,9 +533,9 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
|
||||
*gnet.seen += gnet.batch;
|
||||
forward_network_gpu(gnet);
|
||||
|
||||
fill_ongpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
|
||||
fill_ongpu(anet.truths*anet.batch, .95, anet.truth_gpu, 1);
|
||||
copy_ongpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1);
|
||||
fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
|
||||
fill_gpu(anet.truths*anet.batch, .95, anet.truth_gpu, 1);
|
||||
copy_gpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1);
|
||||
anet.delta_gpu = imerror;
|
||||
forward_network_gpu(anet);
|
||||
backward_network_gpu(anet);
|
||||
@ -543,13 +543,13 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
|
||||
float genaloss = *anet.cost / anet.batch;
|
||||
printf("%f\n", genaloss);
|
||||
|
||||
scal_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
|
||||
scal_ongpu(imlayer.outputs*imlayer.batch, .00, gnet.layers[gnet.n-1].delta_gpu, 1);
|
||||
scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
|
||||
scal_gpu(imlayer.outputs*imlayer.batch, .00, gnet.layers[gnet.n-1].delta_gpu, 1);
|
||||
|
||||
printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
|
||||
printf("features %f\n", cuda_mag_array(gnet.layers[gnet.n-1].delta_gpu, imlayer.outputs*imlayer.batch));
|
||||
|
||||
axpy_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet.layers[gnet.n-1].delta_gpu, 1);
|
||||
axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet.layers[gnet.n-1].delta_gpu, 1);
|
||||
|
||||
backward_network_gpu(gnet);
|
||||
|
||||
@ -716,21 +716,21 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle
|
||||
*net.seen += net.batch;
|
||||
forward_network_gpu(net);
|
||||
|
||||
fill_ongpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
|
||||
copy_ongpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1);
|
||||
fill_ongpu(anet.inputs*anet.batch, .95, anet.truth_gpu, 1);
|
||||
fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
|
||||
copy_gpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1);
|
||||
fill_gpu(anet.inputs*anet.batch, .95, anet.truth_gpu, 1);
|
||||
anet.delta_gpu = imerror;
|
||||
forward_network_gpu(anet);
|
||||
backward_network_gpu(anet);
|
||||
|
||||
scal_ongpu(imlayer.outputs*imlayer.batch, 1./100., net.layers[net.n-1].delta_gpu, 1);
|
||||
scal_gpu(imlayer.outputs*imlayer.batch, 1./100., net.layers[net.n-1].delta_gpu, 1);
|
||||
|
||||
scal_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
|
||||
scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
|
||||
|
||||
printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
|
||||
printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs*imlayer.batch));
|
||||
|
||||
axpy_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net.layers[net.n-1].delta_gpu, 1);
|
||||
axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net.layers[net.n-1].delta_gpu, 1);
|
||||
|
||||
backward_network_gpu(net);
|
||||
|
||||
@ -876,7 +876,7 @@ void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfi
|
||||
|
||||
float *imerror = cuda_make_array(0, imlayer.outputs);
|
||||
float *ones_gpu = cuda_make_array(0, ay_size);
|
||||
fill_ongpu(ay_size, 1, ones_gpu, 1);
|
||||
fill_gpu(ay_size, 1, ones_gpu, 1);
|
||||
|
||||
float aloss_avg = -1;
|
||||
float gloss_avg = -1;
|
||||
@ -902,15 +902,15 @@ void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfi
|
||||
*net.seen += net.batch;
|
||||
forward_network_gpu(net, gstate);
|
||||
|
||||
fill_ongpu(imlayer.outputs, 0, imerror, 1);
|
||||
fill_gpu(imlayer.outputs, 0, imerror, 1);
|
||||
astate.input = imlayer.output_gpu;
|
||||
astate.delta = imerror;
|
||||
astate.truth = ones_gpu;
|
||||
forward_network_gpu(anet, astate);
|
||||
backward_network_gpu(anet, astate);
|
||||
|
||||
scal_ongpu(imlayer.outputs, 1, imerror, 1);
|
||||
axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
|
||||
scal_gpu(imlayer.outputs, 1, imerror, 1);
|
||||
axpy_gpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
|
||||
|
||||
backward_network_gpu(net, gstate);
|
||||
|
||||
|
@ -50,7 +50,7 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa
|
||||
cuda_push_array(net->input_gpu, im.data, net->inputs);
|
||||
|
||||
forward_network_gpu(*net);
|
||||
copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);
|
||||
copy_gpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);
|
||||
|
||||
cuda_pull_array(last.delta_gpu, last.delta, last.outputs);
|
||||
calculate_loss(last.delta, last.delta, last.outputs, thresh);
|
||||
@ -141,7 +141,7 @@ void reconstruct_picture(network net, float *features, image recon, image update
|
||||
|
||||
forward_network_gpu(net);
|
||||
cuda_push_array(l.delta_gpu, features, l.outputs);
|
||||
axpy_ongpu(l.outputs, -1, l.output_gpu, 1, l.delta_gpu, 1);
|
||||
axpy_gpu(l.outputs, -1, l.output_gpu, 1, l.delta_gpu, 1);
|
||||
backward_network_gpu(net);
|
||||
|
||||
cuda_pull_array(net.delta_gpu, delta.data, delta.w*delta.h*delta.c);
|
||||
|
@ -114,7 +114,10 @@ void reset_rnn_state(network net, int b)
|
||||
#ifdef GPU
|
||||
layer l = net.layers[i];
|
||||
if(l.state_gpu){
|
||||
fill_ongpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
|
||||
fill_gpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
|
||||
}
|
||||
if(l.h_gpu){
|
||||
fill_gpu(l.outputs, 0, l.h_gpu + l.outputs*b, 1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -27,6 +27,11 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
||||
}
|
||||
srand(time(0));
|
||||
network net = nets[0];
|
||||
image pred = get_network_image(net);
|
||||
|
||||
int div = net.w/pred.w;
|
||||
assert(pred.w * div == net.w);
|
||||
assert(pred.h * div == net.h);
|
||||
|
||||
int imgs = net.batch * net.subdivisions * ngpus;
|
||||
|
||||
@ -46,6 +51,7 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
||||
args.w = net.w;
|
||||
args.h = net.h;
|
||||
args.threads = 32;
|
||||
args.scale = div;
|
||||
|
||||
args.min = net.min_crop;
|
||||
args.max = net.max_crop;
|
||||
@ -75,15 +81,6 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
load_thread = load_data(args);
|
||||
image tr = float_to_image(net.w, net.h, 81, train.y.vals[0]);
|
||||
image im = float_to_image(net.w, net.h, net.c, train.X.vals[0]);
|
||||
image mask = mask_to_rgb(tr);
|
||||
show_image(im, "input");
|
||||
show_image(mask, "truth");
|
||||
#ifdef OPENCV
|
||||
cvWaitKey(100);
|
||||
#endif
|
||||
free_image(mask);
|
||||
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
time=clock();
|
||||
@ -98,6 +95,20 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
|
||||
#else
|
||||
loss = train_network(net, train);
|
||||
#endif
|
||||
if(1){
|
||||
image tr = float_to_image(net.w/div, net.h/div, 80, train.y.vals[net.batch]);
|
||||
image im = float_to_image(net.w, net.h, net.c, train.X.vals[net.batch]);
|
||||
image mask = mask_to_rgb(tr);
|
||||
image prmask = mask_to_rgb(pred);
|
||||
show_image(im, "input");
|
||||
show_image(prmask, "pred");
|
||||
show_image(mask, "truth");
|
||||
#ifdef OPENCV
|
||||
cvWaitKey(100);
|
||||
#endif
|
||||
free_image(mask);
|
||||
free_image(prmask);
|
||||
}
|
||||
if(avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss*.9 + loss*.1;
|
||||
printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
|
||||
|
@ -84,7 +84,7 @@ typedef enum {
|
||||
} LAYER_TYPE;
|
||||
|
||||
typedef enum{
|
||||
SSE, MASKED, L1, SMOOTH
|
||||
SSE, MASKED, L1, SEG, SMOOTH
|
||||
} COST_TYPE;
|
||||
|
||||
typedef struct{
|
||||
@ -203,6 +203,8 @@ struct layer{
|
||||
float * forgot_state;
|
||||
float * forgot_delta;
|
||||
float * state_delta;
|
||||
float * combine_cpu;
|
||||
float * combine_delta_cpu;
|
||||
|
||||
float * concat;
|
||||
float * concat_delta;
|
||||
@ -271,6 +273,10 @@ struct layer{
|
||||
struct layer *self_layer;
|
||||
struct layer *output_layer;
|
||||
|
||||
struct layer *reset_layer;
|
||||
struct layer *update_layer;
|
||||
struct layer *state_layer;
|
||||
|
||||
struct layer *input_gate_layer;
|
||||
struct layer *state_gate_layer;
|
||||
struct layer *input_save_layer;
|
||||
@ -335,6 +341,9 @@ struct layer{
|
||||
float *bias_v_gpu;
|
||||
float *scale_v_gpu;
|
||||
|
||||
float * combine_gpu;
|
||||
float * combine_delta_gpu;
|
||||
|
||||
float * prev_state_gpu;
|
||||
float * forgot_state_gpu;
|
||||
float * forgot_delta_gpu;
|
||||
@ -575,10 +584,10 @@ void normalize_cpu(float *x, float *mean, float *variance, int batch, int filter
|
||||
|
||||
int best_3d_shift_r(image a, image b, int min, int max);
|
||||
#ifdef GPU
|
||||
void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
|
||||
void fill_ongpu(int N, float ALPHA, float * X, int INCX);
|
||||
void scal_ongpu(int N, float ALPHA, float * X, int INCX);
|
||||
void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
|
||||
void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
|
||||
void fill_gpu(int N, float ALPHA, float * X, int INCX);
|
||||
void scal_gpu(int N, float ALPHA, float * X, int INCX);
|
||||
void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
|
||||
|
||||
void cuda_set_device(int n);
|
||||
void cuda_free(float *x_gpu);
|
||||
|
@ -152,13 +152,13 @@ __global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delt
|
||||
if(i < n) delta[i] *= gradient_kernel(x[i], a);
|
||||
}
|
||||
|
||||
extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a)
|
||||
extern "C" void activate_array_gpu(float *x, int n, ACTIVATION a)
|
||||
{
|
||||
activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta)
|
||||
extern "C" void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta)
|
||||
{
|
||||
gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta);
|
||||
check_error(cudaPeekAtLastError());
|
||||
|
@ -51,13 +51,13 @@ void backward_activation_layer(layer l, network net)
|
||||
|
||||
void forward_activation_layer_gpu(layer l, network net)
|
||||
{
|
||||
copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
|
||||
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
|
||||
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
}
|
||||
|
||||
void backward_activation_layer_gpu(layer l, network net)
|
||||
{
|
||||
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
}
|
||||
#endif
|
||||
|
@ -12,8 +12,8 @@ float gradient(float x, ACTIVATION a);
|
||||
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
|
||||
void activate_array(float *x, const int n, const ACTIVATION a);
|
||||
#ifdef GPU
|
||||
void activate_array_ongpu(float *x, int n, ACTIVATION a);
|
||||
void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
|
||||
void activate_array_gpu(float *x, int n, ACTIVATION a);
|
||||
void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta);
|
||||
#endif
|
||||
|
||||
static inline float stair_activate(float x)
|
||||
|
@ -188,8 +188,8 @@ void push_batchnorm_layer(layer l)
|
||||
|
||||
void forward_batchnorm_layer_gpu(layer l, network net)
|
||||
{
|
||||
if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
|
||||
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
|
||||
if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
|
||||
if (net.train) {
|
||||
#ifdef CUDNN
|
||||
float one = 1;
|
||||
@ -215,14 +215,14 @@ void forward_batchnorm_layer_gpu(layer l, network net)
|
||||
fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
|
||||
fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
|
||||
|
||||
scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
|
||||
axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
|
||||
scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
|
||||
axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
|
||||
scal_gpu(l.out_c, .99, l.rolling_mean_gpu, 1);
|
||||
axpy_gpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
|
||||
scal_gpu(l.out_c, .99, l.rolling_variance_gpu, 1);
|
||||
axpy_gpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
|
||||
normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
|
||||
|
||||
scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
|
||||
@ -263,7 +263,7 @@ void backward_batchnorm_layer_gpu(layer l, network net)
|
||||
.00001,
|
||||
l.mean_gpu,
|
||||
l.variance_gpu);
|
||||
copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
|
||||
#else
|
||||
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
|
||||
backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
|
||||
@ -274,6 +274,6 @@ void backward_batchnorm_layer_gpu(layer l, network net)
|
||||
fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
|
||||
normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
|
||||
#endif
|
||||
if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
}
|
||||
#endif
|
||||
|
46
src/blas.c
46
src/blas.c
@ -55,6 +55,16 @@ void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
|
||||
}
|
||||
}
|
||||
|
||||
void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
if(da) da[i] += dc[i] * s[i];
|
||||
if(db) db[i] += dc[i] * (1-s[i]);
|
||||
ds[i] += dc[i] * (a[i] - b[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
|
||||
{
|
||||
int stride = w1/w2;
|
||||
@ -162,12 +172,48 @@ void fill_cpu(int N, float ALPHA, float *X, int INCX)
|
||||
for(i = 0; i < N; ++i) X[i*INCX] = ALPHA;
|
||||
}
|
||||
|
||||
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
|
||||
{
|
||||
int i, j;
|
||||
int index = 0;
|
||||
for(j = 0; j < B; ++j) {
|
||||
for(i = 0; i < NX; ++i){
|
||||
if(X) X[j*NX + i] += OUT[index];
|
||||
++index;
|
||||
}
|
||||
for(i = 0; i < NY; ++i){
|
||||
if(Y) Y[j*NY + i] += OUT[index];
|
||||
++index;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
|
||||
{
|
||||
int i, j;
|
||||
int index = 0;
|
||||
for(j = 0; j < B; ++j) {
|
||||
for(i = 0; i < NX; ++i){
|
||||
OUT[index++] = X[j*NX + i];
|
||||
}
|
||||
for(i = 0; i < NY; ++i){
|
||||
OUT[index++] = Y[j*NY + i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX];
|
||||
}
|
||||
|
||||
void mult_add_into_cpu(int N, float *X, float *Y, float *Z)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < N; ++i) Z[i] += X[i]*Y[i];
|
||||
}
|
||||
|
||||
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
{
|
||||
int i;
|
||||
|
34
src/blas.h
34
src/blas.h
@ -10,8 +10,12 @@ void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward
|
||||
|
||||
void test_blas();
|
||||
|
||||
void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
|
||||
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
|
||||
void mult_add_into_cpu(int N, float *X, float *Y, float *Z);
|
||||
|
||||
void const_cpu(int N, float ALPHA, float *X, int INCX);
|
||||
void constrain_ongpu(int N, float ALPHA, float * X, int INCX);
|
||||
void constrain_gpu(int N, float ALPHA, float * X, int INCX);
|
||||
void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
|
||||
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY);
|
||||
|
||||
@ -33,6 +37,7 @@ void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);
|
||||
void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc);
|
||||
|
||||
void softmax(float *input, int n, float temp, int stride, float *output);
|
||||
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
|
||||
@ -41,16 +46,17 @@ void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, i
|
||||
#include "cuda.h"
|
||||
#include "tree.h"
|
||||
|
||||
void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
|
||||
void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
|
||||
void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
|
||||
void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
|
||||
void add_ongpu(int N, float ALPHA, float * X, int INCX);
|
||||
void supp_ongpu(int N, float ALPHA, float * X, int INCX);
|
||||
void mask_ongpu(int N, float * X, float mask_num, float * mask);
|
||||
void const_ongpu(int N, float ALPHA, float *X, int INCX);
|
||||
void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
|
||||
void mul_ongpu(int N, float *X, int INCX, float *Y, int INCY);
|
||||
void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
|
||||
void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
|
||||
void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
|
||||
void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
|
||||
void add_gpu(int N, float ALPHA, float * X, int INCX);
|
||||
void supp_gpu(int N, float ALPHA, float * X, int INCX);
|
||||
void mask_gpu(int N, float * X, float mask_num, float * mask);
|
||||
void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale);
|
||||
void const_gpu(int N, float ALPHA, float *X, int INCX);
|
||||
void pow_gpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
|
||||
void mul_gpu(int N, float *X, int INCX, float *Y, int INCY);
|
||||
|
||||
void mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
|
||||
void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
|
||||
@ -76,14 +82,16 @@ void l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc);
|
||||
void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c);
|
||||
void mult_add_into_gpu(int num, float *a, float *b, float *c);
|
||||
void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
|
||||
void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
|
||||
|
||||
void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
|
||||
void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
|
||||
|
||||
void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
|
||||
void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
|
||||
void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);
|
||||
|
||||
void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);
|
||||
void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out);
|
||||
void softmax_tree(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier);
|
||||
|
||||
#endif
|
||||
|
@ -176,16 +176,16 @@ extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2
|
||||
|
||||
extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t)
|
||||
{
|
||||
scal_ongpu(n, B1, m, 1);
|
||||
scal_ongpu(n, B2, v, 1);
|
||||
axpy_ongpu(n, -decay*batch, w, 1, d, 1);
|
||||
scal_gpu(n, B1, m, 1);
|
||||
scal_gpu(n, B2, v, 1);
|
||||
axpy_gpu(n, -decay*batch, w, 1, d, 1);
|
||||
|
||||
axpy_ongpu(n, (1-B1), d, 1, m, 1);
|
||||
mul_ongpu(n, d, 1, d, 1);
|
||||
axpy_ongpu(n, (1-B2), d, 1, v, 1);
|
||||
axpy_gpu(n, (1-B1), d, 1, m, 1);
|
||||
mul_gpu(n, d, 1, d, 1);
|
||||
axpy_gpu(n, (1-B2), d, 1, v, 1);
|
||||
|
||||
adam_gpu(n, w, m, v, B1, B2, rate/batch, eps, t);
|
||||
fill_ongpu(n, 0, d, 1);
|
||||
adam_gpu(n, w, m, v, B1, B2, rate, eps, t);
|
||||
fill_gpu(n, 0, d, 1);
|
||||
}
|
||||
|
||||
__global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial)
|
||||
@ -556,35 +556,35 @@ extern "C" void variance_gpu(float *x, float *mean, int batch, int filters, int
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
|
||||
extern "C" void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
|
||||
{
|
||||
axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
|
||||
axpy_gpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
|
||||
}
|
||||
|
||||
extern "C" void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
|
||||
extern "C" void pow_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
|
||||
{
|
||||
pow_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX, Y, INCY);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
|
||||
extern "C" void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
|
||||
{
|
||||
axpy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY)
|
||||
extern "C" void copy_gpu(int N, float * X, int INCX, float * Y, int INCY)
|
||||
{
|
||||
copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY);
|
||||
copy_gpu_offset(N, X, 0, INCX, Y, 0, INCY);
|
||||
}
|
||||
|
||||
extern "C" void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY)
|
||||
extern "C" void mul_gpu(int N, float * X, int INCX, float * Y, int INCY)
|
||||
{
|
||||
mul_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, INCX, Y, INCY);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
|
||||
extern "C" void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
|
||||
{
|
||||
copy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, OFFX, INCX, Y, OFFY, INCY);
|
||||
check_error(cudaPeekAtLastError());
|
||||
@ -607,58 +607,70 @@ __global__ void flatten_kernel(int N, float *x, int spatial, int layers, int bat
|
||||
else out[i1] = x[i2];
|
||||
}
|
||||
|
||||
extern "C" void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out)
|
||||
extern "C" void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out)
|
||||
{
|
||||
int size = spatial*batch*layers;
|
||||
flatten_kernel<<<cuda_gridsize(size), BLOCK>>>(size, x, spatial, layers, batch, forward, out);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
|
||||
extern "C" void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
|
||||
{
|
||||
int size = w*h*c*batch;
|
||||
reorg_kernel<<<cuda_gridsize(size), BLOCK>>>(size, x, w, h, c, batch, stride, forward, out);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask)
|
||||
__global__ void scale_mask_kernel(int n, float *x, float mask_num, float *mask, float scale)
|
||||
{
|
||||
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(i < n && mask[i] == mask_num) x[i] *= scale;
|
||||
}
|
||||
|
||||
extern "C" void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale)
|
||||
{
|
||||
scale_mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask, scale);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void mask_gpu(int N, float * X, float mask_num, float * mask)
|
||||
{
|
||||
mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void const_ongpu(int N, float ALPHA, float * X, int INCX)
|
||||
extern "C" void const_gpu(int N, float ALPHA, float * X, int INCX)
|
||||
{
|
||||
const_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void constrain_ongpu(int N, float ALPHA, float * X, int INCX)
|
||||
extern "C" void constrain_gpu(int N, float ALPHA, float * X, int INCX)
|
||||
{
|
||||
constrain_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
|
||||
extern "C" void add_ongpu(int N, float ALPHA, float * X, int INCX)
|
||||
extern "C" void add_gpu(int N, float ALPHA, float * X, int INCX)
|
||||
{
|
||||
add_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX)
|
||||
extern "C" void scal_gpu(int N, float ALPHA, float * X, int INCX)
|
||||
{
|
||||
scal_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void supp_ongpu(int N, float ALPHA, float * X, int INCX)
|
||||
extern "C" void supp_gpu(int N, float ALPHA, float * X, int INCX)
|
||||
{
|
||||
supp_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void fill_ongpu(int N, float ALPHA, float * X, int INCX)
|
||||
extern "C" void fill_gpu(int N, float ALPHA, float * X, int INCX)
|
||||
{
|
||||
fill_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
|
||||
check_error(cudaPeekAtLastError());
|
||||
@ -686,6 +698,9 @@ extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int
|
||||
int minw = (w1 < w2) ? w1 : w2;
|
||||
int minh = (h1 < h2) ? h1 : h2;
|
||||
int minc = (c1 < c2) ? c1 : c2;
|
||||
assert(w1 == w2);
|
||||
assert(h1 == h2);
|
||||
assert(c1 == c2);
|
||||
|
||||
int stride = w1/w2;
|
||||
int sample = w2/w1;
|
||||
@ -765,6 +780,46 @@ __global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float *
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void deinter_kernel(int NX, float *X, int NY, float *Y, int B, float *OUT)
|
||||
{
|
||||
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(i < (NX+NY)*B){
|
||||
int b = i / (NX+NY);
|
||||
int j = i % (NX+NY);
|
||||
if (j < NX){
|
||||
if(X) X[b*NX + j] += OUT[i];
|
||||
} else {
|
||||
if(Y) Y[b*NY + j - NX] += OUT[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
|
||||
{
|
||||
deinter_kernel<<<cuda_gridsize((NX+NY)*B), BLOCK>>>(NX, X, NY, Y, B, OUT);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
__global__ void inter_kernel(int NX, float *X, int NY, float *Y, int B, float *OUT)
|
||||
{
|
||||
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(i < (NX+NY)*B){
|
||||
int b = i / (NX+NY);
|
||||
int j = i % (NX+NY);
|
||||
if (j < NX){
|
||||
OUT[i] = X[b*NX + j];
|
||||
} else {
|
||||
OUT[i] = Y[b*NY + j - NX];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
|
||||
{
|
||||
inter_kernel<<<cuda_gridsize((NX+NY)*B), BLOCK>>>(NX, X, NY, Y, B, OUT);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c)
|
||||
{
|
||||
weighted_sum_kernel<<<cuda_gridsize(num), BLOCK>>>(num, a, b, s, c);
|
||||
@ -776,8 +831,8 @@ __global__ void weighted_delta_kernel(int n, float *a, float *b, float *s, float
|
||||
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(i < n){
|
||||
if(da) da[i] += dc[i] * s[i];
|
||||
db[i] += dc[i] * (1-s[i]);
|
||||
ds[i] += dc[i] * a[i] + dc[i] * -b[i];
|
||||
if(db) db[i] += dc[i] * (1-s[i]);
|
||||
ds[i] += dc[i] * (a[i] - b[i]);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6,7 +6,7 @@ void col2im_cpu(float* data_col,
|
||||
int ksize, int stride, int pad, float* data_im);
|
||||
|
||||
#ifdef GPU
|
||||
void col2im_ongpu(float *data_col,
|
||||
void col2im_gpu(float *data_col,
|
||||
int channels, int height, int width,
|
||||
int ksize, int stride, int pad, float *data_im);
|
||||
#endif
|
||||
|
@ -41,7 +41,7 @@ __global__ void col2im_gpu_kernel(const int n, const float* data_col,
|
||||
}
|
||||
}
|
||||
|
||||
void col2im_ongpu(float *data_col,
|
||||
void col2im_gpu(float *data_col,
|
||||
int channels, int height, int width,
|
||||
int ksize, int stride, int pad, float *data_im){
|
||||
// We are going to launch channels * height_col * width_col kernels, each
|
||||
|
@ -271,23 +271,23 @@ void update_connected_layer_gpu(layer l, update_args a)
|
||||
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t);
|
||||
}
|
||||
}else{
|
||||
axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
|
||||
axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1);
|
||||
|
||||
if(l.batch_normalize){
|
||||
axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1);
|
||||
axpy_gpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
scal_gpu(l.outputs, momentum, l.scale_updates_gpu, 1);
|
||||
}
|
||||
|
||||
axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_gpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
|
||||
}
|
||||
}
|
||||
|
||||
void forward_connected_layer_gpu(layer l, network net)
|
||||
{
|
||||
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
||||
fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
||||
|
||||
int m = l.batch;
|
||||
int k = l.inputs;
|
||||
@ -295,20 +295,20 @@ void forward_connected_layer_gpu(layer l, network net)
|
||||
float * a = net.input_gpu;
|
||||
float * b = l.weights_gpu;
|
||||
float * c = l.output_gpu;
|
||||
gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||
gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||
|
||||
if (l.batch_normalize) {
|
||||
forward_batchnorm_layer_gpu(l, net);
|
||||
} else {
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.outputs, 1);
|
||||
}
|
||||
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
}
|
||||
|
||||
void backward_connected_layer_gpu(layer l, network net)
|
||||
{
|
||||
constrain_ongpu(l.outputs*l.batch, 5, l.delta_gpu, 1);
|
||||
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
|
||||
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
if(l.batch_normalize){
|
||||
backward_batchnorm_layer_gpu(l, net);
|
||||
} else {
|
||||
@ -321,7 +321,7 @@ void backward_connected_layer_gpu(layer l, network net)
|
||||
float * a = l.delta_gpu;
|
||||
float * b = net.input_gpu;
|
||||
float * c = l.weight_updates_gpu;
|
||||
gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n);
|
||||
gemm_gpu(1,0,m,n,k,1,a,m,b,n,1,c,n);
|
||||
|
||||
m = l.batch;
|
||||
k = l.outputs;
|
||||
@ -331,6 +331,6 @@ void backward_connected_layer_gpu(layer l, network net)
|
||||
b = l.weights_gpu;
|
||||
c = net.delta_gpu;
|
||||
|
||||
if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
if(c) gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
}
|
||||
#endif
|
||||
|
@ -72,7 +72,7 @@ void binarize_weights_gpu(float *weights, int n, int size, float *binary)
|
||||
|
||||
void forward_convolutional_layer_gpu(convolutional_layer l, network net)
|
||||
{
|
||||
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
||||
fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
||||
if(l.binary){
|
||||
binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
|
||||
swap_binary(&l);
|
||||
@ -107,11 +107,11 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net)
|
||||
int k = l.size*l.size*l.c;
|
||||
int n = l.out_w*l.out_h;
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
im2col_ongpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
|
||||
im2col_gpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
|
||||
float * a = l.weights_gpu;
|
||||
float * b = net.workspace;
|
||||
float * c = l.output_gpu;
|
||||
gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
|
||||
gemm_gpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -121,7 +121,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net)
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
}
|
||||
|
||||
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
//if(l.dot > 0) dot_error_gpu(l);
|
||||
if(l.binary || l.xnor) swap_binary(&l);
|
||||
}
|
||||
@ -173,8 +173,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
|
||||
if(l.smooth){
|
||||
smooth_layer(l, 5, l.smooth);
|
||||
}
|
||||
constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
|
||||
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
|
||||
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
|
||||
|
||||
if(l.batch_normalize){
|
||||
@ -217,7 +217,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
|
||||
l.dsrcTensorDesc,
|
||||
net.delta_gpu);
|
||||
if(l.binary || l.xnor) swap_binary(&l);
|
||||
if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
|
||||
if(l.xnor) gradient_array_gpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
|
||||
}
|
||||
|
||||
#else
|
||||
@ -231,8 +231,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
|
||||
float * b = net.workspace;
|
||||
float * c = l.weight_updates_gpu;
|
||||
|
||||
im2col_ongpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
|
||||
gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);
|
||||
im2col_gpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
|
||||
gemm_gpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);
|
||||
|
||||
if(net.delta_gpu){
|
||||
if(l.binary || l.xnor) swap_binary(&l);
|
||||
@ -240,13 +240,13 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
|
||||
float * b = l.delta_gpu;
|
||||
float * c = net.workspace;
|
||||
|
||||
gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k);
|
||||
gemm_gpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k);
|
||||
|
||||
col2im_ongpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu + i*l.c*l.h*l.w);
|
||||
col2im_gpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu + i*l.c*l.h*l.w);
|
||||
if(l.binary || l.xnor) {
|
||||
swap_binary(&l);
|
||||
}
|
||||
if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
|
||||
if(l.xnor) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -294,16 +294,16 @@ void update_convolutional_layer_gpu(layer l, update_args a)
|
||||
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
|
||||
}
|
||||
}else{
|
||||
axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_gpu(size, momentum, l.weight_updates_gpu, 1);
|
||||
|
||||
axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1);
|
||||
axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
|
||||
|
||||
if(l.scales_gpu){
|
||||
axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1);
|
||||
axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -9,6 +9,7 @@
|
||||
|
||||
COST_TYPE get_cost_type(char *s)
|
||||
{
|
||||
if (strcmp(s, "seg")==0) return SEG;
|
||||
if (strcmp(s, "sse")==0) return SSE;
|
||||
if (strcmp(s, "masked")==0) return MASKED;
|
||||
if (strcmp(s, "smooth")==0) return SMOOTH;
|
||||
@ -20,6 +21,8 @@ COST_TYPE get_cost_type(char *s)
|
||||
char *get_cost_string(COST_TYPE a)
|
||||
{
|
||||
switch(a){
|
||||
case SEG:
|
||||
return "seg";
|
||||
case SSE:
|
||||
return "sse";
|
||||
case MASKED:
|
||||
@ -122,11 +125,11 @@ void forward_cost_layer_gpu(cost_layer l, network net)
|
||||
{
|
||||
if (!net.truth_gpu) return;
|
||||
if(l.smooth){
|
||||
scal_ongpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1);
|
||||
add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1);
|
||||
scal_gpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1);
|
||||
add_gpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1);
|
||||
}
|
||||
if (l.cost_type == MASKED) {
|
||||
mask_ongpu(l.batch*l.inputs, net.input_gpu, SECRET_NUM, net.truth_gpu);
|
||||
mask_gpu(l.batch*l.inputs, net.input_gpu, SECRET_NUM, net.truth_gpu);
|
||||
}
|
||||
|
||||
if(l.cost_type == SMOOTH){
|
||||
@ -137,6 +140,11 @@ void forward_cost_layer_gpu(cost_layer l, network net)
|
||||
l2_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
|
||||
}
|
||||
|
||||
if (l.cost_type == SEG && l.noobject_scale != 1) {
|
||||
scale_mask_gpu(l.batch*l.inputs, l.delta_gpu, 0, net.truth_gpu, l.noobject_scale);
|
||||
scale_mask_gpu(l.batch*l.inputs, l.output_gpu, 0, net.truth_gpu, l.noobject_scale);
|
||||
}
|
||||
|
||||
if(l.ratio){
|
||||
cuda_pull_array(l.delta_gpu, l.delta, l.batch*l.inputs);
|
||||
qsort(l.delta, l.batch*l.inputs, sizeof(float), float_abs_compare);
|
||||
@ -144,11 +152,11 @@ void forward_cost_layer_gpu(cost_layer l, network net)
|
||||
float thresh = l.delta[n];
|
||||
thresh = 0;
|
||||
printf("%f\n", thresh);
|
||||
supp_ongpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
|
||||
supp_gpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
|
||||
}
|
||||
|
||||
if(l.thresh){
|
||||
supp_ongpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
|
||||
supp_gpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
|
||||
}
|
||||
|
||||
cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs);
|
||||
@ -157,7 +165,7 @@ void forward_cost_layer_gpu(cost_layer l, network net)
|
||||
|
||||
void backward_cost_layer_gpu(const cost_layer l, network net)
|
||||
{
|
||||
axpy_ongpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
axpy_gpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -209,10 +209,10 @@ void forward_crnn_layer_gpu(layer l, network net)
|
||||
layer self_layer = *(l.self_layer);
|
||||
layer output_layer = *(l.output_layer);
|
||||
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
|
||||
fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
|
||||
fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
|
||||
if(net.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
|
||||
fill_gpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
|
||||
fill_gpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
|
||||
if(net.train) fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
|
||||
|
||||
for (i = 0; i < l.steps; ++i) {
|
||||
s.input_gpu = net.input_gpu;
|
||||
@ -224,12 +224,12 @@ void forward_crnn_layer_gpu(layer l, network net)
|
||||
float *old_state = l.state_gpu;
|
||||
if(net.train) l.state_gpu += l.hidden*l.batch;
|
||||
if(l.shortcut){
|
||||
copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
|
||||
copy_gpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
|
||||
}else{
|
||||
fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
|
||||
fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
|
||||
}
|
||||
axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
axpy_gpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
|
||||
s.input_gpu = l.state_gpu;
|
||||
forward_convolutional_layer_gpu(output_layer, s);
|
||||
@ -254,8 +254,8 @@ void backward_crnn_layer_gpu(layer l, network net)
|
||||
increment_layer(&output_layer, l.steps - 1);
|
||||
l.state_gpu += l.hidden*l.batch*l.steps;
|
||||
for (i = l.steps-1; i >= 0; --i) {
|
||||
copy_ongpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
copy_gpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
|
||||
s.input_gpu = l.state_gpu;
|
||||
s.delta_gpu = self_layer.delta_gpu;
|
||||
@ -268,8 +268,8 @@ void backward_crnn_layer_gpu(layer l, network net)
|
||||
if (i == 0) s.delta_gpu = 0;
|
||||
backward_convolutional_layer_gpu(self_layer, s);
|
||||
|
||||
copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
|
||||
if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
|
||||
copy_gpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
|
||||
if (i > 0 && l.shortcut) axpy_gpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
|
||||
s.input_gpu = net.input_gpu + i*l.inputs*l.batch;
|
||||
if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch;
|
||||
else s.delta_gpu = 0;
|
||||
|
@ -97,7 +97,7 @@ float *cuda_make_array(float *x, size_t n)
|
||||
status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
|
||||
check_error(status);
|
||||
} else {
|
||||
fill_ongpu(n, 0, x_gpu, 1);
|
||||
fill_gpu(n, 0, x_gpu, 1);
|
||||
}
|
||||
if(!x_gpu) error("Cuda malloc failed\n");
|
||||
return x_gpu;
|
||||
|
37
src/data.c
37
src/data.c
@ -551,6 +551,33 @@ void exclusive_image(image src)
|
||||
}
|
||||
|
||||
image get_segmentation_image(char *path, int w, int h, int classes)
|
||||
{
|
||||
char labelpath[4096];
|
||||
find_replace(path, "images", "mask", labelpath);
|
||||
find_replace(labelpath, "JPEGImages", "mask", labelpath);
|
||||
find_replace(labelpath, ".jpg", ".txt", labelpath);
|
||||
find_replace(labelpath, ".JPG", ".txt", labelpath);
|
||||
find_replace(labelpath, ".JPEG", ".txt", labelpath);
|
||||
image mask = make_image(w, h, classes);
|
||||
FILE *file = fopen(labelpath, "r");
|
||||
if(!file) file_error(labelpath);
|
||||
char buff[32788];
|
||||
int id;
|
||||
image part = make_image(w, h, 1);
|
||||
while(fscanf(file, "%d %s", &id, buff) == 2){
|
||||
int n = 0;
|
||||
int *rle = read_intlist(buff, &n, 0);
|
||||
load_rle(part, rle, n);
|
||||
or_image(part, mask, id);
|
||||
free(rle);
|
||||
}
|
||||
//exclusive_image(mask);
|
||||
fclose(file);
|
||||
free_image(part);
|
||||
return mask;
|
||||
}
|
||||
|
||||
image get_segmentation_image2(char *path, int w, int h, int classes)
|
||||
{
|
||||
char labelpath[4096];
|
||||
find_replace(path, "images", "mask", labelpath);
|
||||
@ -584,7 +611,7 @@ image get_segmentation_image(char *path, int w, int h, int classes)
|
||||
return mask;
|
||||
}
|
||||
|
||||
data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int min, int max, float angle, float aspect, float hue, float saturation, float exposure)
|
||||
data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int min, int max, float angle, float aspect, float hue, float saturation, float exposure, int div)
|
||||
{
|
||||
char **random_paths = get_random_paths(paths, n, m);
|
||||
int i;
|
||||
@ -597,12 +624,14 @@ data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int mi
|
||||
|
||||
|
||||
d.y.rows = n;
|
||||
d.y.cols = h*w*(classes+1);
|
||||
d.y.cols = h*w*classes/div/div;
|
||||
d.y.vals = calloc(d.X.rows, sizeof(float*));
|
||||
|
||||
for(i = 0; i < n; ++i){
|
||||
image orig = load_image_color(random_paths[i], 0, 0);
|
||||
augment_args a = random_augment_args(orig, angle, aspect, min, max, w, h);
|
||||
a.dx = 0;
|
||||
a.dy = 0;
|
||||
image sized = rotate_crop_image(orig, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
|
||||
|
||||
int flip = rand()%2;
|
||||
@ -612,7 +641,7 @@ data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int mi
|
||||
|
||||
image mask = get_segmentation_image(random_paths[i], orig.w, orig.h, classes);
|
||||
//image mask = make_image(orig.w, orig.h, classes+1);
|
||||
image sized_m = rotate_crop_image(mask, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
|
||||
image sized_m = rotate_crop_image(mask, a.rad, a.scale/div, a.w/div, a.h/div, a.dx, a.dy, a.aspect);
|
||||
|
||||
if(flip) flip_image(sized_m);
|
||||
d.y.vals[i] = sized_m.data;
|
||||
@ -874,7 +903,7 @@ void *load_thread(void *ptr)
|
||||
} else if (a.type == WRITING_DATA){
|
||||
*a.d = load_data_writing(a.paths, a.n, a.m, a.w, a.h, a.out_w, a.out_h);
|
||||
} else if (a.type == SEGMENTATION_DATA){
|
||||
*a.d = load_data_seg(a.n, a.paths, a.m, a.w, a.h, a.classes, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
|
||||
*a.d = load_data_seg(a.n, a.paths, a.m, a.w, a.h, a.classes, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure, a.scale);
|
||||
} else if (a.type == REGION_DATA){
|
||||
*a.d = load_data_region(a.n, a.paths, a.m, a.w, a.h, a.num_boxes, a.classes, a.jitter, a.hue, a.saturation, a.exposure);
|
||||
} else if (a.type == DETECTION_DATA){
|
||||
|
@ -22,31 +22,31 @@ extern "C" void forward_deconvolutional_layer_gpu(layer l, network net)
|
||||
int n = l.h*l.w;
|
||||
int k = l.c;
|
||||
|
||||
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
||||
fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
float *a = l.weights_gpu;
|
||||
float *b = net.input_gpu + i*l.c*l.h*l.w;
|
||||
float *c = net.workspace;
|
||||
|
||||
gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
|
||||
gemm_gpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
|
||||
|
||||
col2im_ongpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs);
|
||||
col2im_gpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs);
|
||||
}
|
||||
if (l.batch_normalize) {
|
||||
forward_batchnorm_layer_gpu(l, net);
|
||||
} else {
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
}
|
||||
activate_array_ongpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation);
|
||||
activate_array_gpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation);
|
||||
}
|
||||
|
||||
extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
|
||||
{
|
||||
int i;
|
||||
|
||||
constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
|
||||
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
|
||||
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
|
||||
if(l.batch_normalize){
|
||||
backward_batchnorm_layer_gpu(l, net);
|
||||
@ -65,9 +65,9 @@ extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
|
||||
float *b = net.workspace;
|
||||
float *c = l.weight_updates_gpu;
|
||||
|
||||
im2col_ongpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w,
|
||||
im2col_gpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w,
|
||||
l.size, l.stride, l.pad, b);
|
||||
gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||
gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||
|
||||
if(net.delta_gpu){
|
||||
int m = l.c;
|
||||
@ -78,7 +78,7 @@ extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
|
||||
float *b = net.workspace;
|
||||
float *c = net.delta_gpu + i*n*m;
|
||||
|
||||
gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -125,16 +125,16 @@ void update_deconvolutional_layer_gpu(layer l, update_args a)
|
||||
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
|
||||
}
|
||||
}else{
|
||||
axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_gpu(size, momentum, l.weight_updates_gpu, 1);
|
||||
|
||||
axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1);
|
||||
axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
|
||||
|
||||
if(l.scales_gpu){
|
||||
axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1);
|
||||
axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -256,7 +256,7 @@ void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box
|
||||
void forward_detection_layer_gpu(const detection_layer l, network net)
|
||||
{
|
||||
if(!net.train){
|
||||
copy_ongpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
|
||||
copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -270,8 +270,8 @@ void forward_detection_layer_gpu(const detection_layer l, network net)
|
||||
|
||||
void backward_detection_layer_gpu(detection_layer l, network net)
|
||||
{
|
||||
axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
//copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
//copy_gpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
58
src/gemm.c
58
src/gemm.c
@ -165,7 +165,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
|
||||
#include <math.h>
|
||||
|
||||
void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A_gpu, int lda,
|
||||
float *B_gpu, int ldb,
|
||||
float BETA,
|
||||
@ -177,24 +177,6 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A, int lda,
|
||||
float *B, int ldb,
|
||||
float BETA,
|
||||
float *C, int ldc)
|
||||
{
|
||||
float *A_gpu = cuda_make_array(A, (TA ? lda*K:lda*M));
|
||||
float *B_gpu = cuda_make_array(B, (TB ? ldb*N : ldb*K));
|
||||
float *C_gpu = cuda_make_array(C, ldc*M);
|
||||
|
||||
gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
|
||||
|
||||
cuda_pull_array(C_gpu, C, ldc*M);
|
||||
cuda_free(A_gpu);
|
||||
cuda_free(B_gpu);
|
||||
cuda_free(C_gpu);
|
||||
}
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@ -224,7 +206,7 @@ void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
|
||||
free(c);
|
||||
}
|
||||
|
||||
void time_ongpu(int TA, int TB, int m, int k, int n)
|
||||
void time_gpu(int TA, int TB, int m, int k, int n)
|
||||
{
|
||||
int iter = 10;
|
||||
float *a = random_matrix(m,k);
|
||||
@ -242,7 +224,7 @@ void time_ongpu(int TA, int TB, int m, int k, int n)
|
||||
int i;
|
||||
clock_t start = clock(), end;
|
||||
for(i = 0; i<iter; ++i){
|
||||
gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
|
||||
gemm_gpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
|
||||
cudaThreadSynchronize();
|
||||
}
|
||||
double flop = ((double)m)*n*(2.*k + 2.)*iter;
|
||||
@ -313,24 +295,24 @@ int test_gpu_blas()
|
||||
|
||||
test_gpu_accuracy(0,0,10,10,10);
|
||||
|
||||
time_ongpu(0,0,64,2916,363);
|
||||
time_ongpu(0,0,64,2916,363);
|
||||
time_ongpu(0,0,64,2916,363);
|
||||
time_ongpu(0,0,192,729,1600);
|
||||
time_ongpu(0,0,384,196,1728);
|
||||
time_ongpu(0,0,256,196,3456);
|
||||
time_ongpu(0,0,256,196,2304);
|
||||
time_ongpu(0,0,128,4096,12544);
|
||||
time_ongpu(0,0,128,4096,4096);
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,64,2916,363);
|
||||
time_gpu(0,0,192,729,1600);
|
||||
time_gpu(0,0,384,196,1728);
|
||||
time_gpu(0,0,256,196,3456);
|
||||
time_gpu(0,0,256,196,2304);
|
||||
time_gpu(0,0,128,4096,12544);
|
||||
time_gpu(0,0,128,4096,4096);
|
||||
*/
|
||||
time_ongpu(0,0,64,75,12544);
|
||||
time_ongpu(0,0,64,75,12544);
|
||||
time_ongpu(0,0,64,75,12544);
|
||||
time_ongpu(0,0,64,576,12544);
|
||||
time_ongpu(0,0,256,2304,784);
|
||||
time_ongpu(1,1,2304,256,784);
|
||||
time_ongpu(0,0,512,4608,196);
|
||||
time_ongpu(1,1,4608,512,196);
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,75,12544);
|
||||
time_gpu(0,0,64,576,12544);
|
||||
time_gpu(0,0,256,2304,784);
|
||||
time_gpu(1,1,2304,256,784);
|
||||
time_gpu(0,0,512,4608,196);
|
||||
time_gpu(1,1,4608,512,196);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -19,7 +19,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *C, int ldc);
|
||||
|
||||
#ifdef GPU
|
||||
void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
|
||||
float *A_gpu, int lda,
|
||||
float *B_gpu, int ldb,
|
||||
float BETA,
|
||||
|
@ -238,16 +238,16 @@ void forward_gru_layer_gpu(layer l, network net)
|
||||
layer wr = *(l.wr);
|
||||
layer wh = *(l.wh);
|
||||
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, uz.delta_gpu, 1);
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, ur.delta_gpu, 1);
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, uh.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, uz.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, ur.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, uh.delta_gpu, 1);
|
||||
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, wz.delta_gpu, 1);
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, wr.delta_gpu, 1);
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, wh.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, wz.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, wr.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, wh.delta_gpu, 1);
|
||||
if(net.train) {
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
|
||||
copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
|
||||
}
|
||||
|
||||
for (i = 0; i < l.steps; ++i) {
|
||||
@ -260,32 +260,32 @@ void forward_gru_layer_gpu(layer l, network net)
|
||||
forward_connected_layer_gpu(ur, s);
|
||||
forward_connected_layer_gpu(uh, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
|
||||
|
||||
activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
|
||||
mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
|
||||
mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
|
||||
|
||||
s.input_gpu = l.forgot_state_gpu;
|
||||
forward_connected_layer_gpu(wh, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
|
||||
|
||||
if(l.tanh){
|
||||
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
|
||||
activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
|
||||
} else {
|
||||
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
}
|
||||
|
||||
weighted_sum_gpu(l.state_gpu, l.h_gpu, l.z_gpu, l.outputs*l.batch, l.output_gpu);
|
||||
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);
|
||||
|
||||
net.input_gpu += l.inputs*l.batch;
|
||||
l.output_gpu += l.outputs*l.batch;
|
||||
@ -324,56 +324,58 @@ void backward_gru_layer_gpu(layer l, network net)
|
||||
if(net.delta_gpu) net.delta_gpu += l.inputs*l.batch*(l.steps-1);
|
||||
l.output_gpu += l.outputs*l.batch*(l.steps-1);
|
||||
l.delta_gpu += l.outputs*l.batch*(l.steps-1);
|
||||
float *end_state = l.output_gpu;
|
||||
for (i = l.steps-1; i >= 0; --i) {
|
||||
if(i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
|
||||
if(i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
|
||||
else copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1);
|
||||
float *prev_delta_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
|
||||
|
||||
activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
|
||||
|
||||
if(l.tanh){
|
||||
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
|
||||
activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
|
||||
} else {
|
||||
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
}
|
||||
|
||||
weighted_delta_gpu(l.prev_state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, uh.delta_gpu, uz.delta_gpu, l.outputs*l.batch, l.delta_gpu);
|
||||
weighted_delta_gpu(l.state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, uh.delta_gpu, uz.delta_gpu, l.outputs*l.batch, l.delta_gpu);
|
||||
|
||||
if(l.tanh){
|
||||
gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH, uh.delta_gpu);
|
||||
gradient_array_gpu(l.h_gpu, l.outputs*l.batch, TANH, uh.delta_gpu);
|
||||
} else {
|
||||
gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, uh.delta_gpu);
|
||||
gradient_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, uh.delta_gpu);
|
||||
}
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, uh.delta_gpu, 1, wh.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, uh.delta_gpu, 1, wh.delta_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.forgot_state_gpu, 1);
|
||||
mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
|
||||
fill_ongpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
|
||||
mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
|
||||
fill_gpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);
|
||||
|
||||
s.input_gpu = l.forgot_state_gpu;
|
||||
s.delta_gpu = l.forgot_delta_gpu;
|
||||
|
||||
backward_connected_layer_gpu(wh, s);
|
||||
if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu);
|
||||
mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.prev_state_gpu, ur.delta_gpu);
|
||||
mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.state_gpu, ur.delta_gpu);
|
||||
|
||||
gradient_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, ur.delta_gpu);
|
||||
copy_ongpu(l.outputs*l.batch, ur.delta_gpu, 1, wr.delta_gpu, 1);
|
||||
gradient_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, ur.delta_gpu);
|
||||
copy_gpu(l.outputs*l.batch, ur.delta_gpu, 1, wr.delta_gpu, 1);
|
||||
|
||||
gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, uz.delta_gpu);
|
||||
copy_ongpu(l.outputs*l.batch, uz.delta_gpu, 1, wz.delta_gpu, 1);
|
||||
gradient_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, uz.delta_gpu);
|
||||
copy_gpu(l.outputs*l.batch, uz.delta_gpu, 1, wz.delta_gpu, 1);
|
||||
|
||||
s.input_gpu = l.prev_state_gpu;
|
||||
s.input_gpu = l.state_gpu;
|
||||
s.delta_gpu = prev_delta_gpu;
|
||||
|
||||
backward_connected_layer_gpu(wr, s);
|
||||
@ -399,5 +401,6 @@ void backward_gru_layer_gpu(layer l, network net)
|
||||
increment_layer(&wr, -1);
|
||||
increment_layer(&wh, -1);
|
||||
}
|
||||
copy_gpu(l.outputs*l.batch, end_state, 1, l.state_gpu, 1);
|
||||
}
|
||||
#endif
|
||||
|
@ -7,7 +7,7 @@ void im2col_cpu(float* data_im,
|
||||
|
||||
#ifdef GPU
|
||||
|
||||
void im2col_ongpu(float *im,
|
||||
void im2col_gpu(float *im,
|
||||
int channels, int height, int width,
|
||||
int ksize, int stride, int pad,float *data_col);
|
||||
|
||||
|
@ -45,7 +45,7 @@ __global__ void im2col_gpu_kernel(const int n, const float* data_im,
|
||||
}
|
||||
}
|
||||
|
||||
void im2col_ongpu(float *im,
|
||||
void im2col_gpu(float *im,
|
||||
int channels, int height, int width,
|
||||
int ksize, int stride, int pad, float *data_col){
|
||||
// We are going to launch channels * height_col * width_col kernels, each
|
||||
|
@ -191,12 +191,12 @@ void forward_local_layer_gpu(const local_layer l, network net)
|
||||
int locations = out_h * out_w;
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
|
||||
copy_gpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
|
||||
}
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
float *input = net.input_gpu + i*l.w*l.h*l.c;
|
||||
im2col_ongpu(input, l.c, l.h, l.w,
|
||||
im2col_gpu(input, l.c, l.h, l.w,
|
||||
l.size, l.stride, l.pad, net.workspace);
|
||||
float *output = l.output_gpu + i*l.outputs;
|
||||
for(j = 0; j < locations; ++j){
|
||||
@ -208,10 +208,10 @@ void forward_local_layer_gpu(const local_layer l, network net)
|
||||
int n = 1;
|
||||
int k = l.size*l.size*l.c;
|
||||
|
||||
gemm_ongpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations);
|
||||
gemm_gpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations);
|
||||
}
|
||||
}
|
||||
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
}
|
||||
|
||||
void backward_local_layer_gpu(local_layer l, network net)
|
||||
@ -219,14 +219,14 @@ void backward_local_layer_gpu(local_layer l, network net)
|
||||
int i, j;
|
||||
int locations = l.out_w*l.out_h;
|
||||
|
||||
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
|
||||
axpy_gpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
|
||||
}
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
float *input = net.input_gpu + i*l.w*l.h*l.c;
|
||||
im2col_ongpu(input, l.c, l.h, l.w,
|
||||
im2col_gpu(input, l.c, l.h, l.w,
|
||||
l.size, l.stride, l.pad, net.workspace);
|
||||
|
||||
for(j = 0; j < locations; ++j){
|
||||
@ -237,7 +237,7 @@ void backward_local_layer_gpu(local_layer l, network net)
|
||||
int n = l.size*l.size*l.c;
|
||||
int k = 1;
|
||||
|
||||
gemm_ongpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
|
||||
gemm_gpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
|
||||
}
|
||||
|
||||
if(net.delta_gpu){
|
||||
@ -250,10 +250,10 @@ void backward_local_layer_gpu(local_layer l, network net)
|
||||
int n = 1;
|
||||
int k = l.n;
|
||||
|
||||
gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
|
||||
gemm_gpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
|
||||
}
|
||||
|
||||
col2im_ongpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu+i*l.c*l.h*l.w);
|
||||
col2im_gpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu+i*l.c*l.h*l.w);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -267,12 +267,12 @@ void update_local_layer_gpu(local_layer l, update_args a)
|
||||
|
||||
int locations = l.out_w*l.out_h;
|
||||
int size = l.size*l.size*l.c*l.n*locations;
|
||||
axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
|
||||
axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1);
|
||||
|
||||
axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_gpu(size, momentum, l.weight_updates_gpu, 1);
|
||||
}
|
||||
|
||||
void pull_local_layer(local_layer l)
|
||||
|
184
src/lstm_layer.c
184
src/lstm_layer.c
@ -41,36 +41,36 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n
|
||||
*(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.uf->batch = batch;
|
||||
|
||||
l.wf = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.wf->batch = batch;
|
||||
|
||||
l.ui = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.ui) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.ui->batch = batch;
|
||||
|
||||
l.wi = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.wi->batch = batch;
|
||||
|
||||
l.ug = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.ug) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.ug->batch = batch;
|
||||
|
||||
l.wg = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.wg->batch = batch;
|
||||
|
||||
l.uo = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.uo) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.uo->batch = batch;
|
||||
|
||||
l.wf = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.wf->batch = batch;
|
||||
|
||||
l.wi = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.wi->batch = batch;
|
||||
|
||||
l.wg = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
l.wg->batch = batch;
|
||||
|
||||
l.wo = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.wo) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
|
||||
@ -410,17 +410,17 @@ void forward_lstm_layer_gpu(layer l, network state)
|
||||
layer ug = *(l.ug);
|
||||
layer uo = *(l.uo);
|
||||
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1);
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1);
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1);
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1);
|
||||
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1);
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1);
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1);
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1);
|
||||
if (state.train) {
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
|
||||
}
|
||||
|
||||
for (i = 0; i < l.steps; ++i) {
|
||||
@ -436,34 +436,34 @@ void forward_lstm_layer_gpu(layer l, network state)
|
||||
forward_connected_layer_gpu(ug, s);
|
||||
forward_connected_layer_gpu(uo, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
|
||||
|
||||
activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
|
||||
activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH);
|
||||
activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
|
||||
mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
|
||||
mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
|
||||
mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
|
||||
mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1);
|
||||
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
|
||||
mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1);
|
||||
activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
|
||||
mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);
|
||||
copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
|
||||
|
||||
state.input_gpu += l.inputs*l.batch;
|
||||
l.output_gpu += l.outputs*l.batch;
|
||||
@ -514,97 +514,97 @@ void backward_lstm_layer_gpu(layer l, network state)
|
||||
l.delta_gpu += l.outputs*l.batch*(l.steps - 1);
|
||||
|
||||
for (i = l.steps - 1; i >= 0; --i) {
|
||||
if (i != 0) copy_ongpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1);
|
||||
copy_ongpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1);
|
||||
if (i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
|
||||
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1);
|
||||
if (i != 0) copy_gpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1);
|
||||
if (i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1);
|
||||
|
||||
l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
|
||||
axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
|
||||
|
||||
activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
|
||||
activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH);
|
||||
activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
|
||||
activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH);
|
||||
copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
|
||||
activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);
|
||||
mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);
|
||||
mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);
|
||||
|
||||
gradient_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);
|
||||
gradient_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu);
|
||||
axpy_gpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
|
||||
activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH);
|
||||
mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);
|
||||
gradient_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
|
||||
activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);
|
||||
mul_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);
|
||||
gradient_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
||||
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
|
||||
s.input_gpu = l.prev_state_gpu;
|
||||
s.delta_gpu = l.dh_gpu;
|
||||
backward_connected_layer_gpu(wo, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
|
||||
s.input_gpu = state.input_gpu;
|
||||
s.delta_gpu = state.delta_gpu;
|
||||
backward_connected_layer_gpu(uo, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||
mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
|
||||
gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
|
||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||
mul_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
|
||||
gradient_array_gpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
|
||||
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
|
||||
s.input_gpu = l.prev_state_gpu;
|
||||
s.delta_gpu = l.dh_gpu;
|
||||
backward_connected_layer_gpu(wg, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
|
||||
s.input_gpu = state.input_gpu;
|
||||
s.delta_gpu = state.delta_gpu;
|
||||
backward_connected_layer_gpu(ug, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||
mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
|
||||
gradient_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||
mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
|
||||
gradient_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
||||
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
|
||||
s.input_gpu = l.prev_state_gpu;
|
||||
s.delta_gpu = l.dh_gpu;
|
||||
backward_connected_layer_gpu(wi, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
|
||||
s.input_gpu = state.input_gpu;
|
||||
s.delta_gpu = state.delta_gpu;
|
||||
backward_connected_layer_gpu(ui, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||
mul_ongpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
|
||||
gradient_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||
mul_gpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
|
||||
gradient_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
||||
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
|
||||
s.input_gpu = l.prev_state_gpu;
|
||||
s.delta_gpu = l.dh_gpu;
|
||||
backward_connected_layer_gpu(wf, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
|
||||
s.input_gpu = state.input_gpu;
|
||||
s.delta_gpu = state.delta_gpu;
|
||||
backward_connected_layer_gpu(uf, s);
|
||||
|
||||
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||
mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
|
||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||
mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);
|
||||
|
||||
state.input_gpu -= l.inputs*l.batch;
|
||||
if (state.delta_gpu) state.delta_gpu -= l.inputs*l.batch;
|
||||
|
@ -42,7 +42,7 @@ void forward_network_gpu(network net)
|
||||
net.index = i;
|
||||
layer l = net.layers[i];
|
||||
if(l.delta_gpu){
|
||||
fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
|
||||
}
|
||||
l.forward_gpu(l, net);
|
||||
net.input_gpu = l.output_gpu;
|
||||
@ -107,9 +107,9 @@ void harmless_update_network_gpu(network net)
|
||||
int i;
|
||||
for(i = 0; i < net.n; ++i){
|
||||
layer l = net.layers[i];
|
||||
if(l.weight_updates_gpu) fill_ongpu(l.nweights, 0, l.weight_updates_gpu, 1);
|
||||
if(l.bias_updates_gpu) fill_ongpu(l.nbiases, 0, l.bias_updates_gpu, 1);
|
||||
if(l.scale_updates_gpu) fill_ongpu(l.nbiases, 0, l.scale_updates_gpu, 1);
|
||||
if(l.weight_updates_gpu) fill_gpu(l.nweights, 0, l.weight_updates_gpu, 1);
|
||||
if(l.bias_updates_gpu) fill_gpu(l.nbiases, 0, l.bias_updates_gpu, 1);
|
||||
if(l.scale_updates_gpu) fill_gpu(l.nbiases, 0, l.scale_updates_gpu, 1);
|
||||
}
|
||||
}
|
||||
|
||||
@ -383,6 +383,7 @@ float train_networks(network *nets, int n, data d, int interval)
|
||||
|
||||
float sum = 0;
|
||||
for(i = 0; i < n; ++i){
|
||||
nets[i].learning_rate *= n;
|
||||
data p = get_data_part(d, i, n);
|
||||
threads[i] = train_network_in_thread(nets[i], p, errors + i);
|
||||
}
|
||||
|
@ -113,29 +113,29 @@ void forward_normalization_layer_gpu(const layer layer, network net)
|
||||
int w = layer.w;
|
||||
int h = layer.h;
|
||||
int c = layer.c;
|
||||
scal_ongpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1);
|
||||
scal_gpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1);
|
||||
|
||||
for(b = 0; b < layer.batch; ++b){
|
||||
float *squared = layer.squared_gpu + w*h*c*b;
|
||||
float *norms = layer.norms_gpu + w*h*c*b;
|
||||
float *input = net.input_gpu + w*h*c*b;
|
||||
pow_ongpu(w*h*c, 2, input, 1, squared, 1);
|
||||
pow_gpu(w*h*c, 2, input, 1, squared, 1);
|
||||
|
||||
const_ongpu(w*h, layer.kappa, norms, 1);
|
||||
const_gpu(w*h, layer.kappa, norms, 1);
|
||||
for(k = 0; k < layer.size/2; ++k){
|
||||
axpy_ongpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1);
|
||||
axpy_gpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1);
|
||||
}
|
||||
|
||||
for(k = 1; k < layer.c; ++k){
|
||||
copy_ongpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1);
|
||||
copy_gpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1);
|
||||
int prev = k - ((layer.size-1)/2) - 1;
|
||||
int next = k + (layer.size/2);
|
||||
if(prev >= 0) axpy_ongpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1);
|
||||
if(next < layer.c) axpy_ongpu(w*h, layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1);
|
||||
if(prev >= 0) axpy_gpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1);
|
||||
if(next < layer.c) axpy_gpu(w*h, layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1);
|
||||
}
|
||||
}
|
||||
pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1);
|
||||
mul_ongpu(w*h*c*layer.batch, net.input_gpu, 1, layer.output_gpu, 1);
|
||||
pow_gpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1);
|
||||
mul_gpu(w*h*c*layer.batch, net.input_gpu, 1, layer.output_gpu, 1);
|
||||
}
|
||||
|
||||
void backward_normalization_layer_gpu(const layer layer, network net)
|
||||
@ -145,7 +145,7 @@ void backward_normalization_layer_gpu(const layer layer, network net)
|
||||
int w = layer.w;
|
||||
int h = layer.h;
|
||||
int c = layer.c;
|
||||
pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, net.delta_gpu, 1);
|
||||
mul_ongpu(w*h*c*layer.batch, layer.delta_gpu, 1, net.delta_gpu, 1);
|
||||
pow_gpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, net.delta_gpu, 1);
|
||||
mul_gpu(w*h*c*layer.batch, layer.delta_gpu, 1, net.delta_gpu, 1);
|
||||
}
|
||||
#endif
|
||||
|
17
src/parser.c
17
src/parser.c
@ -213,13 +213,11 @@ layer parse_crnn(list *options, size_params params)
|
||||
layer parse_rnn(list *options, size_params params)
|
||||
{
|
||||
int output = option_find_int(options, "output",1);
|
||||
int hidden = option_find_int(options, "hidden",1);
|
||||
char *activation_s = option_find_str(options, "activation", "logistic");
|
||||
ACTIVATION activation = get_activation(activation_s);
|
||||
int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
|
||||
int logistic = option_find_int_quiet(options, "logistic", 0);
|
||||
|
||||
layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize, logistic, params.net.adam);
|
||||
layer l = make_rnn_layer(params.batch, params.inputs, output, params.time_steps, activation, batch_normalize, params.net.adam);
|
||||
|
||||
l.shortcut = option_find_int_quiet(options, "shortcut", 0);
|
||||
|
||||
@ -353,6 +351,7 @@ cost_layer parse_cost(list *options, size_params params)
|
||||
float scale = option_find_float_quiet(options, "scale",1);
|
||||
cost_layer layer = make_cost_layer(params.batch, params.inputs, type, scale);
|
||||
layer.ratio = option_find_float_quiet(options, "ratio",0);
|
||||
layer.noobject_scale = option_find_float_quiet(options, "noobj", 1);
|
||||
layer.thresh = option_find_float_quiet(options, "thresh",0);
|
||||
return layer;
|
||||
}
|
||||
@ -921,12 +920,18 @@ void save_weights_upto(network net, char *filename, int cutoff)
|
||||
save_connected_weights(*(l.uo), fp);
|
||||
save_connected_weights(*(l.ug), fp);
|
||||
} if (l.type == GRU) {
|
||||
if(1){
|
||||
save_connected_weights(*(l.wz), fp);
|
||||
save_connected_weights(*(l.wr), fp);
|
||||
save_connected_weights(*(l.wh), fp);
|
||||
save_connected_weights(*(l.uz), fp);
|
||||
save_connected_weights(*(l.ur), fp);
|
||||
save_connected_weights(*(l.uh), fp);
|
||||
}else{
|
||||
save_connected_weights(*(l.reset_layer), fp);
|
||||
save_connected_weights(*(l.update_layer), fp);
|
||||
save_connected_weights(*(l.state_layer), fp);
|
||||
}
|
||||
} if(l.type == CRNN){
|
||||
save_convolutional_weights(*(l.input_layer), fp);
|
||||
save_convolutional_weights(*(l.self_layer), fp);
|
||||
@ -1132,12 +1137,18 @@ void load_weights_upto(network *net, char *filename, int start, int cutoff)
|
||||
load_connected_weights(*(l.ug), fp, transpose);
|
||||
}
|
||||
if (l.type == GRU) {
|
||||
if(1){
|
||||
load_connected_weights(*(l.wz), fp, transpose);
|
||||
load_connected_weights(*(l.wr), fp, transpose);
|
||||
load_connected_weights(*(l.wh), fp, transpose);
|
||||
load_connected_weights(*(l.uz), fp, transpose);
|
||||
load_connected_weights(*(l.ur), fp, transpose);
|
||||
load_connected_weights(*(l.uh), fp, transpose);
|
||||
}else{
|
||||
load_connected_weights(*(l.reset_layer), fp, transpose);
|
||||
load_connected_weights(*(l.update_layer), fp, transpose);
|
||||
load_connected_weights(*(l.state_layer), fp, transpose);
|
||||
}
|
||||
}
|
||||
if(l.type == LOCAL){
|
||||
int locations = l.out_w*l.out_h;
|
||||
|
@ -434,14 +434,14 @@ void get_region_boxes(layer l, int w, int h, int netw, int neth, float thresh, f
|
||||
|
||||
void forward_region_layer_gpu(const layer l, network net)
|
||||
{
|
||||
copy_ongpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
|
||||
copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
|
||||
int b, n;
|
||||
for (b = 0; b < l.batch; ++b){
|
||||
for(n = 0; n < l.n; ++n){
|
||||
int index = entry_index(l, b, n*l.w*l.h, 0);
|
||||
activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC);
|
||||
activate_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC);
|
||||
index = entry_index(l, b, n*l.w*l.h, l.coords);
|
||||
if(!l.background) activate_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC);
|
||||
if(!l.background) activate_array_gpu(l.output_gpu + index, l.w*l.h, LOGISTIC);
|
||||
}
|
||||
}
|
||||
if (l.softmax_tree){
|
||||
@ -545,12 +545,12 @@ void backward_region_layer_gpu(const layer l, network net)
|
||||
for (b = 0; b < l.batch; ++b){
|
||||
for(n = 0; n < l.n; ++n){
|
||||
int index = entry_index(l, b, n*l.w*l.h, 0);
|
||||
gradient_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index);
|
||||
gradient_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index);
|
||||
index = entry_index(l, b, n*l.w*l.h, l.coords);
|
||||
if(!l.background) gradient_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index);
|
||||
if(!l.background) gradient_array_gpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index);
|
||||
}
|
||||
}
|
||||
axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -136,18 +136,18 @@ void forward_reorg_layer_gpu(layer l, network net)
|
||||
int i;
|
||||
if(l.flatten){
|
||||
if(l.reverse){
|
||||
flatten_ongpu(net.input_gpu, l.w*l.h, l.c, l.batch, 0, l.output_gpu);
|
||||
flatten_gpu(net.input_gpu, l.w*l.h, l.c, l.batch, 0, l.output_gpu);
|
||||
}else{
|
||||
flatten_ongpu(net.input_gpu, l.w*l.h, l.c, l.batch, 1, l.output_gpu);
|
||||
flatten_gpu(net.input_gpu, l.w*l.h, l.c, l.batch, 1, l.output_gpu);
|
||||
}
|
||||
} else if (l.extra) {
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
copy_ongpu(l.inputs, net.input_gpu + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1);
|
||||
copy_gpu(l.inputs, net.input_gpu + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1);
|
||||
}
|
||||
} else if (l.reverse) {
|
||||
reorg_ongpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu);
|
||||
reorg_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu);
|
||||
}else {
|
||||
reorg_ongpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu);
|
||||
reorg_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu);
|
||||
}
|
||||
}
|
||||
|
||||
@ -155,19 +155,19 @@ void backward_reorg_layer_gpu(layer l, network net)
|
||||
{
|
||||
if(l.flatten){
|
||||
if(l.reverse){
|
||||
flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, net.delta_gpu);
|
||||
flatten_gpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, net.delta_gpu);
|
||||
}else{
|
||||
flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, net.delta_gpu);
|
||||
flatten_gpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, net.delta_gpu);
|
||||
}
|
||||
} else if (l.extra) {
|
||||
int i;
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
copy_ongpu(l.inputs, l.delta_gpu + i*l.outputs, 1, net.delta_gpu + i*l.inputs, 1);
|
||||
copy_gpu(l.inputs, l.delta_gpu + i*l.outputs, 1, net.delta_gpu + i*l.inputs, 1);
|
||||
}
|
||||
} else if(l.reverse){
|
||||
reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta_gpu);
|
||||
reorg_gpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta_gpu);
|
||||
} else {
|
||||
reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta_gpu);
|
||||
reorg_gpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta_gpu);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
107
src/rnn_layer.c
107
src/rnn_layer.c
@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps)
|
||||
#endif
|
||||
}
|
||||
|
||||
layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log, int adam)
|
||||
layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam)
|
||||
{
|
||||
fprintf(stderr, "RNN Layer: %d inputs, %d outputs\n", inputs, outputs);
|
||||
batch = batch / steps;
|
||||
@ -34,24 +34,24 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
|
||||
l.batch = batch;
|
||||
l.type = RNN;
|
||||
l.steps = steps;
|
||||
l.hidden = hidden;
|
||||
l.inputs = inputs;
|
||||
|
||||
l.state = calloc(batch*hidden*(steps+1), sizeof(float));
|
||||
l.state = calloc(batch*outputs, sizeof(float));
|
||||
l.prev_state = calloc(batch*outputs, sizeof(float));
|
||||
|
||||
l.input_layer = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize, adam);
|
||||
*(l.input_layer) = make_connected_layer(batch*steps, inputs, outputs, activation, batch_normalize, adam);
|
||||
l.input_layer->batch = batch;
|
||||
|
||||
l.self_layer = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize, adam);
|
||||
*(l.self_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam);
|
||||
l.self_layer->batch = batch;
|
||||
|
||||
l.output_layer = malloc(sizeof(layer));
|
||||
fprintf(stderr, "\t\t");
|
||||
*(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize, adam);
|
||||
*(l.output_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam);
|
||||
l.output_layer->batch = batch;
|
||||
|
||||
l.outputs = outputs;
|
||||
@ -65,9 +65,15 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
|
||||
l.forward_gpu = forward_rnn_layer_gpu;
|
||||
l.backward_gpu = backward_rnn_layer_gpu;
|
||||
l.update_gpu = update_rnn_layer_gpu;
|
||||
l.state_gpu = cuda_make_array(l.state, batch*hidden*(steps+1));
|
||||
l.state_gpu = cuda_make_array(0, batch*outputs);
|
||||
l.prev_state_gpu = cuda_make_array(0, batch*outputs);
|
||||
l.output_gpu = l.output_layer->output_gpu;
|
||||
l.delta_gpu = l.output_layer->delta_gpu;
|
||||
#ifdef CUDNN
|
||||
cudnnSetTensor4dDescriptor(l.input_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.input_layer->out_c, l.input_layer->out_h, l.input_layer->out_w);
|
||||
cudnnSetTensor4dDescriptor(l.self_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.self_layer->out_c, l.self_layer->out_h, l.self_layer->out_w);
|
||||
cudnnSetTensor4dDescriptor(l.output_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.output_layer->out_c, l.output_layer->out_h, l.output_layer->out_w);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return l;
|
||||
@ -90,9 +96,9 @@ void forward_rnn_layer(layer l, network net)
|
||||
layer output_layer = *(l.output_layer);
|
||||
|
||||
fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
|
||||
fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
|
||||
fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
|
||||
if(net.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
|
||||
fill_cpu(l.outputs * l.batch * l.steps, 0, self_layer.delta, 1);
|
||||
fill_cpu(l.outputs * l.batch * l.steps, 0, input_layer.delta, 1);
|
||||
if(net.train) fill_cpu(l.outputs * l.batch, 0, l.state, 1);
|
||||
|
||||
for (i = 0; i < l.steps; ++i) {
|
||||
s.input = net.input;
|
||||
@ -102,14 +108,14 @@ void forward_rnn_layer(layer l, network net)
|
||||
forward_connected_layer(self_layer, s);
|
||||
|
||||
float *old_state = l.state;
|
||||
if(net.train) l.state += l.hidden*l.batch;
|
||||
if(net.train) l.state += l.outputs*l.batch;
|
||||
if(l.shortcut){
|
||||
copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);
|
||||
copy_cpu(l.outputs * l.batch, old_state, 1, l.state, 1);
|
||||
}else{
|
||||
fill_cpu(l.hidden * l.batch, 0, l.state, 1);
|
||||
fill_cpu(l.outputs * l.batch, 0, l.state, 1);
|
||||
}
|
||||
axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1);
|
||||
axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
|
||||
axpy_cpu(l.outputs * l.batch, 1, input_layer.output, 1, l.state, 1);
|
||||
axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1);
|
||||
|
||||
s.input = l.state;
|
||||
forward_connected_layer(output_layer, s);
|
||||
@ -134,32 +140,32 @@ void backward_rnn_layer(layer l, network net)
|
||||
increment_layer(&self_layer, l.steps-1);
|
||||
increment_layer(&output_layer, l.steps-1);
|
||||
|
||||
l.state += l.hidden*l.batch*l.steps;
|
||||
l.state += l.outputs*l.batch*l.steps;
|
||||
for (i = l.steps-1; i >= 0; --i) {
|
||||
copy_cpu(l.hidden * l.batch, input_layer.output, 1, l.state, 1);
|
||||
axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
|
||||
copy_cpu(l.outputs * l.batch, input_layer.output, 1, l.state, 1);
|
||||
axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1);
|
||||
|
||||
s.input = l.state;
|
||||
s.delta = self_layer.delta;
|
||||
backward_connected_layer(output_layer, s);
|
||||
|
||||
l.state -= l.hidden*l.batch;
|
||||
l.state -= l.outputs*l.batch;
|
||||
/*
|
||||
if(i > 0){
|
||||
copy_cpu(l.hidden * l.batch, input_layer.output - l.hidden*l.batch, 1, l.state, 1);
|
||||
axpy_cpu(l.hidden * l.batch, 1, self_layer.output - l.hidden*l.batch, 1, l.state, 1);
|
||||
copy_cpu(l.outputs * l.batch, input_layer.output - l.outputs*l.batch, 1, l.state, 1);
|
||||
axpy_cpu(l.outputs * l.batch, 1, self_layer.output - l.outputs*l.batch, 1, l.state, 1);
|
||||
}else{
|
||||
fill_cpu(l.hidden * l.batch, 0, l.state, 1);
|
||||
fill_cpu(l.outputs * l.batch, 0, l.state, 1);
|
||||
}
|
||||
*/
|
||||
|
||||
s.input = l.state;
|
||||
s.delta = self_layer.delta - l.hidden*l.batch;
|
||||
s.delta = self_layer.delta - l.outputs*l.batch;
|
||||
if (i == 0) s.delta = 0;
|
||||
backward_connected_layer(self_layer, s);
|
||||
|
||||
copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1);
|
||||
if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1);
|
||||
copy_cpu(l.outputs*l.batch, self_layer.delta, 1, input_layer.delta, 1);
|
||||
if (i > 0 && l.shortcut) axpy_cpu(l.outputs*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.outputs*l.batch, 1);
|
||||
s.input = net.input + i*l.inputs*l.batch;
|
||||
if(net.delta) s.delta = net.delta + i*l.inputs*l.batch;
|
||||
else s.delta = 0;
|
||||
@ -196,17 +202,21 @@ void update_rnn_layer_gpu(layer l, update_args a)
|
||||
|
||||
void forward_rnn_layer_gpu(layer l, network net)
|
||||
{
|
||||
network s = net;
|
||||
network s = {0};
|
||||
s.train = net.train;
|
||||
int i;
|
||||
layer input_layer = *(l.input_layer);
|
||||
layer self_layer = *(l.self_layer);
|
||||
layer output_layer = *(l.output_layer);
|
||||
|
||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
|
||||
fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
|
||||
fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
|
||||
if(net.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
|
||||
|
||||
if(net.train) {
|
||||
fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
|
||||
}
|
||||
|
||||
for (i = 0; i < l.steps; ++i) {
|
||||
s.input_gpu = net.input_gpu;
|
||||
@ -215,15 +225,9 @@ void forward_rnn_layer_gpu(layer l, network net)
|
||||
s.input_gpu = l.state_gpu;
|
||||
forward_connected_layer_gpu(self_layer, s);
|
||||
|
||||
float *old_state = l.state_gpu;
|
||||
if(net.train) l.state_gpu += l.hidden*l.batch;
|
||||
if(l.shortcut){
|
||||
copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
|
||||
}else{
|
||||
fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
|
||||
}
|
||||
axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
|
||||
axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
|
||||
s.input_gpu = l.state_gpu;
|
||||
forward_connected_layer_gpu(output_layer, s);
|
||||
@ -237,7 +241,7 @@ void forward_rnn_layer_gpu(layer l, network net)
|
||||
|
||||
void backward_rnn_layer_gpu(layer l, network net)
|
||||
{
|
||||
network s = net;
|
||||
network s = {0};
|
||||
s.train = net.train;
|
||||
int i;
|
||||
layer input_layer = *(l.input_layer);
|
||||
@ -246,24 +250,32 @@ void backward_rnn_layer_gpu(layer l, network net)
|
||||
increment_layer(&input_layer, l.steps - 1);
|
||||
increment_layer(&self_layer, l.steps - 1);
|
||||
increment_layer(&output_layer, l.steps - 1);
|
||||
l.state_gpu += l.hidden*l.batch*l.steps;
|
||||
float *last_input = input_layer.output_gpu;
|
||||
float *last_self = self_layer.output_gpu;
|
||||
for (i = l.steps-1; i >= 0; --i) {
|
||||
fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
|
||||
axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
|
||||
|
||||
s.input_gpu = l.state_gpu;
|
||||
s.delta_gpu = self_layer.delta_gpu;
|
||||
backward_connected_layer_gpu(output_layer, s);
|
||||
|
||||
l.state_gpu -= l.hidden*l.batch;
|
||||
if(i != 0) {
|
||||
fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
|
||||
axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
|
||||
axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
|
||||
}else {
|
||||
copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1);
|
||||
}
|
||||
|
||||
copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
|
||||
|
||||
s.input_gpu = l.state_gpu;
|
||||
s.delta_gpu = self_layer.delta_gpu - l.hidden*l.batch;
|
||||
s.delta_gpu = (i > 0) ? self_layer.delta_gpu - l.outputs*l.batch : 0;
|
||||
if (i == 0) s.delta_gpu = 0;
|
||||
backward_connected_layer_gpu(self_layer, s);
|
||||
|
||||
//copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
|
||||
if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
|
||||
s.input_gpu = net.input_gpu + i*l.inputs*l.batch;
|
||||
if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch;
|
||||
else s.delta_gpu = 0;
|
||||
@ -273,5 +285,8 @@ void backward_rnn_layer_gpu(layer l, network net)
|
||||
increment_layer(&self_layer, -1);
|
||||
increment_layer(&output_layer, -1);
|
||||
}
|
||||
fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
|
||||
axpy_gpu(l.outputs * l.batch, 1, last_input, 1, l.state_gpu, 1);
|
||||
axpy_gpu(l.outputs * l.batch, 1, last_self, 1, l.state_gpu, 1);
|
||||
}
|
||||
#endif
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include "network.h"
|
||||
#define USET
|
||||
|
||||
layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log, int adam);
|
||||
layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam);
|
||||
|
||||
void forward_rnn_layer(layer l, network net);
|
||||
void backward_rnn_layer(layer l, network net);
|
||||
|
@ -111,7 +111,7 @@ void forward_route_layer_gpu(const route_layer l, network net)
|
||||
float *input = net.layers[index].output_gpu;
|
||||
int input_size = l.input_sizes[i];
|
||||
for(j = 0; j < l.batch; ++j){
|
||||
copy_ongpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1);
|
||||
copy_gpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1);
|
||||
}
|
||||
offset += input_size;
|
||||
}
|
||||
@ -126,7 +126,7 @@ void backward_route_layer_gpu(const route_layer l, network net)
|
||||
float *delta = net.layers[index].delta_gpu;
|
||||
int input_size = l.input_sizes[i];
|
||||
for(j = 0; j < l.batch; ++j){
|
||||
axpy_ongpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
|
||||
axpy_gpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
|
||||
}
|
||||
offset += input_size;
|
||||
}
|
||||
|
@ -55,15 +55,15 @@ void backward_shortcut_layer(const layer l, network net)
|
||||
#ifdef GPU
|
||||
void forward_shortcut_layer_gpu(const layer l, network net)
|
||||
{
|
||||
copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
|
||||
copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
|
||||
shortcut_gpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu);
|
||||
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
}
|
||||
|
||||
void backward_shortcut_layer_gpu(const layer l, network net)
|
||||
{
|
||||
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
axpy_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1, net.delta_gpu, 1);
|
||||
shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, net.layers[l.index].delta_gpu);
|
||||
}
|
||||
#endif
|
||||
|
@ -81,7 +81,7 @@ void forward_softmax_layer_gpu(const softmax_layer l, network net)
|
||||
|
||||
void backward_softmax_layer_gpu(const softmax_layer layer, network net)
|
||||
{
|
||||
axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta_gpu, 1);
|
||||
axpy_gpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta_gpu, 1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user