🔥 🐛 🔥

This commit is contained in:
Joseph Redmon 2017-06-18 13:05:37 -07:00
parent 1467621453
commit 8215a8864d
42 changed files with 699 additions and 497 deletions

View File

@ -1,6 +1,6 @@
GPU=1
CUDNN=1
OPENCV=1
GPU=0
CUDNN=0
OPENCV=0
DEBUG=0
ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \

View File

@ -1,23 +1,24 @@
[net]
inputs=256
# Test
batch = 1
time_steps=1
# Train
# batch = 512
# time_steps=64
subdivisions=1
batch = 256
inputs=256
momentum=0.9
decay=0.001
learning_rate=0.1
decay=0.0
time_steps=128
learning_rate=.002
adam=1
burn_in=100
policy=poly
policy=constant
power=4
max_batches=10000
max_batches=400000
[gru]
batch_normalize=1
output = 1024
[gru]
batch_normalize=1
output = 1024
[gru]
batch_normalize=1

View File

@ -112,6 +112,26 @@ void operations(char *cfgfile)
ops += 2l * l.n * l.size*l.size*l.c * l.out_h*l.out_w;
} else if(l.type == CONNECTED){
ops += 2l * l.inputs * l.outputs;
} else if (l.type == RNN){
ops += 2l * l.input_layer->inputs * l.input_layer->outputs;
ops += 2l * l.self_layer->inputs * l.self_layer->outputs;
ops += 2l * l.output_layer->inputs * l.output_layer->outputs;
} else if (l.type == GRU){
ops += 2l * l.uz->inputs * l.uz->outputs;
ops += 2l * l.uh->inputs * l.uh->outputs;
ops += 2l * l.ur->inputs * l.ur->outputs;
ops += 2l * l.wz->inputs * l.wz->outputs;
ops += 2l * l.wh->inputs * l.wh->outputs;
ops += 2l * l.wr->inputs * l.wr->outputs;
} else if (l.type == LSTM){
ops += 2l * l.uf->inputs * l.uf->outputs;
ops += 2l * l.ui->inputs * l.ui->outputs;
ops += 2l * l.ug->inputs * l.ug->outputs;
ops += 2l * l.uo->inputs * l.uo->outputs;
ops += 2l * l.wf->inputs * l.wf->outputs;
ops += 2l * l.wi->inputs * l.wi->outputs;
ops += 2l * l.wg->inputs * l.wg->outputs;
ops += 2l * l.wo->inputs * l.wo->outputs;
}
}
printf("Floating Point Operations: %ld\n", ops);

View File

@ -64,7 +64,7 @@ void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg
int ax_size = anet.inputs*anet.batch;
int ay_size = anet.truths*anet.batch;
fill_ongpu(ay_size, .9, anet.truth_gpu, 1);
fill_gpu(ay_size, .9, anet.truth_gpu, 1);
anet.delta_gpu = cuda_make_array(0, ax_size);
anet.train = 1;
@ -102,36 +102,36 @@ void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg
forward_network_gpu(fnet, fstate);
float *feats = fnet.layers[fnet.n - 2].output_gpu;
copy_ongpu(y_size, feats, 1, fstate.truth, 1);
copy_gpu(y_size, feats, 1, fstate.truth, 1);
forward_network_gpu(gnet, gstate);
float *gen = gnet.layers[gnet.n-1].output_gpu;
copy_ongpu(x_size, gen, 1, fstate.input, 1);
copy_gpu(x_size, gen, 1, fstate.input, 1);
fill_ongpu(x_size, 0, fstate.delta, 1);
fill_gpu(x_size, 0, fstate.delta, 1);
forward_network_gpu(fnet, fstate);
backward_network_gpu(fnet, fstate);
//HERE
astate.input = gen;
fill_ongpu(ax_size, 0, astate.delta, 1);
fill_gpu(ax_size, 0, astate.delta, 1);
forward_network_gpu(anet, astate);
backward_network_gpu(anet, astate);
float *delta = imlayer.delta_gpu;
fill_ongpu(x_size, 0, delta, 1);
scal_ongpu(x_size, 100, astate.delta, 1);
scal_ongpu(x_size, .001, fstate.delta, 1);
axpy_ongpu(x_size, 1, fstate.delta, 1, delta, 1);
axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1);
fill_gpu(x_size, 0, delta, 1);
scal_gpu(x_size, 100, astate.delta, 1);
scal_gpu(x_size, .001, fstate.delta, 1);
axpy_gpu(x_size, 1, fstate.delta, 1, delta, 1);
axpy_gpu(x_size, 1, astate.delta, 1, delta, 1);
//fill_ongpu(x_size, 0, delta, 1);
//fill_gpu(x_size, 0, delta, 1);
//cuda_push_array(delta, X, x_size);
//axpy_ongpu(x_size, -1, imlayer.output_gpu, 1, delta, 1);
//axpy_gpu(x_size, -1, imlayer.output_gpu, 1, delta, 1);
//printf("pix error: %f\n", cuda_mag_array(delta, x_size));
printf("fea error: %f\n", cuda_mag_array(fstate.delta, x_size));
printf("adv error: %f\n", cuda_mag_array(astate.delta, x_size));
//axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1);
//axpy_gpu(x_size, 1, astate.delta, 1, delta, 1);
backward_network_gpu(gnet, gstate);
@ -273,7 +273,7 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear
float *imerror = cuda_make_array(0, imlayer.outputs);
float *ones_gpu = cuda_make_array(0, ay_size);
fill_ongpu(ay_size, .9, ones_gpu, 1);
fill_gpu(ay_size, .9, ones_gpu, 1);
float aloss_avg = -1;
float gloss_avg = -1;
@ -318,23 +318,23 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear
*net.seen += net.batch;
forward_network_gpu(net, gstate);
fill_ongpu(imlayer.outputs, 0, imerror, 1);
fill_gpu(imlayer.outputs, 0, imerror, 1);
astate.input = imlayer.output_gpu;
astate.delta = imerror;
astate.truth = ones_gpu;
forward_network_gpu(anet, astate);
backward_network_gpu(anet, astate);
scal_ongpu(imlayer.outputs, .1, net.layers[net.n-1].delta_gpu, 1);
scal_gpu(imlayer.outputs, .1, net.layers[net.n-1].delta_gpu, 1);
backward_network_gpu(net, gstate);
scal_ongpu(imlayer.outputs, 1000, imerror, 1);
scal_gpu(imlayer.outputs, 1000, imerror, 1);
printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));
printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs));
axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
axpy_gpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
gloss += get_network_cost(net) /(net.subdivisions*net.batch);
@ -533,9 +533,9 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
*gnet.seen += gnet.batch;
forward_network_gpu(gnet);
fill_ongpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
fill_ongpu(anet.truths*anet.batch, .95, anet.truth_gpu, 1);
copy_ongpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1);
fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
fill_gpu(anet.truths*anet.batch, .95, anet.truth_gpu, 1);
copy_gpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1);
anet.delta_gpu = imerror;
forward_network_gpu(anet);
backward_network_gpu(anet);
@ -543,13 +543,13 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear,
float genaloss = *anet.cost / anet.batch;
printf("%f\n", genaloss);
scal_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
scal_ongpu(imlayer.outputs*imlayer.batch, .00, gnet.layers[gnet.n-1].delta_gpu, 1);
scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
scal_gpu(imlayer.outputs*imlayer.batch, .00, gnet.layers[gnet.n-1].delta_gpu, 1);
printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
printf("features %f\n", cuda_mag_array(gnet.layers[gnet.n-1].delta_gpu, imlayer.outputs*imlayer.batch));
axpy_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet.layers[gnet.n-1].delta_gpu, 1);
axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet.layers[gnet.n-1].delta_gpu, 1);
backward_network_gpu(gnet);
@ -716,21 +716,21 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle
*net.seen += net.batch;
forward_network_gpu(net);
fill_ongpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
copy_ongpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1);
fill_ongpu(anet.inputs*anet.batch, .95, anet.truth_gpu, 1);
fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
copy_gpu(anet.inputs*anet.batch, imlayer.output_gpu, 1, anet.input_gpu, 1);
fill_gpu(anet.inputs*anet.batch, .95, anet.truth_gpu, 1);
anet.delta_gpu = imerror;
forward_network_gpu(anet);
backward_network_gpu(anet);
scal_ongpu(imlayer.outputs*imlayer.batch, 1./100., net.layers[net.n-1].delta_gpu, 1);
scal_gpu(imlayer.outputs*imlayer.batch, 1./100., net.layers[net.n-1].delta_gpu, 1);
scal_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs*imlayer.batch));
axpy_ongpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net.layers[net.n-1].delta_gpu, 1);
axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, net.layers[net.n-1].delta_gpu, 1);
backward_network_gpu(net);
@ -876,7 +876,7 @@ void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfi
float *imerror = cuda_make_array(0, imlayer.outputs);
float *ones_gpu = cuda_make_array(0, ay_size);
fill_ongpu(ay_size, 1, ones_gpu, 1);
fill_gpu(ay_size, 1, ones_gpu, 1);
float aloss_avg = -1;
float gloss_avg = -1;
@ -902,15 +902,15 @@ void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfi
*net.seen += net.batch;
forward_network_gpu(net, gstate);
fill_ongpu(imlayer.outputs, 0, imerror, 1);
fill_gpu(imlayer.outputs, 0, imerror, 1);
astate.input = imlayer.output_gpu;
astate.delta = imerror;
astate.truth = ones_gpu;
forward_network_gpu(anet, astate);
backward_network_gpu(anet, astate);
scal_ongpu(imlayer.outputs, 1, imerror, 1);
axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
scal_gpu(imlayer.outputs, 1, imerror, 1);
axpy_gpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
backward_network_gpu(net, gstate);

View File

@ -50,7 +50,7 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa
cuda_push_array(net->input_gpu, im.data, net->inputs);
forward_network_gpu(*net);
copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);
copy_gpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);
cuda_pull_array(last.delta_gpu, last.delta, last.outputs);
calculate_loss(last.delta, last.delta, last.outputs, thresh);
@ -141,7 +141,7 @@ void reconstruct_picture(network net, float *features, image recon, image update
forward_network_gpu(net);
cuda_push_array(l.delta_gpu, features, l.outputs);
axpy_ongpu(l.outputs, -1, l.output_gpu, 1, l.delta_gpu, 1);
axpy_gpu(l.outputs, -1, l.output_gpu, 1, l.delta_gpu, 1);
backward_network_gpu(net);
cuda_pull_array(net.delta_gpu, delta.data, delta.w*delta.h*delta.c);

View File

@ -114,7 +114,10 @@ void reset_rnn_state(network net, int b)
#ifdef GPU
layer l = net.layers[i];
if(l.state_gpu){
fill_ongpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
fill_gpu(l.outputs, 0, l.state_gpu + l.outputs*b, 1);
}
if(l.h_gpu){
fill_gpu(l.outputs, 0, l.h_gpu + l.outputs*b, 1);
}
#endif
}

View File

@ -27,6 +27,11 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
}
srand(time(0));
network net = nets[0];
image pred = get_network_image(net);
int div = net.w/pred.w;
assert(pred.w * div == net.w);
assert(pred.h * div == net.h);
int imgs = net.batch * net.subdivisions * ngpus;
@ -46,6 +51,7 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
args.w = net.w;
args.h = net.h;
args.threads = 32;
args.scale = div;
args.min = net.min_crop;
args.max = net.max_crop;
@ -75,15 +81,6 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
pthread_join(load_thread, 0);
train = buffer;
load_thread = load_data(args);
image tr = float_to_image(net.w, net.h, 81, train.y.vals[0]);
image im = float_to_image(net.w, net.h, net.c, train.X.vals[0]);
image mask = mask_to_rgb(tr);
show_image(im, "input");
show_image(mask, "truth");
#ifdef OPENCV
cvWaitKey(100);
#endif
free_image(mask);
printf("Loaded: %lf seconds\n", sec(clock()-time));
time=clock();
@ -98,6 +95,20 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
#else
loss = train_network(net, train);
#endif
if(1){
image tr = float_to_image(net.w/div, net.h/div, 80, train.y.vals[net.batch]);
image im = float_to_image(net.w, net.h, net.c, train.X.vals[net.batch]);
image mask = mask_to_rgb(tr);
image prmask = mask_to_rgb(pred);
show_image(im, "input");
show_image(prmask, "pred");
show_image(mask, "truth");
#ifdef OPENCV
cvWaitKey(100);
#endif
free_image(mask);
free_image(prmask);
}
if(avg_loss == -1) avg_loss = loss;
avg_loss = avg_loss*.9 + loss*.1;
printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);

View File

@ -84,7 +84,7 @@ typedef enum {
} LAYER_TYPE;
typedef enum{
SSE, MASKED, L1, SMOOTH
SSE, MASKED, L1, SEG, SMOOTH
} COST_TYPE;
typedef struct{
@ -203,6 +203,8 @@ struct layer{
float * forgot_state;
float * forgot_delta;
float * state_delta;
float * combine_cpu;
float * combine_delta_cpu;
float * concat;
float * concat_delta;
@ -271,6 +273,10 @@ struct layer{
struct layer *self_layer;
struct layer *output_layer;
struct layer *reset_layer;
struct layer *update_layer;
struct layer *state_layer;
struct layer *input_gate_layer;
struct layer *state_gate_layer;
struct layer *input_save_layer;
@ -335,6 +341,9 @@ struct layer{
float *bias_v_gpu;
float *scale_v_gpu;
float * combine_gpu;
float * combine_delta_gpu;
float * prev_state_gpu;
float * forgot_state_gpu;
float * forgot_delta_gpu;
@ -575,10 +584,10 @@ void normalize_cpu(float *x, float *mean, float *variance, int batch, int filter
int best_3d_shift_r(image a, image b, int min, int max);
#ifdef GPU
void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
void fill_ongpu(int N, float ALPHA, float * X, int INCX);
void scal_ongpu(int N, float ALPHA, float * X, int INCX);
void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
void fill_gpu(int N, float ALPHA, float * X, int INCX);
void scal_gpu(int N, float ALPHA, float * X, int INCX);
void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
void cuda_set_device(int n);
void cuda_free(float *x_gpu);

View File

@ -152,13 +152,13 @@ __global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delt
if(i < n) delta[i] *= gradient_kernel(x[i], a);
}
extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a)
// Launch the per-element activation kernel over x[0..n); presumably applies
// activation a in place — kernel body not visible in this hunk, confirm there.
extern "C" void activate_array_gpu(float *x, int n, ACTIVATION a)
{
// One thread per element; cuda_gridsize(n) chooses a grid covering n threads of BLOCK each.
activate_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a);
// Report launch-configuration errors; peek does not clear the sticky error.
check_error(cudaPeekAtLastError());
}
extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta)
extern "C" void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta)
{
gradient_array_kernel<<<cuda_gridsize(n), BLOCK>>>(x, n, a, delta);
check_error(cudaPeekAtLastError());

View File

@ -51,13 +51,13 @@ void backward_activation_layer(layer l, network net)
void forward_activation_layer_gpu(layer l, network net)
{
copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
void backward_activation_layer_gpu(layer l, network net)
{
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif

View File

@ -12,8 +12,8 @@ float gradient(float x, ACTIVATION a);
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
void activate_array(float *x, const int n, const ACTIVATION a);
#ifdef GPU
void activate_array_ongpu(float *x, int n, ACTIVATION a);
void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
void activate_array_gpu(float *x, int n, ACTIVATION a);
void gradient_array_gpu(float *x, int n, ACTIVATION a, float *delta);
#endif
static inline float stair_activate(float x)

View File

@ -188,8 +188,8 @@ void push_batchnorm_layer(layer l)
void forward_batchnorm_layer_gpu(layer l, network net)
{
if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
if (net.train) {
#ifdef CUDNN
float one = 1;
@ -215,14 +215,14 @@ void forward_batchnorm_layer_gpu(layer l, network net)
fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
scal_ongpu(l.out_c, .99, l.rolling_mean_gpu, 1);
axpy_ongpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
scal_ongpu(l.out_c, .99, l.rolling_variance_gpu, 1);
axpy_ongpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
scal_gpu(l.out_c, .99, l.rolling_mean_gpu, 1);
axpy_gpu(l.out_c, .01, l.mean_gpu, 1, l.rolling_mean_gpu, 1);
scal_gpu(l.out_c, .99, l.rolling_variance_gpu, 1);
axpy_gpu(l.out_c, .01, l.variance_gpu, 1, l.rolling_variance_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
@ -263,7 +263,7 @@ void backward_batchnorm_layer_gpu(layer l, network net)
.00001,
l.mean_gpu,
l.variance_gpu);
copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
#else
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
@ -274,6 +274,6 @@ void backward_batchnorm_layer_gpu(layer l, network net)
fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
#endif
if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
if(l.type == BATCHNORM) copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif

View File

@ -55,6 +55,16 @@ void weighted_sum_cpu(float *a, float *b, float *s, int n, float *c)
}
}
/* Backward pass for an elementwise weighted sum c[i] = s[i]*a[i] + (1-s[i])*b[i]:
 * accumulate (+=) the gradients of a, b and the mixing weight s from the
 * upstream gradient dc over n elements.
 * da and db may be NULL when that input needs no gradient; ds is required. */
void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc)
{
    int k;
    for (k = 0; k < n; ++k) {
        float g = dc[k];                 /* upstream gradient for element k */
        if (da) da[k] += g * s[k];
        if (db) db[k] += g * (1 - s[k]);
        ds[k] += g * (a[k] - b[k]);      /* d/ds of s*a + (1-s)*b */
    }
}
void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
{
int stride = w1/w2;
@ -162,12 +172,48 @@ void fill_cpu(int N, float ALPHA, float *X, int INCX)
for(i = 0; i < N; ++i) X[i*INCX] = ALPHA;
}
/* Scatter-add an interleaved buffer back into its two source arrays.
 * OUT holds B batch rows, each laid out as [NX floats for X | NY floats for Y];
 * values are accumulated (+=) into X and Y (gradient de-interleave).
 * X or Y may be NULL to skip that half; the read cursor still advances. */
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
{
    int b, k;
    float *src = OUT;
    for (b = 0; b < B; ++b) {
        for (k = 0; k < NX; ++k, ++src) {
            if (X) X[b*NX + k] += *src;
        }
        for (k = 0; k < NY; ++k, ++src) {
            if (Y) Y[b*NY + k] += *src;
        }
    }
}
/* Interleave two batched arrays into OUT.
 * For each of the B batch rows, write the NX floats of X followed by the
 * NY floats of Y, so each OUT row is [X row | Y row]. */
void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
{
    int b, k;
    float *dst = OUT;
    for (b = 0; b < B; ++b) {
        for (k = 0; k < NX; ++k) *dst++ = X[b*NX + k];
        for (k = 0; k < NY; ++k) *dst++ = Y[b*NY + k];
    }
}
/* BLAS-style strided copy: Y[i*INCY] = X[i*INCX] for i in [0, N). */
void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
{
    float *src = X;
    float *dst = Y;
    int k;
    for (k = 0; k < N; ++k, src += INCX, dst += INCY) {
        *dst = *src;
    }
}
/* Fused multiply-accumulate over N contiguous elements: Z[i] += X[i] * Y[i]. */
void mult_add_into_cpu(int N, float *X, float *Y, float *Z)
{
    int k;
    for (k = 0; k < N; ++k) {
        Z[k] += X[k] * Y[k];
    }
}
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
{
int i;

View File

@ -10,8 +10,12 @@ void reorg_cpu(float *x, int w, int h, int c, int batch, int stride, int forward
void test_blas();
void inter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
void deinter_cpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
void mult_add_into_cpu(int N, float *X, float *Y, float *Z);
void const_cpu(int N, float ALPHA, float *X, int INCX);
void constrain_ongpu(int N, float ALPHA, float * X, int INCX);
void constrain_gpu(int N, float ALPHA, float * X, int INCX);
void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void mul_cpu(int N, float *X, int INCX, float *Y, int INCY);
@ -33,6 +37,7 @@ void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);
void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc);
void softmax(float *input, int n, float temp, int stride, float *output);
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
@ -41,16 +46,17 @@ void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, i
#include "cuda.h"
#include "tree.h"
void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void add_ongpu(int N, float ALPHA, float * X, int INCX);
void supp_ongpu(int N, float ALPHA, float * X, int INCX);
void mask_ongpu(int N, float * X, float mask_num, float * mask);
void const_ongpu(int N, float ALPHA, float *X, int INCX);
void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void mul_ongpu(int N, float *X, int INCX, float *Y, int INCY);
void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void copy_gpu(int N, float * X, int INCX, float * Y, int INCY);
void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
void add_gpu(int N, float ALPHA, float * X, int INCX);
void supp_gpu(int N, float ALPHA, float * X, int INCX);
void mask_gpu(int N, float * X, float mask_num, float * mask);
void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale);
void const_gpu(int N, float ALPHA, float *X, int INCX);
void pow_gpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
void mul_gpu(int N, float *X, int INCX, float *Y, int INCY);
void mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
@ -76,14 +82,16 @@ void l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc);
void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c);
void mult_add_into_gpu(int num, float *a, float *b, float *c);
void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT);
void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);
void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);
void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out);
void softmax_tree(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier);
#endif

View File

@ -176,16 +176,16 @@ extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2
extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t)
{
scal_ongpu(n, B1, m, 1);
scal_ongpu(n, B2, v, 1);
axpy_ongpu(n, -decay*batch, w, 1, d, 1);
scal_gpu(n, B1, m, 1);
scal_gpu(n, B2, v, 1);
axpy_gpu(n, -decay*batch, w, 1, d, 1);
axpy_ongpu(n, (1-B1), d, 1, m, 1);
mul_ongpu(n, d, 1, d, 1);
axpy_ongpu(n, (1-B2), d, 1, v, 1);
axpy_gpu(n, (1-B1), d, 1, m, 1);
mul_gpu(n, d, 1, d, 1);
axpy_gpu(n, (1-B2), d, 1, v, 1);
adam_gpu(n, w, m, v, B1, B2, rate/batch, eps, t);
fill_ongpu(n, 0, d, 1);
adam_gpu(n, w, m, v, B1, B2, rate, eps, t);
fill_gpu(n, 0, d, 1);
}
__global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial)
@ -556,35 +556,35 @@ extern "C" void variance_gpu(float *x, float *mean, int batch, int filters, int
check_error(cudaPeekAtLastError());
}
extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
extern "C" void axpy_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
{
axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
axpy_gpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY);
}
extern "C" void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
// Launch pow_kernel over N strided elements of X into Y; presumably computes
// Y[i*INCY] = pow(X[i*INCX], ALPHA) — kernel body not visible here, confirm.
extern "C" void pow_gpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY)
{
pow_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX, Y, INCY);
// Report launch-configuration errors without clearing the sticky error state.
check_error(cudaPeekAtLastError());
}
extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
// Launch axpy_kernel with explicit start offsets into X and Y; presumably the
// strided BLAS axpy Y += ALPHA*X over N elements — kernel body not shown, confirm.
extern "C" void axpy_gpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
{
axpy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY);
// Report launch-configuration errors; peek leaves the error state intact.
check_error(cudaPeekAtLastError());
}
extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY)
extern "C" void copy_gpu(int N, float * X, int INCX, float * Y, int INCY)
{
copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY);
copy_gpu_offset(N, X, 0, INCX, Y, 0, INCY);
}
extern "C" void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY)
// Launch mul_kernel over N strided elements; presumably elementwise
// Y[i*INCY] *= X[i*INCX] — kernel body not visible in this hunk, confirm.
extern "C" void mul_gpu(int N, float * X, int INCX, float * Y, int INCY)
{
mul_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, INCX, Y, INCY);
// Report launch-configuration errors without clearing the sticky error.
check_error(cudaPeekAtLastError());
}
extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
extern "C" void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY)
{
copy_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, OFFX, INCX, Y, OFFY, INCY);
check_error(cudaPeekAtLastError());
@ -607,58 +607,70 @@ __global__ void flatten_kernel(int N, float *x, int spatial, int layers, int bat
else out[i1] = x[i2];
}
extern "C" void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out)
// Launch flatten_kernel over all spatial*batch*layers elements; `forward`
// selects direction (flatten vs. un-flatten) — kernel body only partially
// visible here, confirm exact layout transform in the kernel.
extern "C" void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out)
{
// One thread per element of the full tensor.
int size = spatial*batch*layers;
flatten_kernel<<<cuda_gridsize(size), BLOCK>>>(size, x, spatial, layers, batch, forward, out);
// Report launch-configuration errors; peek does not clear the error state.
check_error(cudaPeekAtLastError());
}
extern "C" void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
// Launch reorg_kernel over the full w*h*c*batch tensor; presumably the
// space-to-depth style reorganization with the given stride, with `forward`
// selecting direction — kernel body not visible in this hunk, confirm.
extern "C" void reorg_gpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
{
// One thread per tensor element.
int size = w*h*c*batch;
reorg_kernel<<<cuda_gridsize(size), BLOCK>>>(size, x, w, h, c, batch, stride, forward, out);
// Report launch-configuration errors; peek leaves the sticky error in place.
check_error(cudaPeekAtLastError());
}
extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask)
// Multiply x[i] by `scale` wherever mask[i] equals mask_num; all other
// entries are left untouched. One thread per element over a grid that may
// spill into blockIdx.y for large n.
__global__ void scale_mask_kernel(int n, float *x, float mask_num, float *mask, float scale)
{
    int idx = (blockIdx.y*gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx >= n) return;                  // guard the grid tail
    if (mask[idx] == mask_num) x[idx] *= scale;
}
// Scale the entries of X whose mask value equals mask_num by `scale`
// (see scale_mask_kernel); other entries are unchanged.
extern "C" void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale)
{
// One thread per element of X.
scale_mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask, scale);
// Report launch-configuration errors without clearing the sticky error.
check_error(cudaPeekAtLastError());
}
// Launch mask_kernel over N elements of X; presumably overwrites entries of X
// selected by mask/mask_num — kernel body not visible in this hunk, confirm.
extern "C" void mask_gpu(int N, float * X, float mask_num, float * mask)
{
mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask);
// Report launch-configuration errors; peek does not clear the error state.
check_error(cudaPeekAtLastError());
}
extern "C" void const_ongpu(int N, float ALPHA, float * X, int INCX)
// Launch const_kernel over N strided elements of X; presumably sets
// X[i*INCX] = ALPHA — kernel body not visible in this hunk, confirm.
extern "C" void const_gpu(int N, float ALPHA, float * X, int INCX)
{
const_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
// Report launch-configuration errors without clearing the sticky error.
check_error(cudaPeekAtLastError());
}
extern "C" void constrain_ongpu(int N, float ALPHA, float * X, int INCX)
// Launch constrain_kernel over N strided elements of X; presumably clamps each
// element to [-ALPHA, ALPHA] — kernel body not visible in this hunk, confirm.
extern "C" void constrain_gpu(int N, float ALPHA, float * X, int INCX)
{
constrain_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
// Report launch-configuration errors; peek leaves the sticky error in place.
check_error(cudaPeekAtLastError());
}
extern "C" void add_ongpu(int N, float ALPHA, float * X, int INCX)
// Launch add_kernel over N strided elements of X; presumably adds the scalar
// ALPHA to each element in place — kernel body not visible here, confirm.
extern "C" void add_gpu(int N, float ALPHA, float * X, int INCX)
{
add_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
// Report launch-configuration errors without clearing the sticky error.
check_error(cudaPeekAtLastError());
}
extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX)
// Launch scal_kernel over N strided elements of X; presumably the BLAS scal
// X[i*INCX] *= ALPHA — kernel body not visible in this hunk, confirm.
extern "C" void scal_gpu(int N, float ALPHA, float * X, int INCX)
{
scal_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
// Report launch-configuration errors; peek does not clear the error state.
check_error(cudaPeekAtLastError());
}
extern "C" void supp_ongpu(int N, float ALPHA, float * X, int INCX)
// Launch supp_kernel over N strided elements of X; presumably suppresses
// (zeroes) elements relative to threshold ALPHA — kernel body not visible
// in this hunk, confirm.
extern "C" void supp_gpu(int N, float ALPHA, float * X, int INCX)
{
supp_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
// Report launch-configuration errors without clearing the sticky error.
check_error(cudaPeekAtLastError());
}
extern "C" void fill_ongpu(int N, float ALPHA, float * X, int INCX)
extern "C" void fill_gpu(int N, float ALPHA, float * X, int INCX)
{
fill_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
check_error(cudaPeekAtLastError());
@ -686,6 +698,9 @@ extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int
int minw = (w1 < w2) ? w1 : w2;
int minh = (h1 < h2) ? h1 : h2;
int minc = (c1 < c2) ? c1 : c2;
assert(w1 == w2);
assert(h1 == h2);
assert(c1 == c2);
int stride = w1/w2;
int sample = w2/w1;
@ -765,6 +780,46 @@ __global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float *
}
}
// Scatter-add the interleaved buffer OUT back into its two source arrays
// (the gradient counterpart of inter_kernel). Each of the B rows of OUT is
// laid out as [NX floats for X | NY floats for Y]; values are accumulated
// with +=. X or Y may be NULL, in which case that half is skipped.
// One thread per element of OUT.
__global__ void deinter_kernel(int NX, float *X, int NY, float *Y, int B, float *OUT)
{
    int idx = (blockIdx.y*gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    int row = NX + NY;                      // interleaved row length
    if (idx >= row*B) return;               // guard the grid tail
    int b = idx / row;                      // batch row
    int j = idx - b*row;                    // position within the row
    if (j < NX) {
        if (X) X[b*NX + j] += OUT[idx];
    } else {
        if (Y) Y[b*NY + (j - NX)] += OUT[idx];
    }
}
// Scatter-add the interleaved buffer OUT back into batched arrays X and Y
// (see deinter_kernel); X or Y may be NULL to skip that half.
extern "C" void deinter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
{
// One thread per element of OUT, which holds (NX+NY)*B floats.
deinter_kernel<<<cuda_gridsize((NX+NY)*B), BLOCK>>>(NX, X, NY, Y, B, OUT);
// Report launch-configuration errors; peek does not clear the error state.
check_error(cudaPeekAtLastError());
}
// Interleave two batched arrays into OUT: each of the B output rows is laid
// out as [NX floats from X | NY floats from Y]. One thread per element of OUT.
__global__ void inter_kernel(int NX, float *X, int NY, float *Y, int B, float *OUT)
{
    int idx = (blockIdx.y*gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
    int row = NX + NY;                      // interleaved row length
    if (idx >= row*B) return;               // guard the grid tail
    int b = idx / row;                      // batch row
    int j = idx - b*row;                    // position within the row
    OUT[idx] = (j < NX) ? X[b*NX + j] : Y[b*NY + (j - NX)];
}
// Interleave batched arrays X and Y into OUT, one [X row | Y row] pair per
// batch (see inter_kernel).
extern "C" void inter_gpu(int NX, float *X, int NY, float *Y, int B, float *OUT)
{
// One thread per element of OUT, which holds (NX+NY)*B floats.
inter_kernel<<<cuda_gridsize((NX+NY)*B), BLOCK>>>(NX, X, NY, Y, B, OUT);
// Report launch-configuration errors without clearing the sticky error.
check_error(cudaPeekAtLastError());
}
extern "C" void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c)
{
weighted_sum_kernel<<<cuda_gridsize(num), BLOCK>>>(num, a, b, s, c);
@ -776,8 +831,8 @@ __global__ void weighted_delta_kernel(int n, float *a, float *b, float *s, float
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if(i < n){
if(da) da[i] += dc[i] * s[i];
db[i] += dc[i] * (1-s[i]);
ds[i] += dc[i] * a[i] + dc[i] * -b[i];
if(db) db[i] += dc[i] * (1-s[i]);
ds[i] += dc[i] * (a[i] - b[i]);
}
}

View File

@ -6,7 +6,7 @@ void col2im_cpu(float* data_col,
int ksize, int stride, int pad, float* data_im);
#ifdef GPU
void col2im_ongpu(float *data_col,
void col2im_gpu(float *data_col,
int channels, int height, int width,
int ksize, int stride, int pad, float *data_im);
#endif

View File

@ -41,7 +41,7 @@ __global__ void col2im_gpu_kernel(const int n, const float* data_col,
}
}
void col2im_ongpu(float *data_col,
void col2im_gpu(float *data_col,
int channels, int height, int width,
int ksize, int stride, int pad, float *data_im){
// We are going to launch channels * height_col * width_col kernels, each

View File

@ -271,23 +271,23 @@ void update_connected_layer_gpu(layer l, update_args a)
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t);
}
}else{
axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1);
if(l.batch_normalize){
axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1);
axpy_gpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
scal_gpu(l.outputs, momentum, l.scale_updates_gpu, 1);
}
axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
axpy_gpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_gpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_gpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
}
}
void forward_connected_layer_gpu(layer l, network net)
{
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
int m = l.batch;
int k = l.inputs;
@ -295,20 +295,20 @@ void forward_connected_layer_gpu(layer l, network net)
float * a = net.input_gpu;
float * b = l.weights_gpu;
float * c = l.output_gpu;
gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
if (l.batch_normalize) {
forward_batchnorm_layer_gpu(l, net);
} else {
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.outputs, 1);
}
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
void backward_connected_layer_gpu(layer l, network net)
{
constrain_ongpu(l.outputs*l.batch, 5, l.delta_gpu, 1);
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
if(l.batch_normalize){
backward_batchnorm_layer_gpu(l, net);
} else {
@ -321,7 +321,7 @@ void backward_connected_layer_gpu(layer l, network net)
float * a = l.delta_gpu;
float * b = net.input_gpu;
float * c = l.weight_updates_gpu;
gemm_ongpu(1,0,m,n,k,1,a,m,b,n,1,c,n);
gemm_gpu(1,0,m,n,k,1,a,m,b,n,1,c,n);
m = l.batch;
k = l.outputs;
@ -331,6 +331,6 @@ void backward_connected_layer_gpu(layer l, network net)
b = l.weights_gpu;
c = net.delta_gpu;
if(c) gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
if(c) gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
#endif

View File

@ -72,7 +72,7 @@ void binarize_weights_gpu(float *weights, int n, int size, float *binary)
void forward_convolutional_layer_gpu(convolutional_layer l, network net)
{
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
if(l.binary){
binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
swap_binary(&l);
@ -107,11 +107,11 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net)
int k = l.size*l.size*l.c;
int n = l.out_w*l.out_h;
for(i = 0; i < l.batch; ++i){
im2col_ongpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
im2col_gpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
float * a = l.weights_gpu;
float * b = net.workspace;
float * c = l.output_gpu;
gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
gemm_gpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
}
#endif
@ -121,7 +121,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network net)
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
}
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
//if(l.dot > 0) dot_error_gpu(l);
if(l.binary || l.xnor) swap_binary(&l);
}
@ -173,8 +173,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
if(l.smooth){
smooth_layer(l, 5, l.smooth);
}
constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
if(l.batch_normalize){
@ -217,7 +217,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
l.dsrcTensorDesc,
net.delta_gpu);
if(l.binary || l.xnor) swap_binary(&l);
if(l.xnor) gradient_array_ongpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
if(l.xnor) gradient_array_gpu(original_input, l.batch*l.c*l.h*l.w, HARDTAN, net.delta_gpu);
}
#else
@ -231,8 +231,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
float * b = net.workspace;
float * c = l.weight_updates_gpu;
im2col_ongpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);
im2col_gpu(net.input_gpu + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, net.workspace);
gemm_gpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);
if(net.delta_gpu){
if(l.binary || l.xnor) swap_binary(&l);
@ -240,13 +240,13 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network net)
float * b = l.delta_gpu;
float * c = net.workspace;
gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k);
gemm_gpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k);
col2im_ongpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu + i*l.c*l.h*l.w);
col2im_gpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu + i*l.c*l.h*l.w);
if(l.binary || l.xnor) {
swap_binary(&l);
}
if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
if(l.xnor) gradient_array_gpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, net.delta_gpu + i*l.c*l.h*l.w);
}
}
#endif
@ -294,16 +294,16 @@ void update_convolutional_layer_gpu(layer l, update_args a)
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
}
}else{
axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_gpu(size, momentum, l.weight_updates_gpu, 1);
axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1);
axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
if(l.scales_gpu){
axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1);
axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
}
}
}

View File

@ -9,6 +9,7 @@
COST_TYPE get_cost_type(char *s)
{
if (strcmp(s, "seg")==0) return SEG;
if (strcmp(s, "sse")==0) return SSE;
if (strcmp(s, "masked")==0) return MASKED;
if (strcmp(s, "smooth")==0) return SMOOTH;
@ -20,6 +21,8 @@ COST_TYPE get_cost_type(char *s)
char *get_cost_string(COST_TYPE a)
{
switch(a){
case SEG:
return "seg";
case SSE:
return "sse";
case MASKED:
@ -122,11 +125,11 @@ void forward_cost_layer_gpu(cost_layer l, network net)
{
if (!net.truth_gpu) return;
if(l.smooth){
scal_ongpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1);
add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1);
scal_gpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1);
add_gpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1);
}
if (l.cost_type == MASKED) {
mask_ongpu(l.batch*l.inputs, net.input_gpu, SECRET_NUM, net.truth_gpu);
mask_gpu(l.batch*l.inputs, net.input_gpu, SECRET_NUM, net.truth_gpu);
}
if(l.cost_type == SMOOTH){
@ -137,6 +140,11 @@ void forward_cost_layer_gpu(cost_layer l, network net)
l2_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu);
}
if (l.cost_type == SEG && l.noobject_scale != 1) {
scale_mask_gpu(l.batch*l.inputs, l.delta_gpu, 0, net.truth_gpu, l.noobject_scale);
scale_mask_gpu(l.batch*l.inputs, l.output_gpu, 0, net.truth_gpu, l.noobject_scale);
}
if(l.ratio){
cuda_pull_array(l.delta_gpu, l.delta, l.batch*l.inputs);
qsort(l.delta, l.batch*l.inputs, sizeof(float), float_abs_compare);
@ -144,11 +152,11 @@ void forward_cost_layer_gpu(cost_layer l, network net)
float thresh = l.delta[n];
thresh = 0;
printf("%f\n", thresh);
supp_ongpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
supp_gpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
}
if(l.thresh){
supp_ongpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
supp_gpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
}
cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs);
@ -157,7 +165,7 @@ void forward_cost_layer_gpu(cost_layer l, network net)
void backward_cost_layer_gpu(const cost_layer l, network net)
{
axpy_ongpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, net.delta_gpu, 1);
axpy_gpu(l.batch*l.inputs, l.scale, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif

View File

@ -209,10 +209,10 @@ void forward_crnn_layer_gpu(layer l, network net)
layer self_layer = *(l.self_layer);
layer output_layer = *(l.output_layer);
fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
if(net.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
fill_gpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
fill_gpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
if(net.train) fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
for (i = 0; i < l.steps; ++i) {
s.input_gpu = net.input_gpu;
@ -224,12 +224,12 @@ void forward_crnn_layer_gpu(layer l, network net)
float *old_state = l.state_gpu;
if(net.train) l.state_gpu += l.hidden*l.batch;
if(l.shortcut){
copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
copy_gpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
}else{
fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
fill_gpu(l.hidden * l.batch, 0, l.state_gpu, 1);
}
axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
axpy_gpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
s.input_gpu = l.state_gpu;
forward_convolutional_layer_gpu(output_layer, s);
@ -254,8 +254,8 @@ void backward_crnn_layer_gpu(layer l, network net)
increment_layer(&output_layer, l.steps - 1);
l.state_gpu += l.hidden*l.batch*l.steps;
for (i = l.steps-1; i >= 0; --i) {
copy_ongpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
copy_gpu(l.hidden * l.batch, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_gpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
s.input_gpu = l.state_gpu;
s.delta_gpu = self_layer.delta_gpu;
@ -268,8 +268,8 @@ void backward_crnn_layer_gpu(layer l, network net)
if (i == 0) s.delta_gpu = 0;
backward_convolutional_layer_gpu(self_layer, s);
copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
copy_gpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
if (i > 0 && l.shortcut) axpy_gpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
s.input_gpu = net.input_gpu + i*l.inputs*l.batch;
if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch;
else s.delta_gpu = 0;

View File

@ -97,7 +97,7 @@ float *cuda_make_array(float *x, size_t n)
status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
check_error(status);
} else {
fill_ongpu(n, 0, x_gpu, 1);
fill_gpu(n, 0, x_gpu, 1);
}
if(!x_gpu) error("Cuda malloc failed\n");
return x_gpu;

View File

@ -551,6 +551,33 @@ void exclusive_image(image src)
}
image get_segmentation_image(char *path, int w, int h, int classes)
{
    /* Build a per-class segmentation mask for the sample at `path`.
     * The label file is located by rewriting the image path
     * ("images"/"JPEGImages" -> "mask", image extension -> ".txt");
     * each record in it is a class id followed by a run-length-encoded
     * region token, which is OR'd into channel `id` of the mask.
     * Returns a w x h x classes image; caller frees it.
     * NOTE(review): find_replace appears to replace only the first
     * occurrence per call — confirm against its definition. */
    char labelpath[4096];
    find_replace(path, "images", "mask", labelpath);
    find_replace(labelpath, "JPEGImages", "mask", labelpath);
    find_replace(labelpath, ".jpg", ".txt", labelpath);
    find_replace(labelpath, ".JPG", ".txt", labelpath);
    find_replace(labelpath, ".JPEG", ".txt", labelpath);
    image mask = make_image(w, h, classes);
    FILE *file = fopen(labelpath, "r");
    if(!file) file_error(labelpath);
    char buff[32788];
    int id;
    image part = make_image(w, h, 1);
    /* Field width caps the RLE token at sizeof(buff)-1 characters; the
     * original unbounded %s could overflow `buff` on a long token. */
    while(fscanf(file, "%d %32787s", &id, buff) == 2){
        int n = 0;
        int *rle = read_intlist(buff, &n, 0);
        load_rle(part, rle, n);
        or_image(part, mask, id);
        free(rle);
    }
    //exclusive_image(mask);
    fclose(file);
    free_image(part);
    return mask;
}
image get_segmentation_image2(char *path, int w, int h, int classes)
{
char labelpath[4096];
find_replace(path, "images", "mask", labelpath);
@ -584,7 +611,7 @@ image get_segmentation_image(char *path, int w, int h, int classes)
return mask;
}
data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int min, int max, float angle, float aspect, float hue, float saturation, float exposure)
data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int min, int max, float angle, float aspect, float hue, float saturation, float exposure, int div)
{
char **random_paths = get_random_paths(paths, n, m);
int i;
@ -597,12 +624,14 @@ data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int mi
d.y.rows = n;
d.y.cols = h*w*(classes+1);
d.y.cols = h*w*classes/div/div;
d.y.vals = calloc(d.X.rows, sizeof(float*));
for(i = 0; i < n; ++i){
image orig = load_image_color(random_paths[i], 0, 0);
augment_args a = random_augment_args(orig, angle, aspect, min, max, w, h);
a.dx = 0;
a.dy = 0;
image sized = rotate_crop_image(orig, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
int flip = rand()%2;
@ -612,7 +641,7 @@ data load_data_seg(int n, char **paths, int m, int w, int h, int classes, int mi
image mask = get_segmentation_image(random_paths[i], orig.w, orig.h, classes);
//image mask = make_image(orig.w, orig.h, classes+1);
image sized_m = rotate_crop_image(mask, a.rad, a.scale, a.w, a.h, a.dx, a.dy, a.aspect);
image sized_m = rotate_crop_image(mask, a.rad, a.scale/div, a.w/div, a.h/div, a.dx, a.dy, a.aspect);
if(flip) flip_image(sized_m);
d.y.vals[i] = sized_m.data;
@ -874,7 +903,7 @@ void *load_thread(void *ptr)
} else if (a.type == WRITING_DATA){
*a.d = load_data_writing(a.paths, a.n, a.m, a.w, a.h, a.out_w, a.out_h);
} else if (a.type == SEGMENTATION_DATA){
*a.d = load_data_seg(a.n, a.paths, a.m, a.w, a.h, a.classes, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
*a.d = load_data_seg(a.n, a.paths, a.m, a.w, a.h, a.classes, a.min, a.max, a.angle, a.aspect, a.hue, a.saturation, a.exposure, a.scale);
} else if (a.type == REGION_DATA){
*a.d = load_data_region(a.n, a.paths, a.m, a.w, a.h, a.num_boxes, a.classes, a.jitter, a.hue, a.saturation, a.exposure);
} else if (a.type == DETECTION_DATA){

View File

@ -22,31 +22,31 @@ extern "C" void forward_deconvolutional_layer_gpu(layer l, network net)
int n = l.h*l.w;
int k = l.c;
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
for(i = 0; i < l.batch; ++i){
float *a = l.weights_gpu;
float *b = net.input_gpu + i*l.c*l.h*l.w;
float *c = net.workspace;
gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
gemm_gpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
col2im_ongpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs);
col2im_gpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.outputs);
}
if (l.batch_normalize) {
forward_batchnorm_layer_gpu(l, net);
} else {
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
}
activate_array_ongpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation);
activate_array_gpu(l.output_gpu, l.batch*l.n*l.out_w*l.out_h, l.activation);
}
extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
{
int i;
constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
if(l.batch_normalize){
backward_batchnorm_layer_gpu(l, net);
@ -65,9 +65,9 @@ extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
float *b = net.workspace;
float *c = l.weight_updates_gpu;
im2col_ongpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w,
im2col_gpu(l.delta_gpu + i*l.outputs, l.out_c, l.out_h, l.out_w,
l.size, l.stride, l.pad, b);
gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
gemm_gpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
if(net.delta_gpu){
int m = l.c;
@ -78,7 +78,7 @@ extern "C" void backward_deconvolutional_layer_gpu(layer l, network net)
float *b = net.workspace;
float *c = net.delta_gpu + i*n*m;
gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
gemm_gpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
}
}
}
@ -125,16 +125,16 @@ void update_deconvolutional_layer_gpu(layer l, update_args a)
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
}
}else{
axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_gpu(size, momentum, l.weight_updates_gpu, 1);
axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1);
axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_gpu(l.n, momentum, l.bias_updates_gpu, 1);
if(l.scales_gpu){
axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1);
axpy_gpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
scal_gpu(l.n, momentum, l.scale_updates_gpu, 1);
}
}
}

View File

@ -256,7 +256,7 @@ void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box
void forward_detection_layer_gpu(const detection_layer l, network net)
{
if(!net.train){
copy_ongpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
return;
}
@ -270,8 +270,8 @@ void forward_detection_layer_gpu(const detection_layer l, network net)
void backward_detection_layer_gpu(detection_layer l, network net)
{
axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
//copy_ongpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1);
axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
//copy_gpu(l.batch*l.inputs, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif

View File

@ -165,7 +165,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
#include <math.h>
void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
float *A_gpu, int lda,
float *B_gpu, int ldb,
float BETA,
@ -177,24 +177,6 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
check_error(status);
}
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
float *A, int lda,
float *B, int ldb,
float BETA,
float *C, int ldc)
{
float *A_gpu = cuda_make_array(A, (TA ? lda*K:lda*M));
float *B_gpu = cuda_make_array(B, (TB ? ldb*N : ldb*K));
float *C_gpu = cuda_make_array(C, ldc*M);
gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
cuda_pull_array(C_gpu, C, ldc*M);
cuda_free(A_gpu);
cuda_free(B_gpu);
cuda_free(C_gpu);
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -224,7 +206,7 @@ void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
free(c);
}
void time_ongpu(int TA, int TB, int m, int k, int n)
void time_gpu(int TA, int TB, int m, int k, int n)
{
int iter = 10;
float *a = random_matrix(m,k);
@ -242,7 +224,7 @@ void time_ongpu(int TA, int TB, int m, int k, int n)
int i;
clock_t start = clock(), end;
for(i = 0; i<iter; ++i){
gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
gemm_gpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
cudaThreadSynchronize();
}
double flop = ((double)m)*n*(2.*k + 2.)*iter;
@ -313,24 +295,24 @@ int test_gpu_blas()
test_gpu_accuracy(0,0,10,10,10);
time_ongpu(0,0,64,2916,363);
time_ongpu(0,0,64,2916,363);
time_ongpu(0,0,64,2916,363);
time_ongpu(0,0,192,729,1600);
time_ongpu(0,0,384,196,1728);
time_ongpu(0,0,256,196,3456);
time_ongpu(0,0,256,196,2304);
time_ongpu(0,0,128,4096,12544);
time_ongpu(0,0,128,4096,4096);
time_gpu(0,0,64,2916,363);
time_gpu(0,0,64,2916,363);
time_gpu(0,0,64,2916,363);
time_gpu(0,0,192,729,1600);
time_gpu(0,0,384,196,1728);
time_gpu(0,0,256,196,3456);
time_gpu(0,0,256,196,2304);
time_gpu(0,0,128,4096,12544);
time_gpu(0,0,128,4096,4096);
*/
time_ongpu(0,0,64,75,12544);
time_ongpu(0,0,64,75,12544);
time_ongpu(0,0,64,75,12544);
time_ongpu(0,0,64,576,12544);
time_ongpu(0,0,256,2304,784);
time_ongpu(1,1,2304,256,784);
time_ongpu(0,0,512,4608,196);
time_ongpu(1,1,4608,512,196);
time_gpu(0,0,64,75,12544);
time_gpu(0,0,64,75,12544);
time_gpu(0,0,64,75,12544);
time_gpu(0,0,64,576,12544);
time_gpu(0,0,256,2304,784);
time_gpu(1,1,2304,256,784);
time_gpu(0,0,512,4608,196);
time_gpu(1,1,4608,512,196);
return 0;
}

View File

@ -19,7 +19,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
float *C, int ldc);
#ifdef GPU
void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
float *A_gpu, int lda,
float *B_gpu, int ldb,
float BETA,

View File

@ -238,16 +238,16 @@ void forward_gru_layer_gpu(layer l, network net)
layer wr = *(l.wr);
layer wh = *(l.wh);
fill_ongpu(l.outputs * l.batch * l.steps, 0, uz.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, ur.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, uh.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, uz.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, ur.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, uh.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, wz.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, wr.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, wh.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, wz.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, wr.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, wh.delta_gpu, 1);
if(net.train) {
fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
}
for (i = 0; i < l.steps; ++i) {
@ -260,32 +260,32 @@ void forward_gru_layer_gpu(layer l, network net)
forward_connected_layer_gpu(ur, s);
forward_connected_layer_gpu(uh, s);
copy_ongpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
copy_ongpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
s.input_gpu = l.forgot_state_gpu;
forward_connected_layer_gpu(wh, s);
copy_ongpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
if(l.tanh){
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
} else {
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
}
weighted_sum_gpu(l.state_gpu, l.h_gpu, l.z_gpu, l.outputs*l.batch, l.output_gpu);
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);
net.input_gpu += l.inputs*l.batch;
l.output_gpu += l.outputs*l.batch;
@ -324,56 +324,58 @@ void backward_gru_layer_gpu(layer l, network net)
if(net.delta_gpu) net.delta_gpu += l.inputs*l.batch*(l.steps-1);
l.output_gpu += l.outputs*l.batch*(l.steps-1);
l.delta_gpu += l.outputs*l.batch*(l.steps-1);
float *end_state = l.output_gpu;
for (i = l.steps-1; i >= 0; --i) {
if(i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
if(i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
else copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1);
float *prev_delta_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
copy_ongpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
copy_gpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
copy_ongpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
copy_gpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
copy_ongpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
copy_gpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
if(l.tanh){
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
} else {
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
}
weighted_delta_gpu(l.prev_state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, uh.delta_gpu, uz.delta_gpu, l.outputs*l.batch, l.delta_gpu);
weighted_delta_gpu(l.state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, uh.delta_gpu, uz.delta_gpu, l.outputs*l.batch, l.delta_gpu);
if(l.tanh){
gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH, uh.delta_gpu);
gradient_array_gpu(l.h_gpu, l.outputs*l.batch, TANH, uh.delta_gpu);
} else {
gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, uh.delta_gpu);
gradient_array_gpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, uh.delta_gpu);
}
copy_ongpu(l.outputs*l.batch, uh.delta_gpu, 1, wh.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, uh.delta_gpu, 1, wh.delta_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.forgot_state_gpu, 1);
mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
fill_ongpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
mul_gpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
fill_gpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);
s.input_gpu = l.forgot_state_gpu;
s.delta_gpu = l.forgot_delta_gpu;
backward_connected_layer_gpu(wh, s);
if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu);
mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.prev_state_gpu, ur.delta_gpu);
mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.state_gpu, ur.delta_gpu);
gradient_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, ur.delta_gpu);
copy_ongpu(l.outputs*l.batch, ur.delta_gpu, 1, wr.delta_gpu, 1);
gradient_array_gpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, ur.delta_gpu);
copy_gpu(l.outputs*l.batch, ur.delta_gpu, 1, wr.delta_gpu, 1);
gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, uz.delta_gpu);
copy_ongpu(l.outputs*l.batch, uz.delta_gpu, 1, wz.delta_gpu, 1);
gradient_array_gpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, uz.delta_gpu);
copy_gpu(l.outputs*l.batch, uz.delta_gpu, 1, wz.delta_gpu, 1);
s.input_gpu = l.prev_state_gpu;
s.input_gpu = l.state_gpu;
s.delta_gpu = prev_delta_gpu;
backward_connected_layer_gpu(wr, s);
@ -399,5 +401,6 @@ void backward_gru_layer_gpu(layer l, network net)
increment_layer(&wr, -1);
increment_layer(&wh, -1);
}
copy_gpu(l.outputs*l.batch, end_state, 1, l.state_gpu, 1);
}
#endif

View File

@ -7,7 +7,7 @@ void im2col_cpu(float* data_im,
#ifdef GPU
void im2col_ongpu(float *im,
void im2col_gpu(float *im,
int channels, int height, int width,
int ksize, int stride, int pad,float *data_col);

View File

@ -45,7 +45,7 @@ __global__ void im2col_gpu_kernel(const int n, const float* data_im,
}
}
void im2col_ongpu(float *im,
void im2col_gpu(float *im,
int channels, int height, int width,
int ksize, int stride, int pad, float *data_col){
// We are going to launch channels * height_col * width_col kernels, each

View File

@ -191,12 +191,12 @@ void forward_local_layer_gpu(const local_layer l, network net)
int locations = out_h * out_w;
for(i = 0; i < l.batch; ++i){
copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
copy_gpu(l.outputs, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
}
for(i = 0; i < l.batch; ++i){
float *input = net.input_gpu + i*l.w*l.h*l.c;
im2col_ongpu(input, l.c, l.h, l.w,
im2col_gpu(input, l.c, l.h, l.w,
l.size, l.stride, l.pad, net.workspace);
float *output = l.output_gpu + i*l.outputs;
for(j = 0; j < locations; ++j){
@ -208,10 +208,10 @@ void forward_local_layer_gpu(const local_layer l, network net)
int n = 1;
int k = l.size*l.size*l.c;
gemm_ongpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations);
gemm_gpu(0,0,m,n,k,1,a,k,b,locations,1,c,locations);
}
}
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
void backward_local_layer_gpu(local_layer l, network net)
@ -219,14 +219,14 @@ void backward_local_layer_gpu(local_layer l, network net)
int i, j;
int locations = l.out_w*l.out_h;
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
for(i = 0; i < l.batch; ++i){
axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
axpy_gpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
}
for(i = 0; i < l.batch; ++i){
float *input = net.input_gpu + i*l.w*l.h*l.c;
im2col_ongpu(input, l.c, l.h, l.w,
im2col_gpu(input, l.c, l.h, l.w,
l.size, l.stride, l.pad, net.workspace);
for(j = 0; j < locations; ++j){
@ -237,7 +237,7 @@ void backward_local_layer_gpu(local_layer l, network net)
int n = l.size*l.size*l.c;
int k = 1;
gemm_ongpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
gemm_gpu(0,1,m,n,k,1,a,locations,b,locations,1,c,n);
}
if(net.delta_gpu){
@ -250,10 +250,10 @@ void backward_local_layer_gpu(local_layer l, network net)
int n = 1;
int k = l.n;
gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
gemm_gpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
}
col2im_ongpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu+i*l.c*l.h*l.w);
col2im_gpu(net.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, net.delta_gpu+i*l.c*l.h*l.w);
}
}
}
@ -267,12 +267,12 @@ void update_local_layer_gpu(local_layer l, update_args a)
int locations = l.out_w*l.out_h;
int size = l.size*l.size*l.c*l.n*locations;
axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
axpy_gpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
scal_gpu(l.outputs, momentum, l.bias_updates_gpu, 1);
axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
scal_gpu(size, momentum, l.weight_updates_gpu, 1);
}
void pull_local_layer(local_layer l)

View File

@ -41,36 +41,36 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n
*(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
l.uf->batch = batch;
l.wf = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
l.wf->batch = batch;
l.ui = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.ui) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
l.ui->batch = batch;
l.wi = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
l.wi->batch = batch;
l.ug = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.ug) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
l.ug->batch = batch;
l.wg = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
l.wg->batch = batch;
l.uo = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.uo) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
l.uo->batch = batch;
l.wf = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
l.wf->batch = batch;
l.wi = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
l.wi->batch = batch;
l.wg = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
l.wg->batch = batch;
l.wo = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.wo) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
@ -410,17 +410,17 @@ void forward_lstm_layer_gpu(layer l, network state)
layer ug = *(l.ug);
layer uo = *(l.uo);
fill_ongpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, wf.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, wi.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, wg.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, wo.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1);
fill_ongpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, uf.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, ui.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, ug.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, uo.delta_gpu, 1);
if (state.train) {
fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
}
for (i = 0; i < l.steps; ++i) {
@ -436,34 +436,34 @@ void forward_lstm_layer_gpu(layer l, network state)
forward_connected_layer_gpu(ug, s);
forward_connected_layer_gpu(uo, s);
copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH);
activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
copy_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);
copy_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1);
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);
copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1);
activate_array_gpu(l.h_gpu, l.outputs*l.batch, TANH);
mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);
copy_gpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
state.input_gpu += l.inputs*l.batch;
l.output_gpu += l.outputs*l.batch;
@ -514,97 +514,97 @@ void backward_lstm_layer_gpu(layer l, network state)
l.delta_gpu += l.outputs*l.batch*(l.steps - 1);
for (i = l.steps - 1; i >= 0; --i) {
if (i != 0) copy_ongpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1);
if (i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1);
if (i != 0) copy_gpu(l.outputs*l.batch, l.cell_gpu - l.outputs*l.batch, 1, l.prev_cell_gpu, 1);
copy_gpu(l.outputs*l.batch, l.cell_gpu, 1, l.c_gpu, 1);
if (i != 0) copy_gpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
copy_gpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1);
l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
copy_gpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
copy_gpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
copy_gpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
copy_gpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
axpy_gpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
activate_array_gpu(l.g_gpu, l.outputs*l.batch, TANH);
activate_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1);
copy_gpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH);
copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);
copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);
mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);
copy_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);
mul_gpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1);
gradient_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu);
axpy_ongpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);
gradient_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu);
axpy_gpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH);
mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);
gradient_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1);
activate_array_gpu(l.temp_gpu, l.outputs*l.batch, TANH);
mul_gpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);
gradient_array_gpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
s.input_gpu = l.prev_state_gpu;
s.delta_gpu = l.dh_gpu;
backward_connected_layer_gpu(wo, s);
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
s.input_gpu = state.input_gpu;
s.delta_gpu = state.delta_gpu;
backward_connected_layer_gpu(uo, s);
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
mul_gpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
gradient_array_gpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
s.input_gpu = l.prev_state_gpu;
s.delta_gpu = l.dh_gpu;
backward_connected_layer_gpu(wg, s);
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
s.input_gpu = state.input_gpu;
s.delta_gpu = state.delta_gpu;
backward_connected_layer_gpu(ug, s);
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
gradient_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
mul_gpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
gradient_array_gpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
s.input_gpu = l.prev_state_gpu;
s.delta_gpu = l.dh_gpu;
backward_connected_layer_gpu(wi, s);
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
s.input_gpu = state.input_gpu;
s.delta_gpu = state.delta_gpu;
backward_connected_layer_gpu(ui, s);
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
mul_ongpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
gradient_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
mul_gpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
gradient_array_gpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
s.input_gpu = l.prev_state_gpu;
s.delta_gpu = l.dh_gpu;
backward_connected_layer_gpu(wf, s);
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
s.input_gpu = state.input_gpu;
s.delta_gpu = state.delta_gpu;
backward_connected_layer_gpu(uf, s);
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);
copy_gpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
mul_gpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
copy_gpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);
state.input_gpu -= l.inputs*l.batch;
if (state.delta_gpu) state.delta_gpu -= l.inputs*l.batch;

View File

@ -42,7 +42,7 @@ void forward_network_gpu(network net)
net.index = i;
layer l = net.layers[i];
if(l.delta_gpu){
fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
fill_gpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
}
l.forward_gpu(l, net);
net.input_gpu = l.output_gpu;
@ -107,9 +107,9 @@ void harmless_update_network_gpu(network net)
int i;
for(i = 0; i < net.n; ++i){
layer l = net.layers[i];
if(l.weight_updates_gpu) fill_ongpu(l.nweights, 0, l.weight_updates_gpu, 1);
if(l.bias_updates_gpu) fill_ongpu(l.nbiases, 0, l.bias_updates_gpu, 1);
if(l.scale_updates_gpu) fill_ongpu(l.nbiases, 0, l.scale_updates_gpu, 1);
if(l.weight_updates_gpu) fill_gpu(l.nweights, 0, l.weight_updates_gpu, 1);
if(l.bias_updates_gpu) fill_gpu(l.nbiases, 0, l.bias_updates_gpu, 1);
if(l.scale_updates_gpu) fill_gpu(l.nbiases, 0, l.scale_updates_gpu, 1);
}
}
@ -383,6 +383,7 @@ float train_networks(network *nets, int n, data d, int interval)
float sum = 0;
for(i = 0; i < n; ++i){
nets[i].learning_rate *= n;
data p = get_data_part(d, i, n);
threads[i] = train_network_in_thread(nets[i], p, errors + i);
}

View File

@ -113,29 +113,29 @@ void forward_normalization_layer_gpu(const layer layer, network net)
int w = layer.w;
int h = layer.h;
int c = layer.c;
scal_ongpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1);
scal_gpu(w*h*c*layer.batch, 0, layer.squared_gpu, 1);
for(b = 0; b < layer.batch; ++b){
float *squared = layer.squared_gpu + w*h*c*b;
float *norms = layer.norms_gpu + w*h*c*b;
float *input = net.input_gpu + w*h*c*b;
pow_ongpu(w*h*c, 2, input, 1, squared, 1);
pow_gpu(w*h*c, 2, input, 1, squared, 1);
const_ongpu(w*h, layer.kappa, norms, 1);
const_gpu(w*h, layer.kappa, norms, 1);
for(k = 0; k < layer.size/2; ++k){
axpy_ongpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1);
axpy_gpu(w*h, layer.alpha, squared + w*h*k, 1, norms, 1);
}
for(k = 1; k < layer.c; ++k){
copy_ongpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1);
copy_gpu(w*h, norms + w*h*(k-1), 1, norms + w*h*k, 1);
int prev = k - ((layer.size-1)/2) - 1;
int next = k + (layer.size/2);
if(prev >= 0) axpy_ongpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1);
if(next < layer.c) axpy_ongpu(w*h, layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1);
if(prev >= 0) axpy_gpu(w*h, -layer.alpha, squared + w*h*prev, 1, norms + w*h*k, 1);
if(next < layer.c) axpy_gpu(w*h, layer.alpha, squared + w*h*next, 1, norms + w*h*k, 1);
}
}
pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1);
mul_ongpu(w*h*c*layer.batch, net.input_gpu, 1, layer.output_gpu, 1);
pow_gpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, layer.output_gpu, 1);
mul_gpu(w*h*c*layer.batch, net.input_gpu, 1, layer.output_gpu, 1);
}
void backward_normalization_layer_gpu(const layer layer, network net)
@ -145,7 +145,7 @@ void backward_normalization_layer_gpu(const layer layer, network net)
int w = layer.w;
int h = layer.h;
int c = layer.c;
pow_ongpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, net.delta_gpu, 1);
mul_ongpu(w*h*c*layer.batch, layer.delta_gpu, 1, net.delta_gpu, 1);
pow_gpu(w*h*c*layer.batch, -layer.beta, layer.norms_gpu, 1, net.delta_gpu, 1);
mul_gpu(w*h*c*layer.batch, layer.delta_gpu, 1, net.delta_gpu, 1);
}
#endif

View File

@ -213,13 +213,11 @@ layer parse_crnn(list *options, size_params params)
layer parse_rnn(list *options, size_params params)
{
int output = option_find_int(options, "output",1);
int hidden = option_find_int(options, "hidden",1);
char *activation_s = option_find_str(options, "activation", "logistic");
ACTIVATION activation = get_activation(activation_s);
int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
int logistic = option_find_int_quiet(options, "logistic", 0);
layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize, logistic, params.net.adam);
layer l = make_rnn_layer(params.batch, params.inputs, output, params.time_steps, activation, batch_normalize, params.net.adam);
l.shortcut = option_find_int_quiet(options, "shortcut", 0);
@ -353,6 +351,7 @@ cost_layer parse_cost(list *options, size_params params)
float scale = option_find_float_quiet(options, "scale",1);
cost_layer layer = make_cost_layer(params.batch, params.inputs, type, scale);
layer.ratio = option_find_float_quiet(options, "ratio",0);
layer.noobject_scale = option_find_float_quiet(options, "noobj", 1);
layer.thresh = option_find_float_quiet(options, "thresh",0);
return layer;
}
@ -921,12 +920,18 @@ void save_weights_upto(network net, char *filename, int cutoff)
save_connected_weights(*(l.uo), fp);
save_connected_weights(*(l.ug), fp);
} if (l.type == GRU) {
save_connected_weights(*(l.wz), fp);
save_connected_weights(*(l.wr), fp);
save_connected_weights(*(l.wh), fp);
save_connected_weights(*(l.uz), fp);
save_connected_weights(*(l.ur), fp);
save_connected_weights(*(l.uh), fp);
if(1){
save_connected_weights(*(l.wz), fp);
save_connected_weights(*(l.wr), fp);
save_connected_weights(*(l.wh), fp);
save_connected_weights(*(l.uz), fp);
save_connected_weights(*(l.ur), fp);
save_connected_weights(*(l.uh), fp);
}else{
save_connected_weights(*(l.reset_layer), fp);
save_connected_weights(*(l.update_layer), fp);
save_connected_weights(*(l.state_layer), fp);
}
} if(l.type == CRNN){
save_convolutional_weights(*(l.input_layer), fp);
save_convolutional_weights(*(l.self_layer), fp);
@ -1132,12 +1137,18 @@ void load_weights_upto(network *net, char *filename, int start, int cutoff)
load_connected_weights(*(l.ug), fp, transpose);
}
if (l.type == GRU) {
load_connected_weights(*(l.wz), fp, transpose);
load_connected_weights(*(l.wr), fp, transpose);
load_connected_weights(*(l.wh), fp, transpose);
load_connected_weights(*(l.uz), fp, transpose);
load_connected_weights(*(l.ur), fp, transpose);
load_connected_weights(*(l.uh), fp, transpose);
if(1){
load_connected_weights(*(l.wz), fp, transpose);
load_connected_weights(*(l.wr), fp, transpose);
load_connected_weights(*(l.wh), fp, transpose);
load_connected_weights(*(l.uz), fp, transpose);
load_connected_weights(*(l.ur), fp, transpose);
load_connected_weights(*(l.uh), fp, transpose);
}else{
load_connected_weights(*(l.reset_layer), fp, transpose);
load_connected_weights(*(l.update_layer), fp, transpose);
load_connected_weights(*(l.state_layer), fp, transpose);
}
}
if(l.type == LOCAL){
int locations = l.out_w*l.out_h;

View File

@ -434,14 +434,14 @@ void get_region_boxes(layer l, int w, int h, int netw, int neth, float thresh, f
void forward_region_layer_gpu(const layer l, network net)
{
copy_ongpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
int b, n;
for (b = 0; b < l.batch; ++b){
for(n = 0; n < l.n; ++n){
int index = entry_index(l, b, n*l.w*l.h, 0);
activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC);
activate_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC);
index = entry_index(l, b, n*l.w*l.h, l.coords);
if(!l.background) activate_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC);
if(!l.background) activate_array_gpu(l.output_gpu + index, l.w*l.h, LOGISTIC);
}
}
if (l.softmax_tree){
@ -545,12 +545,12 @@ void backward_region_layer_gpu(const layer l, network net)
for (b = 0; b < l.batch; ++b){
for(n = 0; n < l.n; ++n){
int index = entry_index(l, b, n*l.w*l.h, 0);
gradient_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index);
gradient_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index);
index = entry_index(l, b, n*l.w*l.h, l.coords);
if(!l.background) gradient_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index);
if(!l.background) gradient_array_gpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index);
}
}
axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif

View File

@ -136,18 +136,18 @@ void forward_reorg_layer_gpu(layer l, network net)
int i;
if(l.flatten){
if(l.reverse){
flatten_ongpu(net.input_gpu, l.w*l.h, l.c, l.batch, 0, l.output_gpu);
flatten_gpu(net.input_gpu, l.w*l.h, l.c, l.batch, 0, l.output_gpu);
}else{
flatten_ongpu(net.input_gpu, l.w*l.h, l.c, l.batch, 1, l.output_gpu);
flatten_gpu(net.input_gpu, l.w*l.h, l.c, l.batch, 1, l.output_gpu);
}
} else if (l.extra) {
for(i = 0; i < l.batch; ++i){
copy_ongpu(l.inputs, net.input_gpu + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1);
copy_gpu(l.inputs, net.input_gpu + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1);
}
} else if (l.reverse) {
reorg_ongpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu);
reorg_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu);
}else {
reorg_ongpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu);
reorg_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu);
}
}
@ -155,19 +155,19 @@ void backward_reorg_layer_gpu(layer l, network net)
{
if(l.flatten){
if(l.reverse){
flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, net.delta_gpu);
flatten_gpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, net.delta_gpu);
}else{
flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, net.delta_gpu);
flatten_gpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, net.delta_gpu);
}
} else if (l.extra) {
int i;
for(i = 0; i < l.batch; ++i){
copy_ongpu(l.inputs, l.delta_gpu + i*l.outputs, 1, net.delta_gpu + i*l.inputs, 1);
copy_gpu(l.inputs, l.delta_gpu + i*l.outputs, 1, net.delta_gpu + i*l.inputs, 1);
}
} else if(l.reverse){
reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta_gpu);
reorg_gpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, net.delta_gpu);
} else {
reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta_gpu);
reorg_gpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, net.delta_gpu);
}
}
#endif

View File

@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps)
#endif
}
layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log, int adam)
layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam)
{
fprintf(stderr, "RNN Layer: %d inputs, %d outputs\n", inputs, outputs);
batch = batch / steps;
@ -34,24 +34,24 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
l.batch = batch;
l.type = RNN;
l.steps = steps;
l.hidden = hidden;
l.inputs = inputs;
l.state = calloc(batch*hidden*(steps+1), sizeof(float));
l.state = calloc(batch*outputs, sizeof(float));
l.prev_state = calloc(batch*outputs, sizeof(float));
l.input_layer = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize, adam);
*(l.input_layer) = make_connected_layer(batch*steps, inputs, outputs, activation, batch_normalize, adam);
l.input_layer->batch = batch;
l.self_layer = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize, adam);
*(l.self_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam);
l.self_layer->batch = batch;
l.output_layer = malloc(sizeof(layer));
fprintf(stderr, "\t\t");
*(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize, adam);
*(l.output_layer) = make_connected_layer(batch*steps, outputs, outputs, activation, batch_normalize, adam);
l.output_layer->batch = batch;
l.outputs = outputs;
@ -65,9 +65,15 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
l.forward_gpu = forward_rnn_layer_gpu;
l.backward_gpu = backward_rnn_layer_gpu;
l.update_gpu = update_rnn_layer_gpu;
l.state_gpu = cuda_make_array(l.state, batch*hidden*(steps+1));
l.state_gpu = cuda_make_array(0, batch*outputs);
l.prev_state_gpu = cuda_make_array(0, batch*outputs);
l.output_gpu = l.output_layer->output_gpu;
l.delta_gpu = l.output_layer->delta_gpu;
#ifdef CUDNN
cudnnSetTensor4dDescriptor(l.input_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.input_layer->out_c, l.input_layer->out_h, l.input_layer->out_w);
cudnnSetTensor4dDescriptor(l.self_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.self_layer->out_c, l.self_layer->out_h, l.self_layer->out_w);
cudnnSetTensor4dDescriptor(l.output_layer->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.output_layer->out_c, l.output_layer->out_h, l.output_layer->out_w);
#endif
#endif
return l;
@ -90,9 +96,9 @@ void forward_rnn_layer(layer l, network net)
layer output_layer = *(l.output_layer);
fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
if(net.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);
fill_cpu(l.outputs * l.batch * l.steps, 0, self_layer.delta, 1);
fill_cpu(l.outputs * l.batch * l.steps, 0, input_layer.delta, 1);
if(net.train) fill_cpu(l.outputs * l.batch, 0, l.state, 1);
for (i = 0; i < l.steps; ++i) {
s.input = net.input;
@ -102,14 +108,14 @@ void forward_rnn_layer(layer l, network net)
forward_connected_layer(self_layer, s);
float *old_state = l.state;
if(net.train) l.state += l.hidden*l.batch;
if(net.train) l.state += l.outputs*l.batch;
if(l.shortcut){
copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);
copy_cpu(l.outputs * l.batch, old_state, 1, l.state, 1);
}else{
fill_cpu(l.hidden * l.batch, 0, l.state, 1);
fill_cpu(l.outputs * l.batch, 0, l.state, 1);
}
axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1);
axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
axpy_cpu(l.outputs * l.batch, 1, input_layer.output, 1, l.state, 1);
axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1);
s.input = l.state;
forward_connected_layer(output_layer, s);
@ -134,32 +140,32 @@ void backward_rnn_layer(layer l, network net)
increment_layer(&self_layer, l.steps-1);
increment_layer(&output_layer, l.steps-1);
l.state += l.hidden*l.batch*l.steps;
l.state += l.outputs*l.batch*l.steps;
for (i = l.steps-1; i >= 0; --i) {
copy_cpu(l.hidden * l.batch, input_layer.output, 1, l.state, 1);
axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);
copy_cpu(l.outputs * l.batch, input_layer.output, 1, l.state, 1);
axpy_cpu(l.outputs * l.batch, 1, self_layer.output, 1, l.state, 1);
s.input = l.state;
s.delta = self_layer.delta;
backward_connected_layer(output_layer, s);
l.state -= l.hidden*l.batch;
l.state -= l.outputs*l.batch;
/*
if(i > 0){
copy_cpu(l.hidden * l.batch, input_layer.output - l.hidden*l.batch, 1, l.state, 1);
axpy_cpu(l.hidden * l.batch, 1, self_layer.output - l.hidden*l.batch, 1, l.state, 1);
copy_cpu(l.outputs * l.batch, input_layer.output - l.outputs*l.batch, 1, l.state, 1);
axpy_cpu(l.outputs * l.batch, 1, self_layer.output - l.outputs*l.batch, 1, l.state, 1);
}else{
fill_cpu(l.hidden * l.batch, 0, l.state, 1);
fill_cpu(l.outputs * l.batch, 0, l.state, 1);
}
*/
s.input = l.state;
s.delta = self_layer.delta - l.hidden*l.batch;
s.delta = self_layer.delta - l.outputs*l.batch;
if (i == 0) s.delta = 0;
backward_connected_layer(self_layer, s);
copy_cpu(l.hidden*l.batch, self_layer.delta, 1, input_layer.delta, 1);
if (i > 0 && l.shortcut) axpy_cpu(l.hidden*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.hidden*l.batch, 1);
copy_cpu(l.outputs*l.batch, self_layer.delta, 1, input_layer.delta, 1);
if (i > 0 && l.shortcut) axpy_cpu(l.outputs*l.batch, 1, self_layer.delta, 1, self_layer.delta - l.outputs*l.batch, 1);
s.input = net.input + i*l.inputs*l.batch;
if(net.delta) s.delta = net.delta + i*l.inputs*l.batch;
else s.delta = 0;
@ -196,17 +202,21 @@ void update_rnn_layer_gpu(layer l, update_args a)
void forward_rnn_layer_gpu(layer l, network net)
{
network s = net;
network s = {0};
s.train = net.train;
int i;
layer input_layer = *(l.input_layer);
layer self_layer = *(l.self_layer);
layer output_layer = *(l.output_layer);
fill_ongpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
fill_ongpu(l.hidden * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
fill_ongpu(l.hidden * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
if(net.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, output_layer.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, self_layer.delta_gpu, 1);
fill_gpu(l.outputs * l.batch * l.steps, 0, input_layer.delta_gpu, 1);
if(net.train) {
fill_gpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
}
for (i = 0; i < l.steps; ++i) {
s.input_gpu = net.input_gpu;
@ -215,15 +225,9 @@ void forward_rnn_layer_gpu(layer l, network net)
s.input_gpu = l.state_gpu;
forward_connected_layer_gpu(self_layer, s);
float *old_state = l.state_gpu;
if(net.train) l.state_gpu += l.hidden*l.batch;
if(l.shortcut){
copy_ongpu(l.hidden * l.batch, old_state, 1, l.state_gpu, 1);
}else{
fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1);
}
axpy_ongpu(l.hidden * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_ongpu(l.hidden * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
s.input_gpu = l.state_gpu;
forward_connected_layer_gpu(output_layer, s);
@ -237,7 +241,7 @@ void forward_rnn_layer_gpu(layer l, network net)
void backward_rnn_layer_gpu(layer l, network net)
{
network s = net;
network s = {0};
s.train = net.train;
int i;
layer input_layer = *(l.input_layer);
@ -246,24 +250,32 @@ void backward_rnn_layer_gpu(layer l, network net)
increment_layer(&input_layer, l.steps - 1);
increment_layer(&self_layer, l.steps - 1);
increment_layer(&output_layer, l.steps - 1);
l.state_gpu += l.hidden*l.batch*l.steps;
float *last_input = input_layer.output_gpu;
float *last_self = self_layer.output_gpu;
for (i = l.steps-1; i >= 0; --i) {
fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu, 1, l.state_gpu, 1);
axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu, 1, l.state_gpu, 1);
s.input_gpu = l.state_gpu;
s.delta_gpu = self_layer.delta_gpu;
backward_connected_layer_gpu(output_layer, s);
l.state_gpu -= l.hidden*l.batch;
if(i != 0) {
fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
axpy_gpu(l.outputs * l.batch, 1, input_layer.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
axpy_gpu(l.outputs * l.batch, 1, self_layer.output_gpu - l.outputs*l.batch, 1, l.state_gpu, 1);
}else {
copy_gpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.state_gpu, 1);
}
copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
copy_gpu(l.outputs*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
s.input_gpu = l.state_gpu;
s.delta_gpu = self_layer.delta_gpu - l.hidden*l.batch;
s.delta_gpu = (i > 0) ? self_layer.delta_gpu - l.outputs*l.batch : 0;
if (i == 0) s.delta_gpu = 0;
backward_connected_layer_gpu(self_layer, s);
//copy_ongpu(l.hidden*l.batch, self_layer.delta_gpu, 1, input_layer.delta_gpu, 1);
if (i > 0 && l.shortcut) axpy_ongpu(l.hidden*l.batch, 1, self_layer.delta_gpu, 1, self_layer.delta_gpu - l.hidden*l.batch, 1);
s.input_gpu = net.input_gpu + i*l.inputs*l.batch;
if(net.delta_gpu) s.delta_gpu = net.delta_gpu + i*l.inputs*l.batch;
else s.delta_gpu = 0;
@ -273,5 +285,8 @@ void backward_rnn_layer_gpu(layer l, network net)
increment_layer(&self_layer, -1);
increment_layer(&output_layer, -1);
}
fill_gpu(l.outputs * l.batch, 0, l.state_gpu, 1);
axpy_gpu(l.outputs * l.batch, 1, last_input, 1, l.state_gpu, 1);
axpy_gpu(l.outputs * l.batch, 1, last_self, 1, l.state_gpu, 1);
}
#endif

View File

@ -7,7 +7,7 @@
#include "network.h"
#define USET
layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log, int adam);
layer make_rnn_layer(int batch, int inputs, int outputs, int steps, ACTIVATION activation, int batch_normalize, int adam);
void forward_rnn_layer(layer l, network net);
void backward_rnn_layer(layer l, network net);

View File

@ -111,7 +111,7 @@ void forward_route_layer_gpu(const route_layer l, network net)
float *input = net.layers[index].output_gpu;
int input_size = l.input_sizes[i];
for(j = 0; j < l.batch; ++j){
copy_ongpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1);
copy_gpu(input_size, input + j*input_size, 1, l.output_gpu + offset + j*l.outputs, 1);
}
offset += input_size;
}
@ -126,7 +126,7 @@ void backward_route_layer_gpu(const route_layer l, network net)
float *delta = net.layers[index].delta_gpu;
int input_size = l.input_sizes[i];
for(j = 0; j < l.batch; ++j){
axpy_ongpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
axpy_gpu(input_size, 1, l.delta_gpu + offset + j*l.outputs, 1, delta + j*input_size, 1);
}
offset += input_size;
}

View File

@ -55,15 +55,15 @@ void backward_shortcut_layer(const layer l, network net)
#ifdef GPU
void forward_shortcut_layer_gpu(const layer l, network net)
{
copy_ongpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1);
shortcut_gpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu);
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
void backward_shortcut_layer_gpu(const layer l, network net)
{
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, net.delta_gpu, 1);
gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
axpy_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1, net.delta_gpu, 1);
shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, net.layers[l.index].delta_gpu);
}
#endif

View File

@ -81,7 +81,7 @@ void forward_softmax_layer_gpu(const softmax_layer l, network net)
void backward_softmax_layer_gpu(const softmax_layer layer, network net)
{
axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta_gpu, 1);
axpy_gpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta_gpu, 1);
}
#endif