diff --git a/Makefile b/Makefile index a0393b53..2f724f55 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ -GPU=0 -CUDNN=0 -OPENCV=0 +GPU=1 +CUDNN=1 +OPENCV=1 DEBUG=0 ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \ diff --git a/cfg/gru.cfg b/cfg/gru.cfg index 76eaf0c6..ea0e22d9 100644 --- a/cfg/gru.cfg +++ b/cfg/gru.cfg @@ -22,17 +22,6 @@ max_batches=10000 [gru] batch_normalize=1 output = 1024 -tanh = 1 - -[gru] -batch_normalize=1 -output = 1024 -tanh = 1 - -[gru] -batch_normalize=1 -output = 1024 -tanh = 1 [connected] output=256 diff --git a/examples/captcha.c b/examples/captcha.c index 1de21fa3..fd4cb5a5 100644 --- a/examples/captcha.c +++ b/examples/captcha.c @@ -83,7 +83,7 @@ void train_captcha(char *cfgfile, char *weightfile) float loss = train_network(net, train); if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; - printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), *net.seen); + printf("%d: %f, %f avg, %lf seconds, %ld images\n", i, loss, avg_loss, sec(clock()-time), *net.seen); free_data(train); if(i%100==0){ char buff[256]; diff --git a/examples/cifar.c b/examples/cifar.c index a05d0997..fd01e601 100644 --- a/examples/cifar.c +++ b/examples/cifar.c @@ -25,7 +25,7 @@ void train_cifar(char *cfgfile, char *weightfile) float loss = train_network_sgd(net, train, 1); if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.95 + loss*.05; - printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); + printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); if(*net.seen/N > epoch){ epoch = *net.seen/N; char buff[256]; @@ -81,7 +81,7 @@ void train_cifar_distill(char *cfgfile, char *weightfile) float loss = train_network_sgd(net, train, 1); if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.95 + loss*.05; - printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); + printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); if(*net.seen/N > epoch){ epoch = *net.seen/N; char buff[256]; diff --git a/examples/classifier.c b/examples/classifier.c index 645c80d4..0d28a58b 100644 --- a/examples/classifier.c +++ b/examples/classifier.c @@ -105,7 +105,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, #endif if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; - printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); + printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); free_data(train); if(*net.seen/N > epoch){ epoch = *net.seen/N; diff --git a/examples/detector.c b/examples/detector.c index e6b57b8c..4bec0507 100644 --- a/examples/detector.c +++ b/examples/detector.c @@ -128,7 +128,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i avg_loss = avg_loss*.9 + loss*.1; i = get_current_batch(net); - 
printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs); + printf("%ld: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs); if(i%1000==0){ #ifdef GPU if(ngpus != 1) sync_nets(nets, ngpus, 0); diff --git a/examples/dice.c b/examples/dice.c index 6a810492..f56d76c0 100644 --- a/examples/dice.c +++ b/examples/dice.c @@ -31,7 +31,7 @@ void train_dice(char *cfgfile, char *weightfile) float loss = train_network(net, train); if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; - printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), *net.seen); + printf("%d: %f, %f avg, %lf seconds, %ld images\n", i, loss, avg_loss, sec(clock()-time), *net.seen); free_data(train); if((i % 100) == 0) net.learning_rate *= .1; if(i%100==0){ diff --git a/examples/go.c b/examples/go.c index f4513574..7f4a250d 100644 --- a/examples/go.c +++ b/examples/go.c @@ -169,7 +169,7 @@ void train_go(char *cfgfile, char *weightfile, char *filename, int *gpus, int ng if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.95 + loss*.05; - printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); + printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); if(*net.seen/N > epoch){ epoch = *net.seen/N; char buff[256]; @@ -184,7 +184,7 @@ void train_go(char *cfgfile, char *weightfile, char *filename, int *gpus, int ng } if(get_current_batch(net)%10000 == 0){ char buff[256]; - sprintf(buff, "%s/%s_%d.backup",backup_directory,base,get_current_batch(net)); + sprintf(buff, "%s/%s_%ld.backup",backup_directory,base,get_current_batch(net)); save_weights(net, buff); } } diff --git a/examples/regressor.c b/examples/regressor.c index 6dd2b4a2..b246d48b 100644 --- a/examples/regressor.c +++ b/examples/regressor.c @@ -91,7 +91,7 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus, #endif if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; - printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); + printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); free_data(train); if(*net.seen/N > epoch){ epoch = *net.seen/N; diff --git a/examples/rnn.c b/examples/rnn.c index 197e270d..1c45711d 100644 --- a/examples/rnn.c +++ b/examples/rnn.c @@ -182,7 +182,7 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear, if (avg_loss < 0) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; - int chars = get_current_batch(net)*batch; + size_t chars = get_current_batch(net)*batch; fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds, %f epochs\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), (float) chars/size); for(j = 0; j < streams; ++j){ @@ -194,12 +194,12 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear, } } - if(i%1000==0){ + if(i%10000==0){ char buff[256]; sprintf(buff, "%s/%s_%d.weights", backup_directory, base, 
i); save_weights(net, buff); } - if(i%10==0){ + if(i%100==0){ char buff[256]; sprintf(buff, "%s/%s.backup", backup_directory, base); save_weights(net, buff); @@ -409,7 +409,7 @@ void valid_char_rnn(char *cfgfile, char *weightfile, char *seed) input[c] = 0; sum += log(out[next])/log2; c = next; - printf("%d Perplexity: %4.4f Word Perplexity: %4.4f\n", count, pow(2, -sum/count), pow(2, -sum/words)); + printf("%d BPC: %4.4f Perplexity: %4.4f Word Perplexity: %4.4f\n", count, -sum/count, pow(2, -sum/count), pow(2, -sum/words)); } } diff --git a/examples/segmenter.c b/examples/segmenter.c index fe3d498e..325593ed 100644 --- a/examples/segmenter.c +++ b/examples/segmenter.c @@ -100,7 +100,7 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus, #endif if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; - printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); + printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); free_data(train); if(*net.seen/N > epoch){ epoch = *net.seen/N; diff --git a/examples/tag.c b/examples/tag.c index f10cea5f..51bbb95e 100644 --- a/examples/tag.c +++ b/examples/tag.c @@ -58,7 +58,7 @@ void train_tag(char *cfgfile, char *weightfile, int clear) float loss = train_network(net, train); if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; - printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); + printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); free_data(train); if(*net.seen/N > epoch){ epoch = *net.seen/N; diff --git a/examples/writing.c b/examples/writing.c index 5d30ea90..1b6ff83b 100644 --- a/examples/writing.c +++ b/examples/writing.c @@ -63,11 +63,11 @@ void train_writing(char *cfgfile, char *weightfile) if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; - printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); + printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); free_data(train); if(get_current_batch(net)%100 == 0){ char buff[256]; - sprintf(buff, "%s/%s_batch_%d.weights", backup_directory, base, get_current_batch(net)); + sprintf(buff, "%s/%s_batch_%ld.weights", backup_directory, base, get_current_batch(net)); save_weights(net, buff); } if(*net.seen/N > epoch){ diff --git a/include/darknet.h b/include/darknet.h index 269f216e..8d4366b6 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -87,6 +87,18 @@ typedef enum{ SSE, MASKED, L1, SMOOTH } COST_TYPE; +typedef struct{ + int batch; + float learning_rate; + float momentum; + float decay; + int adam; + float B1; + float B2; + float eps; + int t; +} update_args; + struct network; typedef struct network network; @@ -99,10 +111,10 @@ struct layer{ COST_TYPE cost_type; void (*forward) (struct layer, struct network); void (*backward) (struct layer, struct network); - void 
(*update) (struct layer, int, float, float, float); + void (*update) (struct layer, update_args); void (*forward_gpu) (struct layer, struct network); void (*backward_gpu) (struct layer, struct network); - void (*update_gpu) (struct layer, int, float, float, float); + void (*update_gpu) (struct layer, update_args); int batch_normalize; int shortcut; int batch; @@ -156,12 +168,6 @@ struct layer{ int log; int tanh; - int adam; - float B1; - float B2; - float eps; - int t; - float alpha; float beta; float kappa; @@ -395,16 +401,17 @@ typedef enum { typedef struct network{ int n; int batch; - int *seen; + size_t *seen; + int *t; float epoch; int subdivisions; - float momentum; - float decay; layer *layers; float *output; learning_rate_policy policy; float learning_rate; + float momentum; + float decay; float gamma; float scale; float power; @@ -648,7 +655,7 @@ void draw_box_width(image a, int x1, int y1, int x2, int y2, int w, float r, flo float get_current_rate(network net); void composite_3d(char *f1, char *f2, char *out, int delta); data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h); -int get_current_batch(network net); +size_t get_current_batch(network net); void constrain_image(image im); image get_network_image_layer(network net, int i); layer get_network_output_layer(network net); diff --git a/src/blas.h b/src/blas.h index 7b0218f6..6291746f 100644 --- a/src/blas.h +++ b/src/blas.h @@ -80,6 +80,7 @@ void mult_add_into_gpu(int num, float *a, float *b, float *c); void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out); void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output); +void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t); void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t); void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out); diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index 2ce60153..6fdfd3fa 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -74,6 +74,19 @@ void add_bias_gpu(float *output, float *biases, int batch, int n, int size) check_error(cudaPeekAtLastError()); } +__global__ void backward_bias_conn_kernel(float *bias_updates, float *delta, int batch, int n) +{ + int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if (index >= n) return; + int b; + float sum = 0; + for(b = 0; b < batch; ++b){ + int i = b*n + index; + sum += delta[i]; + } + bias_updates[index] += sum; +} + __global__ void backward_bias_kernel(float *bias_updates, float *delta, int batch, int n, int size) { __shared__ float part[BLOCK]; @@ -94,6 +107,16 @@ __global__ void backward_bias_kernel(float *bias_updates, float *delta, int batc } } +void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size) +{ + if(size == 1){ + backward_bias_conn_kernel<<>>(bias_updates, delta, batch, n); + }else{ + backward_bias_kernel<<>>(bias_updates, delta, batch, n, size); + } + check_error(cudaPeekAtLastError()); +} + /* __global__ void dot_kernel(float *output, float scale, int batch, int n, int size, float *delta) { @@ -136,12 +159,6 @@ void dot_error_gpu(layer l) } */ -void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size) -{ - backward_bias_kernel<<>>(bias_updates, delta, batch, n, size); - 
check_error(cudaPeekAtLastError()); -} - __global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t) { @@ -149,7 +166,6 @@ __global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float if (index >= N) return; x[index] = x[index] + (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps)); - //if(index == 0) printf("%f %f %f %f\n", m[index], v[index], (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps))); } extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t) @@ -158,6 +174,20 @@ extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2 check_error(cudaPeekAtLastError()); } +extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t) +{ + scal_ongpu(n, B1, m, 1); + scal_ongpu(n, B2, v, 1); + axpy_ongpu(n, -decay*batch, w, 1, d, 1); + + axpy_ongpu(n, (1-B1), d, 1, m, 1); + mul_ongpu(n, d, 1, d, 1); + axpy_ongpu(n, (1-B2), d, 1, v, 1); + + adam_gpu(n, w, m, v, B1, B2, rate/batch, eps, t); + fill_ongpu(n, 0, d, 1); +} + __global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial) { int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; diff --git a/src/compare.c b/src/compare.c index 4fd266c7..d2d2b3bd 100644 --- a/src/compare.c +++ b/src/compare.c @@ -54,7 +54,7 @@ void train_compare(char *cfgfile, char *weightfile) float loss = train_network(net, train); if(avg_loss == -1) avg_loss = loss; avg_loss = avg_loss*.9 + loss*.1; - printf("%.3f: %f, %f avg, %lf seconds, %d images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen); + printf("%.3f: %f, %f avg, %lf seconds, %ld images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen); free_data(train); if(i%100 == 0){ char buff[256]; diff --git a/src/connected_layer.c b/src/connected_layer.c index 38c492cb..5037e748 100644 --- a/src/connected_layer.c +++ b/src/connected_layer.c @@ -11,10 +11,11 @@ #include #include -connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize) +layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam) { int i; - connected_layer l = {0}; + layer l = {0}; + l.learning_rate_scale = 1; l.type = CONNECTED; l.inputs = inputs; @@ -51,6 +52,14 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT l.biases[i] = 0; } + if(adam){ + l.m = calloc(l.inputs*l.outputs, sizeof(float)); + l.v = calloc(l.inputs*l.outputs, sizeof(float)); + l.bias_m = calloc(l.outputs, sizeof(float)); + l.scale_m = calloc(l.outputs, sizeof(float)); + l.bias_v = calloc(l.outputs, sizeof(float)); + l.scale_v = calloc(l.outputs, sizeof(float)); + } if(batch_normalize){ l.scales = calloc(outputs, sizeof(float)); l.scale_updates = calloc(outputs, sizeof(float)); @@ -83,6 +92,15 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT l.output_gpu = cuda_make_array(l.output, outputs*batch); l.delta_gpu = cuda_make_array(l.delta, outputs*batch); + if (adam) { + l.m_gpu = cuda_make_array(0, inputs*outputs); + l.v_gpu = cuda_make_array(0, inputs*outputs); + l.bias_m_gpu = cuda_make_array(0, outputs); + l.bias_v_gpu = cuda_make_array(0, outputs); + l.scale_m_gpu = cuda_make_array(0, 
outputs); + l.scale_v_gpu = cuda_make_array(0, outputs); + } + if(batch_normalize){ l.mean_gpu = cuda_make_array(l.mean, outputs); l.variance_gpu = cuda_make_array(l.variance, outputs); @@ -111,8 +129,12 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT return l; } -void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay) +void update_connected_layer(layer l, update_args a) { + float learning_rate = a.learning_rate*l.learning_rate_scale; + float momentum = a.momentum; + float decay = a.decay; + int batch = a.batch; axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1); scal_cpu(l.outputs, momentum, l.bias_updates, 1); @@ -126,7 +148,7 @@ void update_connected_layer(connected_layer l, int batch, float learning_rate, f scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1); } -void forward_connected_layer(connected_layer l, network net) +void forward_connected_layer(layer l, network net) { fill_cpu(l.outputs*l.batch, 0, l.output, 1); int m = l.batch; @@ -144,7 +166,7 @@ void forward_connected_layer(connected_layer l, network net) activate_array(l.output, l.outputs*l.batch, l.activation); } -void backward_connected_layer(connected_layer l, network net) +void backward_connected_layer(layer l, network net) { gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); @@ -210,7 +232,7 @@ void statistics_connected_layer(layer l) #ifdef GPU -void pull_connected_layer(connected_layer l) +void pull_connected_layer(layer l) { cuda_pull_array(l.weights_gpu, l.weights, l.inputs*l.outputs); cuda_pull_array(l.biases_gpu, l.biases, l.outputs); @@ -223,7 +245,7 @@ void pull_connected_layer(connected_layer l) } } -void push_connected_layer(connected_layer l) +void push_connected_layer(layer l) { cuda_push_array(l.weights_gpu, l.weights, l.inputs*l.outputs); cuda_push_array(l.biases_gpu, l.biases, l.outputs); @@ -236,22 +258,34 @@ void push_connected_layer(connected_layer l) } } -void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate, float momentum, float decay) +void update_connected_layer_gpu(layer l, update_args a) { - axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); - scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1); + float learning_rate = a.learning_rate*l.learning_rate_scale; + float momentum = a.momentum; + float decay = a.decay; + int batch = a.batch; + if(a.adam){ + adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.inputs*l.outputs, batch, a.t); + adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t); + if(l.scales_gpu){ + adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t); + } + }else{ + axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); + scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1); - if(l.batch_normalize){ - axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); - scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1); + if(l.batch_normalize){ + axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); + scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1); + } + + axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + 
axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1); } - - axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1); } -void forward_connected_layer_gpu(connected_layer l, network net) +void forward_connected_layer_gpu(layer l, network net) { fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); @@ -271,9 +305,9 @@ void forward_connected_layer_gpu(connected_layer l, network net) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); } -void backward_connected_layer_gpu(connected_layer l, network net) +void backward_connected_layer_gpu(layer l, network net) { - constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); + constrain_ongpu(l.outputs*l.batch, 5, l.delta_gpu, 1); gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); if(l.batch_normalize){ backward_batchnorm_layer_gpu(l, net); diff --git a/src/connected_layer.h b/src/connected_layer.h index 067b2566..6727a964 100644 --- a/src/connected_layer.h +++ b/src/connected_layer.h @@ -5,20 +5,18 @@ #include "layer.h" #include "network.h" -typedef layer connected_layer; +layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam); -connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize); - -void forward_connected_layer(connected_layer layer, network net); -void backward_connected_layer(connected_layer layer, network net); -void update_connected_layer(connected_layer layer, int batch, float learning_rate, float momentum, float decay); +void forward_connected_layer(layer l, network net); +void backward_connected_layer(layer l, network net); +void update_connected_layer(layer l, update_args a); #ifdef GPU -void forward_connected_layer_gpu(connected_layer layer, network net); -void backward_connected_layer_gpu(connected_layer layer, network net); -void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay); -void push_connected_layer(connected_layer layer); -void pull_connected_layer(connected_layer layer); +void forward_connected_layer_gpu(layer l, network net); +void backward_connected_layer_gpu(layer l, network net); +void update_connected_layer_gpu(layer l, update_args a); +void push_connected_layer(layer l); +void pull_connected_layer(layer l); #endif #endif diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index b53dd16a..cc002182 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -263,10 +263,6 @@ void pull_convolutional_layer(convolutional_layer layer) cuda_pull_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n); cuda_pull_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n); } - if (layer.adam){ - cuda_pull_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size); - cuda_pull_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size); - } } void push_convolutional_layer(convolutional_layer layer) @@ -280,35 +276,22 @@ void push_convolutional_layer(convolutional_layer layer) cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n); cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n); } - if 
(layer.adam){ - cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size); - cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size); - } } -void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t) +void update_convolutional_layer_gpu(layer l, update_args a) { - scal_ongpu(n, B1, m, 1); - scal_ongpu(n, B2, v, 1); - axpy_ongpu(n, -decay*batch, w, 1, d, 1); + float learning_rate = a.learning_rate*l.learning_rate_scale; + float momentum = a.momentum; + float decay = a.decay; + int batch = a.batch; - axpy_ongpu(n, (1-B1), d, 1, m, 1); - mul_ongpu(n, d, 1, d, 1); - axpy_ongpu(n, (1-B2), d, 1, v, 1); - - adam_gpu(n, w, m, v, B1, B2, rate/batch, eps, t); - fill_ongpu(n, 0, d, 1); -} - -void update_convolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) -{ int size = l.size*l.size*l.c*l.n; - if(l.adam){ - adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch, l.t); - adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); + if(a.adam){ + adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, size, batch, a.t); + adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t); if(l.scales_gpu){ - adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); + adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t); } }else{ axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index e5b5bb6f..a12b7c23 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -234,7 +234,6 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.x_norm = calloc(l.batch*l.outputs, sizeof(float)); } if(adam){ - l.adam = 1; l.m = calloc(c*n*size*size, sizeof(float)); l.v = calloc(c*n*size*size, sizeof(float)); l.bias_m = calloc(n, sizeof(float)); @@ -507,8 +506,13 @@ void backward_convolutional_layer(convolutional_layer l, network net) } } -void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay) +void update_convolutional_layer(convolutional_layer l, update_args a) { + float learning_rate = a.learning_rate*l.learning_rate_scale; + float momentum = a.momentum; + float decay = a.decay; + int batch = a.batch; + int size = l.size*l.size*l.c*l.n; axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1); scal_cpu(l.n, momentum, l.bias_updates, 1); diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 91394c38..d42b7423 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -12,7 +12,7 @@ typedef layer convolutional_layer; #ifdef GPU void forward_convolutional_layer_gpu(convolutional_layer layer, network net); void backward_convolutional_layer_gpu(convolutional_layer layer, network net); -void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay); +void update_convolutional_layer_gpu(convolutional_layer layer, update_args a); void 
push_convolutional_layer(convolutional_layer layer); void pull_convolutional_layer(convolutional_layer layer); @@ -28,7 +28,7 @@ void cudnn_convolutional_setup(layer *l); convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); void forward_convolutional_layer(const convolutional_layer layer, network net); -void update_convolutional_layer(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay); +void update_convolutional_layer(convolutional_layer layer, update_args a); image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_weights); void binarize_weights(float *weights, int n, int size, float *binary); void swap_binary(convolutional_layer *l); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 2478fef5..2554cf9c 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -81,11 +81,11 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou return l; } -void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay) +void update_crnn_layer(layer l, update_args a) { - update_convolutional_layer(*(l.input_layer), batch, learning_rate, momentum, decay); - update_convolutional_layer(*(l.self_layer), batch, learning_rate, momentum, decay); - update_convolutional_layer(*(l.output_layer), batch, learning_rate, momentum, decay); + update_convolutional_layer(*(l.input_layer), a); + update_convolutional_layer(*(l.self_layer), a); + update_convolutional_layer(*(l.output_layer), a); } void forward_crnn_layer(layer l, network net) @@ -194,11 +194,11 @@ void push_crnn_layer(layer l) push_convolutional_layer(*(l.output_layer)); } -void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_crnn_layer_gpu(layer l, update_args a) { - update_convolutional_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay); - update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay); + update_convolutional_layer_gpu(*(l.input_layer), a); + update_convolutional_layer_gpu(*(l.self_layer), a); + update_convolutional_layer_gpu(*(l.output_layer), a); } void forward_crnn_layer_gpu(layer l, network net) diff --git a/src/crnn_layer.h b/src/crnn_layer.h index ce89211b..515f3783 100644 --- a/src/crnn_layer.h +++ b/src/crnn_layer.h @@ -10,12 +10,12 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou void forward_crnn_layer(layer l, network net); void backward_crnn_layer(layer l, network net); -void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay); +void update_crnn_layer(layer l, update_args a); #ifdef GPU void forward_crnn_layer_gpu(layer l, network net); void backward_crnn_layer_gpu(layer l, network net); -void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_crnn_layer_gpu(layer l, update_args a); void push_crnn_layer(layer l); void pull_crnn_layer(layer l); #endif diff --git a/src/deconvolutional_kernels.cu b/src/deconvolutional_kernels.cu index 16694634..25026375 100644 --- a/src/deconvolutional_kernels.cu +++ b/src/deconvolutional_kernels.cu @@ -109,15 +109,20 @@ extern "C" void 
push_deconvolutional_layer(layer l) } } -void update_deconvolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_deconvolutional_layer_gpu(layer l, update_args a) { + float learning_rate = a.learning_rate*l.learning_rate_scale; + float momentum = a.momentum; + float decay = a.decay; + int batch = a.batch; + int size = l.size*l.size*l.c*l.n; - if(l.adam){ - adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch, l.t); - adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); + if(a.adam){ + adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, size, batch, a.t); + adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t); if(l.scales_gpu){ - adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); + adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t); } }else{ axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); diff --git a/src/deconvolutional_layer.c b/src/deconvolutional_layer.c index 3778b418..0959d738 100644 --- a/src/deconvolutional_layer.c +++ b/src/deconvolutional_layer.c @@ -79,7 +79,6 @@ layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size l.x_norm = calloc(l.batch*l.outputs, sizeof(float)); } if(adam){ - l.adam = 1; l.m = calloc(c*n*size*size, sizeof(float)); l.v = calloc(c*n*size*size, sizeof(float)); l.bias_m = calloc(n, sizeof(float)); @@ -252,8 +251,13 @@ void backward_deconvolutional_layer(layer l, network net) } } -void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay) +void update_deconvolutional_layer(layer l, update_args a) { + float learning_rate = a.learning_rate*l.learning_rate_scale; + float momentum = a.momentum; + float decay = a.decay; + int batch = a.batch; + int size = l.size*l.size*l.c*l.n; axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1); scal_cpu(l.n, momentum, l.bias_updates, 1); diff --git a/src/deconvolutional_layer.h b/src/deconvolutional_layer.h index 42ccbc14..b254fb91 100644 --- a/src/deconvolutional_layer.h +++ b/src/deconvolutional_layer.h @@ -10,7 +10,7 @@ #ifdef GPU void forward_deconvolutional_layer_gpu(layer l, network net); void backward_deconvolutional_layer_gpu(layer l, network net); -void update_deconvolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_deconvolutional_layer_gpu(layer l, update_args a); void push_deconvolutional_layer(layer l); void pull_deconvolutional_layer(layer l); #endif @@ -18,7 +18,7 @@ void pull_deconvolutional_layer(layer l); layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam); void resize_deconvolutional_layer(layer *l, int h, int w); void forward_deconvolutional_layer(const layer l, network net); -void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay); +void update_deconvolutional_layer(layer l, update_args a); void backward_deconvolutional_layer(layer l, network net); #endif diff --git 
a/src/gru_layer.c b/src/gru_layer.c index 75497ae4..30ec641e 100644 --- a/src/gru_layer.c +++ b/src/gru_layer.c @@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps) #endif } -layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize) +layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam) { fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs); batch = batch / steps; @@ -38,34 +38,34 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no l.uz = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.uz->batch = batch; l.wz = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wz->batch = batch; l.ur = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.ur->batch = batch; l.wr = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wr->batch = batch; l.uh = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.uh->batch = batch; l.wh = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wh->batch = batch; l.batch_normalize = batch_normalize; @@ -115,11 +115,14 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no return l; } -void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay) +void update_gru_layer(layer l, update_args a) { - update_connected_layer(*(l.input_layer), batch, learning_rate, momentum, decay); - update_connected_layer(*(l.self_layer), batch, learning_rate, momentum, decay); - update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay); + update_connected_layer(*(l.ur), a); + update_connected_layer(*(l.uz), a); + update_connected_layer(*(l.uh), a); + update_connected_layer(*(l.wr), a); + update_connected_layer(*(l.wz), a); + update_connected_layer(*(l.wh), a); } void forward_gru_layer(layer l, network net) @@ -212,14 +215,14 @@ void push_gru_layer(layer l) { } -void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_gru_layer_gpu(layer l, update_args a) { - update_connected_layer_gpu(*(l.ur), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.uz), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.uh), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.wr), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.wz), batch, learning_rate, momentum, decay); 
- update_connected_layer_gpu(*(l.wh), batch, learning_rate, momentum, decay); + update_connected_layer_gpu(*(l.ur), a); + update_connected_layer_gpu(*(l.uz), a); + update_connected_layer_gpu(*(l.uh), a); + update_connected_layer_gpu(*(l.wr), a); + update_connected_layer_gpu(*(l.wz), a); + update_connected_layer_gpu(*(l.wh), a); } void forward_gru_layer_gpu(layer l, network net) diff --git a/src/gru_layer.h b/src/gru_layer.h index a0e57171..9067942e 100644 --- a/src/gru_layer.h +++ b/src/gru_layer.h @@ -6,16 +6,16 @@ #include "layer.h" #include "network.h" -layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize); +layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam); void forward_gru_layer(layer l, network state); void backward_gru_layer(layer l, network state); -void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay); +void update_gru_layer(layer l, update_args a); #ifdef GPU void forward_gru_layer_gpu(layer l, network state); void backward_gru_layer_gpu(layer l, network state); -void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_gru_layer_gpu(layer l, update_args a); void push_gru_layer(layer l); void pull_gru_layer(layer l); #endif diff --git a/src/local_layer.c b/src/local_layer.c index aad036e0..170ba9b6 100644 --- a/src/local_layer.c +++ b/src/local_layer.c @@ -164,8 +164,13 @@ void backward_local_layer(local_layer l, network net) } } -void update_local_layer(local_layer l, int batch, float learning_rate, float momentum, float decay) +void update_local_layer(local_layer l, update_args a) { + float learning_rate = a.learning_rate*l.learning_rate_scale; + float momentum = a.momentum; + float decay = a.decay; + int batch = a.batch; + int locations = l.out_w*l.out_h; int size = l.size*l.size*l.c*l.n*locations; axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1); @@ -253,8 +258,13 @@ void backward_local_layer_gpu(local_layer l, network net) } } -void update_local_layer_gpu(local_layer l, int batch, float learning_rate, float momentum, float decay) +void update_local_layer_gpu(local_layer l, update_args a) { + float learning_rate = a.learning_rate*l.learning_rate_scale; + float momentum = a.momentum; + float decay = a.decay; + int batch = a.batch; + int locations = l.out_w*l.out_h; int size = l.size*l.size*l.c*l.n*locations; axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); diff --git a/src/local_layer.h b/src/local_layer.h index 5e292f91..776e572f 100644 --- a/src/local_layer.h +++ b/src/local_layer.h @@ -12,7 +12,7 @@ typedef layer local_layer; #ifdef GPU void forward_local_layer_gpu(local_layer layer, network net); void backward_local_layer_gpu(local_layer layer, network net); -void update_local_layer_gpu(local_layer layer, int batch, float learning_rate, float momentum, float decay); +void update_local_layer_gpu(local_layer layer, update_args a); void push_local_layer(local_layer layer); void pull_local_layer(local_layer layer); @@ -22,7 +22,7 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in void forward_local_layer(const local_layer layer, network net); void backward_local_layer(local_layer layer, network net); -void update_local_layer(local_layer layer, int batch, float learning_rate, float momentum, float decay); +void update_local_layer(local_layer layer, update_args a); void bias_output(float *output, float *biases, int 
batch, int n, int size); void backward_bias(float *bias_updates, float *delta, int batch, int n, int size); diff --git a/src/lstm_layer.c b/src/lstm_layer.c index 74f6a136..a0cd99b6 100644 --- a/src/lstm_layer.c +++ b/src/lstm_layer.c @@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps) #endif } -layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize) +layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam) { fprintf(stderr, "LSTM Layer: %d inputs, %d outputs\n", inputs, outputs); batch = batch / steps; @@ -38,42 +38,42 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n l.uf = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.uf->batch = batch; l.wf = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wf->batch = batch; l.ui = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.ui) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.ui) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.ui->batch = batch; l.wi = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wi->batch = batch; l.ug = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.ug) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.ug) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.ug->batch = batch; l.wg = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wg->batch = batch; l.uo = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.uo) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.uo) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam); l.uo->batch = batch; l.wo = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wo) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wo) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam); l.wo->batch = batch; l.batch_normalize = batch_normalize; @@ -141,16 +141,16 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n return l; } -void update_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay) +void update_lstm_layer(layer l, update_args a) { - update_connected_layer(*(l.wf), batch, learning_rate, momentum, decay); - update_connected_layer(*(l.wi), batch, learning_rate, momentum, decay); - update_connected_layer(*(l.wg), batch, learning_rate, momentum, decay); - update_connected_layer(*(l.wo), batch, learning_rate, momentum, decay); - update_connected_layer(*(l.uf), batch, learning_rate, momentum, decay); - update_connected_layer(*(l.ui), batch, learning_rate, momentum, decay); - 
update_connected_layer(*(l.ug), batch, learning_rate, momentum, decay); - update_connected_layer(*(l.uo), batch, learning_rate, momentum, decay); + update_connected_layer(*(l.wf), a); + update_connected_layer(*(l.wi), a); + update_connected_layer(*(l.wg), a); + update_connected_layer(*(l.wo), a); + update_connected_layer(*(l.uf), a); + update_connected_layer(*(l.ui), a); + update_connected_layer(*(l.ug), a); + update_connected_layer(*(l.uo), a); } void forward_lstm_layer(layer l, network state) @@ -383,16 +383,16 @@ void backward_lstm_layer(layer l, network state) } #ifdef GPU -void update_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_lstm_layer_gpu(layer l, update_args a) { - update_connected_layer_gpu(*(l.wf), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.wi), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.wg), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.wo), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.uf), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.ui), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.ug), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.uo), batch, learning_rate, momentum, decay); + update_connected_layer_gpu(*(l.wf), a); + update_connected_layer_gpu(*(l.wi), a); + update_connected_layer_gpu(*(l.wg), a); + update_connected_layer_gpu(*(l.wo), a); + update_connected_layer_gpu(*(l.uf), a); + update_connected_layer_gpu(*(l.ui), a); + update_connected_layer_gpu(*(l.ug), a); + update_connected_layer_gpu(*(l.uo), a); } void forward_lstm_layer_gpu(layer l, network state) diff --git a/src/lstm_layer.h b/src/lstm_layer.h index 8ed387af..b9f07e64 100644 --- a/src/lstm_layer.h +++ b/src/lstm_layer.h @@ -6,15 +6,15 @@ #include "network.h" #define USET -layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize); +layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam); void forward_lstm_layer(layer l, network net); -void update_lstm_layer(layer l, int batch, float learning, float momentum, float decay); +void update_lstm_layer(layer l, update_args a); #ifdef GPU void forward_lstm_layer_gpu(layer l, network net); void backward_lstm_layer_gpu(layer l, network net); -void update_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_lstm_layer_gpu(layer l, update_args a); #endif #endif diff --git a/src/network.c b/src/network.c index c1e9579e..0d2773e8 100644 --- a/src/network.c +++ b/src/network.c @@ -65,9 +65,9 @@ network *load_network_p(char *cfg, char *weights, int clear) return net; } -int get_current_batch(network net) +size_t get_current_batch(network net) { - int batch_num = (*net.seen)/(net.batch*net.subdivisions); + size_t batch_num = (*net.seen)/(net.batch*net.subdivisions); return batch_num; } @@ -84,7 +84,7 @@ void reset_momentum(network net) float get_current_rate(network net) { - int batch_num = get_current_batch(net); + size_t batch_num = get_current_batch(net); int i; float rate; if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power); @@ -174,6 +174,7 @@ network make_network(int n) net.n = n; net.layers = calloc(net.n, sizeof(layer)); net.seen = calloc(1, sizeof(int)); + net.t = calloc(1, sizeof(int)); net.cost = calloc(1, sizeof(float)); return net; } @@ -199,12 
+200,22 @@ void forward_network(network net) void update_network(network net) { int i; - int update_batch = net.batch*net.subdivisions; - float rate = get_current_rate(net); + update_args a = {0}; + a.batch = net.batch*net.subdivisions; + a.learning_rate = get_current_rate(net); + a.momentum = net.momentum; + a.decay = net.decay; + a.adam = net.adam; + a.B1 = net.B1; + a.B2 = net.B2; + a.eps = net.eps; + ++*net.t; + a.t = *net.t; + for(i = 0; i < net.n; ++i){ layer l = net.layers[i]; if(l.update){ - l.update(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay); + l.update(l, a); } } } diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 28aa20ec..7c955309 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -81,13 +81,22 @@ void update_network_gpu(network net) { cuda_set_device(net.gpu_index); int i; - int update_batch = net.batch*net.subdivisions; - float rate = get_current_rate(net); + update_args a = {0}; + a.batch = net.batch*net.subdivisions; + a.learning_rate = get_current_rate(net); + a.momentum = net.momentum; + a.decay = net.decay; + a.adam = net.adam; + a.B1 = net.B1; + a.B2 = net.B2; + a.eps = net.eps; + ++*net.t; + a.t = (*net.t); + for(i = 0; i < net.n; ++i){ layer l = net.layers[i]; - l.t = get_current_batch(net); if(l.update_gpu){ - l.update_gpu(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay); + l.update_gpu(l, a); } } } diff --git a/src/parser.c b/src/parser.c index d5acc2e0..70390416 100644 --- a/src/parser.c +++ b/src/parser.c @@ -191,11 +191,6 @@ convolutional_layer parse_convolutional(list *options, size_params params) convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam); layer.flipped = option_find_int_quiet(options, "flipped", 0); layer.dot = option_find_float_quiet(options, "dot", 0); - if(params.net.adam){ - layer.B1 = params.net.B1; - layer.B2 = params.net.B2; - layer.eps = params.net.eps; - } return layer; } @@ -224,7 +219,7 @@ layer parse_rnn(list *options, size_params params) int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); int logistic = option_find_int_quiet(options, "logistic", 0); - layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize, logistic); + layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize, logistic, params.net.adam); l.shortcut = option_find_int_quiet(options, "shortcut", 0); @@ -236,7 +231,7 @@ layer parse_gru(list *options, size_params params) int output = option_find_int(options, "output",1); int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); - layer l = make_gru_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize); + layer l = make_gru_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize, params.net.adam); l.tanh = option_find_int_quiet(options, "tanh", 0); return l; @@ -247,21 +242,20 @@ layer parse_lstm(list *options, size_params params) int output = option_find_int(options, "output", 1); int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); - layer l = make_lstm_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize); + layer l = make_lstm_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize, params.net.adam); return l; } -connected_layer parse_connected(list *options, size_params 
params) +layer parse_connected(list *options, size_params params) { int output = option_find_int(options, "output",1); char *activation_s = option_find_str(options, "activation", "logistic"); ACTIVATION activation = get_activation(activation_s); int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); - connected_layer layer = make_connected_layer(params.batch, params.inputs, output, activation, batch_normalize); - - return layer; + layer l = make_connected_layer(params.batch, params.inputs, output, activation, batch_normalize, params.net.adam); + return l; } softmax_layer parse_softmax(list *options, size_params params) @@ -567,7 +561,7 @@ void parse_net_options(list *options, network *net) if(net->adam){ net->B1 = option_find_float(options, "B1", .9); net->B2 = option_find_float(options, "B2", .999); - net->eps = option_find_float(options, "eps", .00000001); + net->eps = option_find_float(options, "eps", .0000001); } net->h = option_find_int_quiet(options, "height",0); @@ -855,10 +849,6 @@ void save_convolutional_weights(layer l, FILE *fp) fwrite(l.rolling_variance, sizeof(float), l.n, fp); } fwrite(l.weights, sizeof(float), num, fp); - if(l.adam){ - //fwrite(l.m, sizeof(float), num, fp); - //fwrite(l.v, sizeof(float), num, fp); - } } void save_batchnorm_weights(layer l, FILE *fp) @@ -901,12 +891,12 @@ void save_weights_upto(network net, char *filename, int cutoff) if(!fp) file_error(filename); int major = 0; - int minor = 1; + int minor = 2; int revision = 0; fwrite(&major, sizeof(int), 1, fp); fwrite(&minor, sizeof(int), 1, fp); fwrite(&revision, sizeof(int), 1, fp); - fwrite(net.seen, sizeof(int), 1, fp); + fwrite(net.seen, sizeof(size_t), 1, fp); int i; for(i = 0; i < net.n && i < cutoff; ++i){ @@ -1068,10 +1058,6 @@ void load_convolutional_weights(layer l, FILE *fp) } } fread(l.weights, sizeof(float), num, fp); - if(l.adam){ - //fread(l.m, sizeof(float), num, fp); - //fread(l.v, sizeof(float), num, fp); - } //if(l.c == 3) scal_cpu(num, 1./256, l.weights, 1); if (l.flipped) { transpose_matrix(l.weights, l.c*l.size*l.size, l.n); @@ -1103,7 +1089,13 @@ void load_weights_upto(network *net, char *filename, int start, int cutoff) fread(&major, sizeof(int), 1, fp); fread(&minor, sizeof(int), 1, fp); fread(&revision, sizeof(int), 1, fp); - fread(net->seen, sizeof(int), 1, fp); + if ((major*10 + minor) >= 2){ + fread(net->seen, sizeof(size_t), 1, fp); + } else { + int iseen = 0; + fread(&iseen, sizeof(int), 1, fp); + *net->seen = iseen; + } int transpose = (major > 1000) || (minor > 1000); int i; diff --git a/src/rnn_layer.c b/src/rnn_layer.c index fb4f1084..6ff9a0f8 100644 --- a/src/rnn_layer.c +++ b/src/rnn_layer.c @@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps) #endif } -layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log) +layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log, int adam) { fprintf(stderr, "RNN Layer: %d inputs, %d outputs\n", inputs, outputs); batch = batch / steps; @@ -41,17 +41,17 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, l.input_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize); + *(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize, adam); l.input_layer->batch = batch; l.self_layer = 
malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize); + *(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize, adam); l.self_layer->batch = batch; l.output_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize); + *(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize, adam); l.output_layer->batch = batch; l.outputs = outputs; @@ -73,11 +73,11 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, return l; } -void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay) +void update_rnn_layer(layer l, update_args a) { - update_connected_layer(*(l.input_layer), batch, learning_rate, momentum, decay); - update_connected_layer(*(l.self_layer), batch, learning_rate, momentum, decay); - update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay); + update_connected_layer(*(l.input_layer), a); + update_connected_layer(*(l.self_layer), a); + update_connected_layer(*(l.output_layer), a); } void forward_rnn_layer(layer l, network net) @@ -187,11 +187,11 @@ void push_rnn_layer(layer l) push_connected_layer(*(l.output_layer)); } -void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) +void update_rnn_layer_gpu(layer l, update_args a) { - update_connected_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay); - update_connected_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay); + update_connected_layer_gpu(*(l.input_layer), a); + update_connected_layer_gpu(*(l.self_layer), a); + update_connected_layer_gpu(*(l.output_layer), a); } void forward_rnn_layer_gpu(layer l, network net) diff --git a/src/rnn_layer.h b/src/rnn_layer.h index 782a90ef..37e1f1ca 100644 --- a/src/rnn_layer.h +++ b/src/rnn_layer.h @@ -7,16 +7,16 @@ #include "network.h" #define USET -layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log); +layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log, int adam); void forward_rnn_layer(layer l, network net); void backward_rnn_layer(layer l, network net); -void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay); +void update_rnn_layer(layer l, update_args a); #ifdef GPU void forward_rnn_layer_gpu(layer l, network net); void backward_rnn_layer_gpu(layer l, network net); -void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void update_rnn_layer_gpu(layer l, update_args a); void push_rnn_layer(layer l); void pull_rnn_layer(layer l); #endif
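Note for anyone carrying a custom layer out of tree: the diff replaces the old per-argument update signature `update(l, batch, learning_rate, momentum, decay)` with a single `update_args` struct, and the Adam hyperparameters (`B1`, `B2`, `eps`) plus the step counter now travel in that struct (filled once per call in `update_network` / `update_network_gpu`) instead of being copied onto every layer. Below is a minimal sketch of what porting a CPU update could look like, assuming a hypothetical `update_mylayer` whose body previously mirrored `update_connected_layer`; the field names come from the new `update_args` in include/darknet.h, and `axpy_cpu`/`scal_cpu` are darknet's existing BLAS helpers.

```c
/* Sketch only: porting an out-of-tree layer to the update_args API.
 * "mylayer" is hypothetical; the body follows update_connected_layer above. */
#include "darknet.h"
#include "blas.h"

void update_mylayer(layer l, update_args a)
{
    /* unpack the shared hyperparameters once, as the in-tree layers now do */
    float learning_rate = a.learning_rate*l.learning_rate_scale;
    float momentum = a.momentum;
    float decay = a.decay;
    int batch = a.batch;

    /* biases: apply (lr/batch)*bias_updates, then keep momentum*bias_updates for the next step */
    axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
    scal_cpu(l.outputs, momentum, l.bias_updates, 1);

    /* weights: fold L2 decay into the gradient, apply it, then scale the gradient by momentum */
    axpy_cpu(l.inputs*l.outputs, -decay*batch, l.weights, 1, l.weight_updates, 1);
    axpy_cpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
    scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
}
```

On the GPU side the same struct also carries `adam` and `t` (now stored once in `net.t`), so a ported `update_mylayer_gpu` can branch on `a.adam` and call `adam_update_gpu` exactly as `update_connected_layer_gpu` does. Separately, weights files written after this change carry header version 0.2 and store `seen` as `size_t`; older files are still handled by the int fallback in `load_weights_upto`.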