i hate adam. i hate adam

Joseph Redmon 2017-06-12 16:19:08 -07:00
parent f9446acb68
commit 1467621453
38 changed files with 315 additions and 235 deletions

View File

@@ -1,6 +1,6 @@
-GPU=0
-CUDNN=0
-OPENCV=0
+GPU=1
+CUDNN=1
+OPENCV=1
 DEBUG=0
 ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \

View File

@@ -22,17 +22,6 @@ max_batches=10000
 [gru]
 batch_normalize=1
 output = 1024
-tanh = 1
-[gru]
-batch_normalize=1
-output = 1024
-tanh = 1
-[gru]
-batch_normalize=1
-output = 1024
-tanh = 1
 [connected]
 output=256

View File

@@ -83,7 +83,7 @@ void train_captcha(char *cfgfile, char *weightfile)
 float loss = train_network(net, train);
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.9 + loss*.1;
-printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
+printf("%d: %f, %f avg, %lf seconds, %ld images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
 free_data(train);
 if(i%100==0){
 char buff[256];
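(Annotation, not part of the commit: the %d to %ld changes in these training loops follow from net.seen becoming a size_t* in the darknet.h hunk below; printing a size_t through %d is undefined behavior and wraps past INT_MAX. %ld assumes size_t and long match, which holds on the LP64 targets darknet is usually built for. A minimal sketch, on a 64-bit build:)

#include <stdio.h>

int main(void)
{
    size_t seen = 5000000000ULL;          /* > INT_MAX, would misprint under %d */
    printf("%ld images\n", (long)seen);   /* the style this commit adopts       */
    printf("%zu images\n", seen);         /* C99's dedicated size_t specifier   */
    return 0;
}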

View File

@@ -25,7 +25,7 @@ void train_cifar(char *cfgfile, char *weightfile)
 float loss = train_network_sgd(net, train, 1);
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.95 + loss*.05;
-printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
 if(*net.seen/N > epoch){
 epoch = *net.seen/N;
 char buff[256];
@@ -81,7 +81,7 @@ void train_cifar_distill(char *cfgfile, char *weightfile)
 float loss = train_network_sgd(net, train, 1);
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.95 + loss*.05;
-printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
 if(*net.seen/N > epoch){
 epoch = *net.seen/N;
 char buff[256];

View File

@@ -105,7 +105,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
 #endif
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.9 + loss*.1;
-printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
 free_data(train);
 if(*net.seen/N > epoch){
 epoch = *net.seen/N;

View File

@@ -128,7 +128,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
 avg_loss = avg_loss*.9 + loss*.1;
 i = get_current_batch(net);
-printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
+printf("%ld: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
 if(i%1000==0){
 #ifdef GPU
 if(ngpus != 1) sync_nets(nets, ngpus, 0);

View File

@@ -31,7 +31,7 @@ void train_dice(char *cfgfile, char *weightfile)
 float loss = train_network(net, train);
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.9 + loss*.1;
-printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
+printf("%d: %f, %f avg, %lf seconds, %ld images\n", i, loss, avg_loss, sec(clock()-time), *net.seen);
 free_data(train);
 if((i % 100) == 0) net.learning_rate *= .1;
 if(i%100==0){

View File

@@ -169,7 +169,7 @@ void train_go(char *cfgfile, char *weightfile, char *filename, int *gpus, int ng
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.95 + loss*.05;
-printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
 if(*net.seen/N > epoch){
 epoch = *net.seen/N;
 char buff[256];
@@ -184,7 +184,7 @@ void train_go(char *cfgfile, char *weightfile, char *filename, int *gpus, int ng
 }
 if(get_current_batch(net)%10000 == 0){
 char buff[256];
-sprintf(buff, "%s/%s_%d.backup",backup_directory,base,get_current_batch(net));
+sprintf(buff, "%s/%s_%ld.backup",backup_directory,base,get_current_batch(net));
 save_weights(net, buff);
 }
 }

View File

@@ -91,7 +91,7 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
 #endif
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.9 + loss*.1;
-printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
 free_data(train);
 if(*net.seen/N > epoch){
 epoch = *net.seen/N;

View File

@@ -182,7 +182,7 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
 if (avg_loss < 0) avg_loss = loss;
 avg_loss = avg_loss*.9 + loss*.1;
-int chars = get_current_batch(net)*batch;
+size_t chars = get_current_batch(net)*batch;
 fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds, %f epochs\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), (float) chars/size);
 for(j = 0; j < streams; ++j){
@@ -194,12 +194,12 @@
 }
 }
-if(i%1000==0){
+if(i%10000==0){
 char buff[256];
 sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
 save_weights(net, buff);
 }
-if(i%10==0){
+if(i%100==0){
 char buff[256];
 sprintf(buff, "%s/%s.backup", backup_directory, base);
 save_weights(net, buff);
@@ -409,7 +409,7 @@ void valid_char_rnn(char *cfgfile, char *weightfile, char *seed)
 input[c] = 0;
 sum += log(out[next])/log2;
 c = next;
-printf("%d Perplexity: %4.4f Word Perplexity: %4.4f\n", count, pow(2, -sum/count), pow(2, -sum/words));
+printf("%d BPC: %4.4f Perplexity: %4.4f Word Perplexity: %4.4f\n", count, -sum/count, pow(2, -sum/count), pow(2, -sum/words));
 }
 }
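(Annotation: sum here accumulates log(out[next])/log2, i.e. log2-probabilities, so the column this hunk adds is bits per character and the existing perplexities are just its exponentials; the same file also makes checkpoints ten times less frequent. Restating the printed metrics under that reading:)

/* sketch, assuming sum = sum of log2 p(c) over count characters */
double bpc      = -sum/count;           /* bits per character          */
double char_ppl = pow(2, -sum/count);   /* character perplexity, 2^BPC */
double word_ppl = pow(2, -sum/words);   /* same bits, per-word basis   */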

View File

@@ -100,7 +100,7 @@ void train_segmenter(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
 #endif
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.9 + loss*.1;
-printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
 free_data(train);
 if(*net.seen/N > epoch){
 epoch = *net.seen/N;

View File

@@ -58,7 +58,7 @@ void train_tag(char *cfgfile, char *weightfile, int clear)
 float loss = train_network(net, train);
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.9 + loss*.1;
-printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
 free_data(train);
 if(*net.seen/N > epoch){
 epoch = *net.seen/N;

View File

@@ -63,11 +63,11 @@ void train_writing(char *cfgfile, char *weightfile)
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.9 + loss*.1;
-printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
+printf("%ld, %.3f: %f, %f avg, %f rate, %lf seconds, %ld images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
 free_data(train);
 if(get_current_batch(net)%100 == 0){
 char buff[256];
-sprintf(buff, "%s/%s_batch_%d.weights", backup_directory, base, get_current_batch(net));
+sprintf(buff, "%s/%s_batch_%ld.weights", backup_directory, base, get_current_batch(net));
 save_weights(net, buff);
 }
 if(*net.seen/N > epoch){

View File

@@ -87,6 +87,18 @@ typedef enum{
 SSE, MASKED, L1, SMOOTH
 } COST_TYPE;
+typedef struct{
+int batch;
+float learning_rate;
+float momentum;
+float decay;
+int adam;
+float B1;
+float B2;
+float eps;
+int t;
+} update_args;
 struct network;
 typedef struct network network;
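(Annotation: packing the optimizer hyperparameters into update_args gives every layer's update and update_gpu pointer one stable signature, so Adam's B1/B2/eps/t travel alongside the SGD parameters instead of widening each prototype again. A minimal sketch of the dispatch this enables, mirroring update_network further down:)

/* hypothetical caller: one loop serves SGD and Adam layers alike */
void update_all_layers(network *net, update_args a)
{
    int i;
    for(i = 0; i < net->n; ++i){
        layer l = net->layers[i];
        if(l.update) l.update(l, a);   /* each layer unpacks what it needs */
    }
}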
@@ -99,10 +111,10 @@ struct layer{
 COST_TYPE cost_type;
 void (*forward) (struct layer, struct network);
 void (*backward) (struct layer, struct network);
-void (*update) (struct layer, int, float, float, float);
+void (*update) (struct layer, update_args);
 void (*forward_gpu) (struct layer, struct network);
 void (*backward_gpu) (struct layer, struct network);
-void (*update_gpu) (struct layer, int, float, float, float);
+void (*update_gpu) (struct layer, update_args);
 int batch_normalize;
 int shortcut;
 int batch;
@@ -156,12 +168,6 @@ struct layer{
 int log;
 int tanh;
-int adam;
-float B1;
-float B2;
-float eps;
-int t;
 float alpha;
 float beta;
 float kappa;
@@ -395,16 +401,17 @@ typedef enum {
 typedef struct network{
 int n;
 int batch;
-int *seen;
+size_t *seen;
+int *t;
 float epoch;
 int subdivisions;
-float momentum;
-float decay;
 layer *layers;
 float *output;
 learning_rate_policy policy;
 float learning_rate;
+float momentum;
+float decay;
 float gamma;
 float scale;
 float power;
@@ -648,7 +655,7 @@ void draw_box_width(image a, int x1, int y1, int x2, int y2, int w, float r, flo
 float get_current_rate(network net);
 void composite_3d(char *f1, char *f2, char *out, int delta);
 data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int h);
-int get_current_batch(network net);
+size_t get_current_batch(network net);
 void constrain_image(image im);
 image get_network_image_layer(network net, int i);
 layer get_network_output_layer(network net);

View File

@@ -80,6 +80,7 @@ void mult_add_into_gpu(int num, float *a, float *b, float *c);
 void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
 void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
+void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
 void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);
 void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);

View File

@@ -74,6 +74,19 @@ void add_bias_gpu(float *output, float *biases, int batch, int n, int size)
 check_error(cudaPeekAtLastError());
 }
+__global__ void backward_bias_conn_kernel(float *bias_updates, float *delta, int batch, int n)
+{
+int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+if (index >= n) return;
+int b;
+float sum = 0;
+for(b = 0; b < batch; ++b){
+int i = b*n + index;
+sum += delta[i];
+}
+bias_updates[index] += sum;
+}
 __global__ void backward_bias_kernel(float *bias_updates, float *delta, int batch, int n, int size)
 {
 __shared__ float part[BLOCK];
@@ -94,6 +107,16 @@ __global__ void backward_bias_kernel(float *bias_updates, float *delta, int batc
 }
 }
+void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size)
+{
+if(size == 1){
+backward_bias_conn_kernel<<<cuda_gridsize(n), BLOCK>>>(bias_updates, delta, batch, n);
+}else{
+backward_bias_kernel<<<n, BLOCK>>>(bias_updates, delta, batch, n, size);
+}
+check_error(cudaPeekAtLastError());
+}
 /*
 __global__ void dot_kernel(float *output, float scale, int batch, int n, int size, float *delta)
 {
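(Annotation: the new dispatch takes the simple kernel when size == 1, the fully connected case where each bias sees exactly one delta per batch item; the shared-memory kernel stays for convolutional outputs, which also reduce over spatial positions. A CPU restatement of the size == 1 path:)

/* assumed CPU equivalent of backward_bias_conn_kernel */
void backward_bias_conn_cpu(float *bias_updates, float *delta, int batch, int n)
{
    int b, i;
    for(i = 0; i < n; ++i){
        float sum = 0;
        for(b = 0; b < batch; ++b){
            sum += delta[b*n + i];   /* one delta per bias per batch item */
        }
        bias_updates[i] += sum;
    }
}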
@@ -136,12 +159,6 @@ void dot_error_gpu(layer l)
 }
 */
-void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size)
-{
-backward_bias_kernel<<<n, BLOCK>>>(bias_updates, delta, batch, n, size);
-check_error(cudaPeekAtLastError());
-}
 __global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t)
 {
@@ -149,7 +166,6 @@ __global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float
 if (index >= N) return;
 x[index] = x[index] + (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps));
-//if(index == 0) printf("%f %f %f %f\n", m[index], v[index], (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps)));
 }
 extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t)
@@ -158,6 +174,20 @@ extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2
 check_error(cudaPeekAtLastError());
 }
+extern "C" void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t)
+{
+scal_ongpu(n, B1, m, 1);
+scal_ongpu(n, B2, v, 1);
+axpy_ongpu(n, -decay*batch, w, 1, d, 1);
+axpy_ongpu(n, (1-B1), d, 1, m, 1);
+mul_ongpu(n, d, 1, d, 1);
+axpy_ongpu(n, (1-B2), d, 1, v, 1);
+adam_gpu(n, w, m, v, B1, B2, rate/batch, eps, t);
+fill_ongpu(n, 0, d, 1);
+}
 __global__ void normalize_kernel(int N, float *x, float *mean, float *variance, int batch, int filters, int spatial)
 {
 int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
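(Annotation: adam_update_gpu chains BLAS-style kernels into the standard Adam step, Kingma and Ba 2015, with L2 decay folded into the gradient and the learning rate pre-divided by batch before adam_gpu applies the bias-corrected update. Element-wise, the sequence computes, as a CPU sketch:)

/* d holds the accumulated gradient and is zeroed afterwards, as above */
for(i = 0; i < n; ++i){
    float g = d[i] - decay*batch*w[i];                /* decay term       */
    m[i] = B1*m[i] + (1-B1)*g;                        /* first moment     */
    v[i] = B2*v[i] + (1-B2)*g*g;                      /* second moment    */
    w[i] += (rate/batch) * sqrtf(1-powf(B2,t)) / (1-powf(B1,t))
            * m[i] / (sqrtf(v[i]) + eps);             /* corrected step   */
    d[i] = 0;                                         /* clear for next   */
}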

View File

@@ -54,7 +54,7 @@ void train_compare(char *cfgfile, char *weightfile)
 float loss = train_network(net, train);
 if(avg_loss == -1) avg_loss = loss;
 avg_loss = avg_loss*.9 + loss*.1;
-printf("%.3f: %f, %f avg, %lf seconds, %d images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen);
+printf("%.3f: %f, %f avg, %lf seconds, %ld images\n", (float)*net.seen/N, loss, avg_loss, sec(clock()-time), *net.seen);
 free_data(train);
 if(i%100 == 0){
 char buff[256];

View File

@@ -11,10 +11,11 @@
 #include <stdlib.h>
 #include <string.h>
-connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize)
+layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam)
 {
 int i;
-connected_layer l = {0};
+layer l = {0};
+l.learning_rate_scale = 1;
 l.type = CONNECTED;
 l.inputs = inputs;
@@ -51,6 +52,14 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
 l.biases[i] = 0;
 }
+if(adam){
+l.m = calloc(l.inputs*l.outputs, sizeof(float));
+l.v = calloc(l.inputs*l.outputs, sizeof(float));
+l.bias_m = calloc(l.outputs, sizeof(float));
+l.scale_m = calloc(l.outputs, sizeof(float));
+l.bias_v = calloc(l.outputs, sizeof(float));
+l.scale_v = calloc(l.outputs, sizeof(float));
+}
 if(batch_normalize){
 l.scales = calloc(outputs, sizeof(float));
 l.scale_updates = calloc(outputs, sizeof(float));
@@ -83,6 +92,15 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
 l.output_gpu = cuda_make_array(l.output, outputs*batch);
 l.delta_gpu = cuda_make_array(l.delta, outputs*batch);
+if (adam) {
+l.m_gpu = cuda_make_array(0, inputs*outputs);
+l.v_gpu = cuda_make_array(0, inputs*outputs);
+l.bias_m_gpu = cuda_make_array(0, outputs);
+l.bias_v_gpu = cuda_make_array(0, outputs);
+l.scale_m_gpu = cuda_make_array(0, outputs);
+l.scale_v_gpu = cuda_make_array(0, outputs);
+}
 if(batch_normalize){
 l.mean_gpu = cuda_make_array(l.mean, outputs);
 l.variance_gpu = cuda_make_array(l.variance, outputs);
@@ -111,8 +129,12 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
 return l;
 }
-void update_connected_layer(connected_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_connected_layer(layer l, update_args a)
 {
+float learning_rate = a.learning_rate*l.learning_rate_scale;
+float momentum = a.momentum;
+float decay = a.decay;
+int batch = a.batch;
 axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
 scal_cpu(l.outputs, momentum, l.bias_updates, 1);
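(Annotation: the CPU path keeps darknet's plain momentum SGD; spelled out, the axpy/scal idiom used above for biases, and further down for scales and weights, is:)

/* per parameter block, with dw the accumulated updates buffer:
   dw <- dw - decay*batch*w    (L2 regularization, weights only)
   w  <- w + (lr/batch)*dw     (step by the averaged gradient)
   dw <- momentum*dw           (the remainder seeds the next step) */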
@@ -126,7 +148,7 @@ void update_connected_layer(connected_layer l, int batch, float learning_rate, f
 scal_cpu(l.inputs*l.outputs, momentum, l.weight_updates, 1);
 }
-void forward_connected_layer(connected_layer l, network net)
+void forward_connected_layer(layer l, network net)
 {
 fill_cpu(l.outputs*l.batch, 0, l.output, 1);
 int m = l.batch;
@@ -144,7 +166,7 @@ void forward_connected_layer(connected_layer l, network net)
 activate_array(l.output, l.outputs*l.batch, l.activation);
 }
-void backward_connected_layer(connected_layer l, network net)
+void backward_connected_layer(layer l, network net)
 {
 gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
@@ -210,7 +232,7 @@ void statistics_connected_layer(layer l)
 #ifdef GPU
-void pull_connected_layer(connected_layer l)
+void pull_connected_layer(layer l)
 {
 cuda_pull_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
 cuda_pull_array(l.biases_gpu, l.biases, l.outputs);
@@ -223,7 +245,7 @@ void pull_connected_layer(connected_layer l)
 }
 }
-void push_connected_layer(connected_layer l)
+void push_connected_layer(layer l)
 {
 cuda_push_array(l.weights_gpu, l.weights, l.inputs*l.outputs);
 cuda_push_array(l.biases_gpu, l.biases, l.outputs);
@@ -236,22 +258,34 @@ void push_connected_layer(connected_layer l)
 }
 }
-void update_connected_layer_gpu(connected_layer l, int batch, float learning_rate, float momentum, float decay)
-{
-axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
-scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
+void update_connected_layer_gpu(layer l, update_args a)
+{
+float learning_rate = a.learning_rate*l.learning_rate_scale;
+float momentum = a.momentum;
+float decay = a.decay;
+int batch = a.batch;
+if(a.adam){
+adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.inputs*l.outputs, batch, a.t);
+adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t);
+if(l.scales_gpu){
+adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.outputs, batch, a.t);
+}
+}else{
+axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
+scal_ongpu(l.outputs, momentum, l.bias_updates_gpu, 1);
 if(l.batch_normalize){
 axpy_ongpu(l.outputs, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
 scal_ongpu(l.outputs, momentum, l.scale_updates_gpu, 1);
 }
 axpy_ongpu(l.inputs*l.outputs, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
 axpy_ongpu(l.inputs*l.outputs, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
 scal_ongpu(l.inputs*l.outputs, momentum, l.weight_updates_gpu, 1);
+}
 }
-void forward_connected_layer_gpu(connected_layer l, network net)
+void forward_connected_layer_gpu(layer l, network net)
 {
 fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
@@ -271,9 +305,9 @@ void forward_connected_layer_gpu(connected_layer l, network net)
 activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
 }
-void backward_connected_layer_gpu(connected_layer l, network net)
+void backward_connected_layer_gpu(layer l, network net)
 {
-constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
+constrain_ongpu(l.outputs*l.batch, 5, l.delta_gpu, 1);
 gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
 if(l.batch_normalize){
 backward_batchnorm_layer_gpu(l, net);
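(Annotation: alongside the signature change, this hunk loosens the gradient clip on the connected layer's delta from 1 to 5; constrain_ongpu clamps each element into [-bound, bound]. Assumed CPU equivalent of that call:)

/* assumed CPU equivalent of constrain_ongpu(n, bound, x, 1) */
void constrain_cpu(int n, float bound, float *x)
{
    int i;
    for(i = 0; i < n; ++i){
        x[i] = fminf(bound, fmaxf(-bound, x[i]));
    }
}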

View File

@@ -5,20 +5,18 @@
 #include "layer.h"
 #include "network.h"
-typedef layer connected_layer;
-connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize);
-void forward_connected_layer(connected_layer layer, network net);
-void backward_connected_layer(connected_layer layer, network net);
-void update_connected_layer(connected_layer layer, int batch, float learning_rate, float momentum, float decay);
+layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize, int adam);
+void forward_connected_layer(layer l, network net);
+void backward_connected_layer(layer l, network net);
+void update_connected_layer(layer l, update_args a);
 #ifdef GPU
-void forward_connected_layer_gpu(connected_layer layer, network net);
-void backward_connected_layer_gpu(connected_layer layer, network net);
-void update_connected_layer_gpu(connected_layer layer, int batch, float learning_rate, float momentum, float decay);
-void push_connected_layer(connected_layer layer);
-void pull_connected_layer(connected_layer layer);
+void forward_connected_layer_gpu(layer l, network net);
+void backward_connected_layer_gpu(layer l, network net);
+void update_connected_layer_gpu(layer l, update_args a);
+void push_connected_layer(layer l);
+void pull_connected_layer(layer l);
 #endif
 #endif

View File

@@ -263,10 +263,6 @@ void pull_convolutional_layer(convolutional_layer layer)
 cuda_pull_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
 cuda_pull_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
 }
-if (layer.adam){
-cuda_pull_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
-cuda_pull_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
-}
 }
 void push_convolutional_layer(convolutional_layer layer)
@@ -280,35 +276,22 @@ void push_convolutional_layer(convolutional_layer layer)
 cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
 cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
 }
-if (layer.adam){
-cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
-cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
-}
 }
-void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t)
-{
-scal_ongpu(n, B1, m, 1);
-scal_ongpu(n, B2, v, 1);
-axpy_ongpu(n, -decay*batch, w, 1, d, 1);
-axpy_ongpu(n, (1-B1), d, 1, m, 1);
-mul_ongpu(n, d, 1, d, 1);
-axpy_ongpu(n, (1-B2), d, 1, v, 1);
-adam_gpu(n, w, m, v, B1, B2, rate/batch, eps, t);
-fill_ongpu(n, 0, d, 1);
-}
-void update_convolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
-{
+void update_convolutional_layer_gpu(layer l, update_args a)
+{
+float learning_rate = a.learning_rate*l.learning_rate_scale;
+float momentum = a.momentum;
+float decay = a.decay;
+int batch = a.batch;
 int size = l.size*l.size*l.c*l.n;
-if(l.adam){
-adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch, l.t);
-adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t);
+if(a.adam){
+adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, size, batch, a.t);
+adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
 if(l.scales_gpu){
-adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t);
+adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
 }
 }else{
 axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);

View File

@@ -234,7 +234,6 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
 l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
 }
 if(adam){
-l.adam = 1;
 l.m = calloc(c*n*size*size, sizeof(float));
 l.v = calloc(c*n*size*size, sizeof(float));
 l.bias_m = calloc(n, sizeof(float));
@@ -507,8 +506,13 @@ void backward_convolutional_layer(convolutional_layer l, network net)
 }
 }
-void update_convolutional_layer(convolutional_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_convolutional_layer(convolutional_layer l, update_args a)
 {
+float learning_rate = a.learning_rate*l.learning_rate_scale;
+float momentum = a.momentum;
+float decay = a.decay;
+int batch = a.batch;
 int size = l.size*l.size*l.c*l.n;
 axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
 scal_cpu(l.n, momentum, l.bias_updates, 1);

View File

@@ -12,7 +12,7 @@ typedef layer convolutional_layer;
 #ifdef GPU
 void forward_convolutional_layer_gpu(convolutional_layer layer, network net);
 void backward_convolutional_layer_gpu(convolutional_layer layer, network net);
-void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay);
+void update_convolutional_layer_gpu(convolutional_layer layer, update_args a);
 void push_convolutional_layer(convolutional_layer layer);
 void pull_convolutional_layer(convolutional_layer layer);
@@ -28,7 +28,7 @@ void cudnn_convolutional_setup(layer *l);
 convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam);
 void resize_convolutional_layer(convolutional_layer *layer, int w, int h);
 void forward_convolutional_layer(const convolutional_layer layer, network net);
-void update_convolutional_layer(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay);
+void update_convolutional_layer(convolutional_layer layer, update_args a);
 image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_weights);
 void binarize_weights(float *weights, int n, int size, float *binary);
 void swap_binary(convolutional_layer *l);

View File

@@ -81,11 +81,11 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou
 return l;
 }
-void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_crnn_layer(layer l, update_args a)
 {
-update_convolutional_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
-update_convolutional_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
-update_convolutional_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
+update_convolutional_layer(*(l.input_layer), a);
+update_convolutional_layer(*(l.self_layer), a);
+update_convolutional_layer(*(l.output_layer), a);
 }
 void forward_crnn_layer(layer l, network net)
@@ -194,11 +194,11 @@ void push_crnn_layer(layer l)
 push_convolutional_layer(*(l.output_layer));
 }
-void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_crnn_layer_gpu(layer l, update_args a)
 {
-update_convolutional_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay);
-update_convolutional_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay);
-update_convolutional_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay);
+update_convolutional_layer_gpu(*(l.input_layer), a);
+update_convolutional_layer_gpu(*(l.self_layer), a);
+update_convolutional_layer_gpu(*(l.output_layer), a);
 }
 void forward_crnn_layer_gpu(layer l, network net)

View File

@@ -10,12 +10,12 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou
 void forward_crnn_layer(layer l, network net);
 void backward_crnn_layer(layer l, network net);
-void update_crnn_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_crnn_layer(layer l, update_args a);
 #ifdef GPU
 void forward_crnn_layer_gpu(layer l, network net);
 void backward_crnn_layer_gpu(layer l, network net);
-void update_crnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_crnn_layer_gpu(layer l, update_args a);
 void push_crnn_layer(layer l);
 void pull_crnn_layer(layer l);
 #endif

View File

@ -109,15 +109,20 @@ extern "C" void push_deconvolutional_layer(layer l)
} }
} }
void update_deconvolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) void update_deconvolutional_layer_gpu(layer l, update_args a)
{ {
float learning_rate = a.learning_rate*l.learning_rate_scale;
float momentum = a.momentum;
float decay = a.decay;
int batch = a.batch;
int size = l.size*l.size*l.c*l.n; int size = l.size*l.size*l.c*l.n;
if(l.adam){ if(a.adam){
adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch, l.t); adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, size, batch, a.t);
adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
if(l.scales_gpu){ if(l.scales_gpu){
adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t); adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t);
} }
}else{ }else{
axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);

View File

@@ -79,7 +79,6 @@ layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size
 l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
 }
 if(adam){
-l.adam = 1;
 l.m = calloc(c*n*size*size, sizeof(float));
 l.v = calloc(c*n*size*size, sizeof(float));
 l.bias_m = calloc(n, sizeof(float));
@@ -252,8 +251,13 @@ void backward_deconvolutional_layer(layer l, network net)
 }
 }
-void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_deconvolutional_layer(layer l, update_args a)
 {
+float learning_rate = a.learning_rate*l.learning_rate_scale;
+float momentum = a.momentum;
+float decay = a.decay;
+int batch = a.batch;
 int size = l.size*l.size*l.c*l.n;
 axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
 scal_cpu(l.n, momentum, l.bias_updates, 1);

View File

@@ -10,7 +10,7 @@
 #ifdef GPU
 void forward_deconvolutional_layer_gpu(layer l, network net);
 void backward_deconvolutional_layer_gpu(layer l, network net);
-void update_deconvolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_deconvolutional_layer_gpu(layer l, update_args a);
 void push_deconvolutional_layer(layer l);
 void pull_deconvolutional_layer(layer l);
 #endif
@@ -18,7 +18,7 @@ void pull_deconvolutional_layer(layer l);
 layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam);
 void resize_deconvolutional_layer(layer *l, int h, int w);
 void forward_deconvolutional_layer(const layer l, network net);
-void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_deconvolutional_layer(layer l, update_args a);
 void backward_deconvolutional_layer(layer l, network net);
 #endif

View File

@@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps)
 #endif
 }
-layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize)
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
 {
 fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs);
 batch = batch / steps;
@@ -38,34 +38,34 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
 l.uz = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
+*(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
 l.uz->batch = batch;
 l.wz = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
+*(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
 l.wz->batch = batch;
 l.ur = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
+*(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
 l.ur->batch = batch;
 l.wr = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
+*(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
 l.wr->batch = batch;
 l.uh = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
+*(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
 l.uh->batch = batch;
 l.wh = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
+*(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
 l.wh->batch = batch;
 l.batch_normalize = batch_normalize;
@@ -115,11 +115,14 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
 return l;
 }
-void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_gru_layer(layer l, update_args a)
 {
-update_connected_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
-update_connected_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
-update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
+update_connected_layer(*(l.ur), a);
+update_connected_layer(*(l.uz), a);
+update_connected_layer(*(l.uh), a);
+update_connected_layer(*(l.wr), a);
+update_connected_layer(*(l.wz), a);
+update_connected_layer(*(l.wh), a);
 }
 void forward_gru_layer(layer l, network net)
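(Annotation: the old CPU update_gru_layer stepped l.input_layer, l.self_layer, and l.output_layer, sub-layers a GRU never allocates; the GPU path below already used the right set. The rewrite updates the six connected layers the GRU actually owns: ur, uz, uh, wr, wz, wh.)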
@@ -212,14 +215,14 @@ void push_gru_layer(layer l)
 {
 }
-void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_gru_layer_gpu(layer l, update_args a)
 {
-update_connected_layer_gpu(*(l.ur), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.uz), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.uh), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.wr), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.wz), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.wh), batch, learning_rate, momentum, decay);
+update_connected_layer_gpu(*(l.ur), a);
+update_connected_layer_gpu(*(l.uz), a);
+update_connected_layer_gpu(*(l.uh), a);
+update_connected_layer_gpu(*(l.wr), a);
+update_connected_layer_gpu(*(l.wz), a);
+update_connected_layer_gpu(*(l.wh), a);
 }
 void forward_gru_layer_gpu(layer l, network net)

View File

@@ -6,16 +6,16 @@
 #include "layer.h"
 #include "network.h"
-layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize);
+layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 void forward_gru_layer(layer l, network state);
 void backward_gru_layer(layer l, network state);
-void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_gru_layer(layer l, update_args a);
 #ifdef GPU
 void forward_gru_layer_gpu(layer l, network state);
 void backward_gru_layer_gpu(layer l, network state);
-void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_gru_layer_gpu(layer l, update_args a);
 void push_gru_layer(layer l);
 void pull_gru_layer(layer l);
 #endif

View File

@@ -164,8 +164,13 @@ void backward_local_layer(local_layer l, network net)
 }
 }
-void update_local_layer(local_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_local_layer(local_layer l, update_args a)
 {
+float learning_rate = a.learning_rate*l.learning_rate_scale;
+float momentum = a.momentum;
+float decay = a.decay;
+int batch = a.batch;
 int locations = l.out_w*l.out_h;
 int size = l.size*l.size*l.c*l.n*locations;
 axpy_cpu(l.outputs, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
@@ -253,8 +258,13 @@ void backward_local_layer_gpu(local_layer l, network net)
 }
 }
-void update_local_layer_gpu(local_layer l, int batch, float learning_rate, float momentum, float decay)
+void update_local_layer_gpu(local_layer l, update_args a)
 {
+float learning_rate = a.learning_rate*l.learning_rate_scale;
+float momentum = a.momentum;
+float decay = a.decay;
+int batch = a.batch;
 int locations = l.out_w*l.out_h;
 int size = l.size*l.size*l.c*l.n*locations;
 axpy_ongpu(l.outputs, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);

View File

@@ -12,7 +12,7 @@ typedef layer local_layer;
 #ifdef GPU
 void forward_local_layer_gpu(local_layer layer, network net);
 void backward_local_layer_gpu(local_layer layer, network net);
-void update_local_layer_gpu(local_layer layer, int batch, float learning_rate, float momentum, float decay);
+void update_local_layer_gpu(local_layer layer, update_args a);
 void push_local_layer(local_layer layer);
 void pull_local_layer(local_layer layer);
@@ -22,7 +22,7 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in
 void forward_local_layer(const local_layer layer, network net);
 void backward_local_layer(local_layer layer, network net);
-void update_local_layer(local_layer layer, int batch, float learning_rate, float momentum, float decay);
+void update_local_layer(local_layer layer, update_args a);
 void bias_output(float *output, float *biases, int batch, int n, int size);
 void backward_bias(float *bias_updates, float *delta, int batch, int n, int size);

View File

@@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps)
 #endif
 }
-layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize)
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam)
 {
 fprintf(stderr, "LSTM Layer: %d inputs, %d outputs\n", inputs, outputs);
 batch = batch / steps;
@@ -38,42 +38,42 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n
 l.uf = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
+*(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
 l.uf->batch = batch;
 l.wf = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
+*(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
 l.wf->batch = batch;
 l.ui = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.ui) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
+*(l.ui) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
 l.ui->batch = batch;
 l.wi = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
+*(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
 l.wi->batch = batch;
 l.ug = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.ug) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
+*(l.ug) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
 l.ug->batch = batch;
 l.wg = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
+*(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
 l.wg->batch = batch;
 l.uo = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.uo) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
+*(l.uo) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize, adam);
 l.uo->batch = batch;
 l.wo = malloc(sizeof(layer));
 fprintf(stderr, "\t\t");
-*(l.wo) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
+*(l.wo) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize, adam);
 l.wo->batch = batch;
 l.batch_normalize = batch_normalize;
@@ -141,16 +141,16 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n
 return l;
 }
-void update_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_lstm_layer(layer l, update_args a)
 {
-update_connected_layer(*(l.wf), batch, learning_rate, momentum, decay);
-update_connected_layer(*(l.wi), batch, learning_rate, momentum, decay);
-update_connected_layer(*(l.wg), batch, learning_rate, momentum, decay);
-update_connected_layer(*(l.wo), batch, learning_rate, momentum, decay);
-update_connected_layer(*(l.uf), batch, learning_rate, momentum, decay);
-update_connected_layer(*(l.ui), batch, learning_rate, momentum, decay);
-update_connected_layer(*(l.ug), batch, learning_rate, momentum, decay);
-update_connected_layer(*(l.uo), batch, learning_rate, momentum, decay);
+update_connected_layer(*(l.wf), a);
+update_connected_layer(*(l.wi), a);
+update_connected_layer(*(l.wg), a);
+update_connected_layer(*(l.wo), a);
+update_connected_layer(*(l.uf), a);
+update_connected_layer(*(l.ui), a);
+update_connected_layer(*(l.ug), a);
+update_connected_layer(*(l.uo), a);
 }
 void forward_lstm_layer(layer l, network state)
@@ -383,16 +383,16 @@ void backward_lstm_layer(layer l, network state)
 }
 #ifdef GPU
-void update_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_lstm_layer_gpu(layer l, update_args a)
 {
-update_connected_layer_gpu(*(l.wf), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.wi), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.wg), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.wo), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.uf), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.ui), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.ug), batch, learning_rate, momentum, decay);
-update_connected_layer_gpu(*(l.uo), batch, learning_rate, momentum, decay);
+update_connected_layer_gpu(*(l.wf), a);
+update_connected_layer_gpu(*(l.wi), a);
+update_connected_layer_gpu(*(l.wg), a);
+update_connected_layer_gpu(*(l.wo), a);
+update_connected_layer_gpu(*(l.uf), a);
+update_connected_layer_gpu(*(l.ui), a);
+update_connected_layer_gpu(*(l.ug), a);
+update_connected_layer_gpu(*(l.uo), a);
 }
 void forward_lstm_layer_gpu(layer l, network state)

View File

@@ -6,15 +6,15 @@
 #include "network.h"
 #define USET
-layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize);
+layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_normalize, int adam);
 void forward_lstm_layer(layer l, network net);
-void update_lstm_layer(layer l, int batch, float learning, float momentum, float decay);
+void update_lstm_layer(layer l, update_args a);
 #ifdef GPU
 void forward_lstm_layer_gpu(layer l, network net);
 void backward_lstm_layer_gpu(layer l, network net);
-void update_lstm_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_lstm_layer_gpu(layer l, update_args a);
 #endif
 #endif

View File

@ -65,9 +65,9 @@ network *load_network_p(char *cfg, char *weights, int clear)
return net; return net;
} }
int get_current_batch(network net) size_t get_current_batch(network net)
{ {
int batch_num = (*net.seen)/(net.batch*net.subdivisions); size_t batch_num = (*net.seen)/(net.batch*net.subdivisions);
return batch_num; return batch_num;
} }
@@ -84,7 +84,7 @@ void reset_momentum(network net)

 float get_current_rate(network net)
 {
-    int batch_num = get_current_batch(net);
+    size_t batch_num = get_current_batch(net);
     int i;
     float rate;
     if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
@@ -174,6 +174,7 @@ network make_network(int n)
     net.n = n;
     net.layers = calloc(net.n, sizeof(layer));
     net.seen = calloc(1, sizeof(int));
+    net.t = calloc(1, sizeof(int));
     net.cost = calloc(1, sizeof(float));
     return net;
 }
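One caveat worth flagging in this hunk: net.seen is still calloc'd with sizeof(int), yet later in this same commit it is written and read as size_t. A safer allocation (my suggestion, not part of the diff) would be:

    net.seen = calloc(1, sizeof(size_t)); /* matches the size_t fwrite/fread below */
    net.t = calloc(1, sizeof(int));       /* Adam timestep, bumped once per update */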
@@ -199,12 +200,22 @@ void forward_network(network net)

 void update_network(network net)
 {
     int i;
-    int update_batch = net.batch*net.subdivisions;
-    float rate = get_current_rate(net);
+    update_args a = {0};
+    a.batch = net.batch*net.subdivisions;
+    a.learning_rate = get_current_rate(net);
+    a.momentum = net.momentum;
+    a.decay = net.decay;
+    a.adam = net.adam;
+    a.B1 = net.B1;
+    a.B2 = net.B2;
+    a.eps = net.eps;
+    ++*net.t;
+    a.t = *net.t;

     for(i = 0; i < net.n; ++i){
         layer l = net.layers[i];
         if(l.update){
-            l.update(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay);
+            l.update(l, a);
         }
     }
 }
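The B1/B2/eps/t fields exist to drive the Adam step inside each layer's update function. A minimal sketch of that per-weight update (standard Adam with folded bias correction; darknet's actual kernels and sign conventions may differ):

    #include <math.h>

    void adam_step(int n, float *w, float *g, float *m, float *v, update_args a)
    {
        /* fold both bias corrections into one scalar step size */
        float step = a.learning_rate
                   * sqrtf(1.f - powf(a.B2, a.t)) / (1.f - powf(a.B1, a.t));
        int i;
        for(i = 0; i < n; ++i){
            m[i] = a.B1*m[i] + (1.f - a.B1)*g[i];        /* 1st-moment EMA */
            v[i] = a.B2*v[i] + (1.f - a.B2)*g[i]*g[i];   /* 2nd-moment EMA */
            w[i] -= step * m[i] / (sqrtf(v[i]) + a.eps); /* corrected step */
        }
    }

update_network_gpu below builds the identical struct; note also that it drops the old per-layer l.t = get_current_batch(net) bookkeeping in favor of the single shared counter net.t, so bias correction uses one monotone timestep for every layer.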

View File

@@ -81,13 +81,22 @@ void update_network_gpu(network net)
 {
     cuda_set_device(net.gpu_index);
     int i;
-    int update_batch = net.batch*net.subdivisions;
-    float rate = get_current_rate(net);
+    update_args a = {0};
+    a.batch = net.batch*net.subdivisions;
+    a.learning_rate = get_current_rate(net);
+    a.momentum = net.momentum;
+    a.decay = net.decay;
+    a.adam = net.adam;
+    a.B1 = net.B1;
+    a.B2 = net.B2;
+    a.eps = net.eps;
+    ++*net.t;
+    a.t = (*net.t);

     for(i = 0; i < net.n; ++i){
         layer l = net.layers[i];
-        l.t = get_current_batch(net);
         if(l.update_gpu){
-            l.update_gpu(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay);
+            l.update_gpu(l, a);
         }
     }
 }

View File

@@ -191,11 +191,6 @@ convolutional_layer parse_convolutional(list *options, size_params params)
     convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam);
     layer.flipped = option_find_int_quiet(options, "flipped", 0);
     layer.dot = option_find_float_quiet(options, "dot", 0);
-    if(params.net.adam){
-        layer.B1 = params.net.B1;
-        layer.B2 = params.net.B2;
-        layer.eps = params.net.eps;
-    }

     return layer;
 }
@@ -224,7 +219,7 @@ layer parse_rnn(list *options, size_params params)
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
     int logistic = option_find_int_quiet(options, "logistic", 0);

-    layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize, logistic);
+    layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize, logistic, params.net.adam);

     l.shortcut = option_find_int_quiet(options, "shortcut", 0);
@@ -236,7 +231,7 @@ layer parse_gru(list *options, size_params params)
     int output = option_find_int(options, "output",1);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);

-    layer l = make_gru_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize);
+    layer l = make_gru_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize, params.net.adam);
     l.tanh = option_find_int_quiet(options, "tanh", 0);

     return l;
@@ -247,21 +242,20 @@ layer parse_lstm(list *options, size_params params)
     int output = option_find_int(options, "output", 1);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);

-    layer l = make_lstm_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize);
+    layer l = make_lstm_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize, params.net.adam);

     return l;
 }

-connected_layer parse_connected(list *options, size_params params)
+layer parse_connected(list *options, size_params params)
 {
     int output = option_find_int(options, "output",1);
     char *activation_s = option_find_str(options, "activation", "logistic");
     ACTIVATION activation = get_activation(activation_s);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);

-    connected_layer layer = make_connected_layer(params.batch, params.inputs, output, activation, batch_normalize);
-
-    return layer;
+    layer l = make_connected_layer(params.batch, params.inputs, output, activation, batch_normalize, params.net.adam);
+    return l;
 }

 softmax_layer parse_softmax(list *options, size_params params)
@@ -567,7 +561,7 @@ void parse_net_options(list *options, network *net)
     if(net->adam){
         net->B1 = option_find_float(options, "B1", .9);
         net->B2 = option_find_float(options, "B2", .999);
-        net->eps = option_find_float(options, "eps", .00000001);
+        net->eps = option_find_float(options, "eps", .0000001);
     }

     net->h = option_find_int_quiet(options, "height",0);
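With these defaults, enabling Adam from a .cfg file looks like the following; only adam=1 is required, the rest are the parser defaults shown above (note the default eps just moved from 1e-8 to 1e-7):

    [net]
    adam=1
    B1=.9
    B2=.999
    eps=.0000001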
@@ -855,10 +849,6 @@ void save_convolutional_weights(layer l, FILE *fp)
         fwrite(l.rolling_variance, sizeof(float), l.n, fp);
     }
     fwrite(l.weights, sizeof(float), num, fp);
-    if(l.adam){
-        //fwrite(l.m, sizeof(float), num, fp);
-        //fwrite(l.v, sizeof(float), num, fp);
-    }
 }

 void save_batchnorm_weights(layer l, FILE *fp)
@@ -901,12 +891,12 @@ void save_weights_upto(network net, char *filename, int cutoff)
     if(!fp) file_error(filename);

     int major = 0;
-    int minor = 1;
+    int minor = 2;
     int revision = 0;
     fwrite(&major, sizeof(int), 1, fp);
     fwrite(&minor, sizeof(int), 1, fp);
     fwrite(&revision, sizeof(int), 1, fp);
-    fwrite(net.seen, sizeof(int), 1, fp);
+    fwrite(net.seen, sizeof(size_t), 1, fp);

     int i;
     for(i = 0; i < net.n && i < cutoff; ++i){
@@ -1068,10 +1058,6 @@ void load_convolutional_weights(layer l, FILE *fp)
         }
     }
     fread(l.weights, sizeof(float), num, fp);
-    if(l.adam){
-        //fread(l.m, sizeof(float), num, fp);
-        //fread(l.v, sizeof(float), num, fp);
-    }
     //if(l.c == 3) scal_cpu(num, 1./256, l.weights, 1);
     if (l.flipped) {
         transpose_matrix(l.weights, l.c*l.size*l.size, l.n);
@@ -1103,7 +1089,13 @@ void load_weights_upto(network *net, char *filename, int start, int cutoff)
     fread(&major, sizeof(int), 1, fp);
     fread(&minor, sizeof(int), 1, fp);
     fread(&revision, sizeof(int), 1, fp);
-    fread(net->seen, sizeof(int), 1, fp);
+    if ((major*10 + minor) >= 2){
+        fread(net->seen, sizeof(size_t), 1, fp);
+    } else {
+        int iseen = 0;
+        fread(&iseen, sizeof(int), 1, fp);
+        *net->seen = iseen;
+    }
     int transpose = (major > 1000) || (minor > 1000);
     int i;
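Taken together with the save path above, the weights-file header now lays out as below (a sketch inferred from the fwrite/fread calls in this diff, assuming no padding since the fields are written sequentially):

    /* darknet weights file header, version (major=0, minor=2):
     *   int    major;     offset  0
     *   int    minor;     offset  4   -- bumped 1 -> 2 by this commit
     *   int    revision;  offset  8
     *   size_t seen;      offset 12   -- 8 bytes now; a 4-byte int when
     *                                    major*10 + minor < 2
     * per-layer float arrays follow the header
     */

The (major*10 + minor) >= 2 version gate is what keeps weight files written before this change loadable.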

View File

@@ -26,7 +26,7 @@ static void increment_layer(layer *l, int steps)
 #endif
 }

-layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log)
+layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log, int adam)
 {
     fprintf(stderr, "RNN Layer: %d inputs, %d outputs\n", inputs, outputs);
     batch = batch / steps;
@@ -41,17 +41,17 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
     l.input_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize);
+    *(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize, adam);
     l.input_layer->batch = batch;

     l.self_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize);
+    *(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize, adam);
     l.self_layer->batch = batch;

     l.output_layer = malloc(sizeof(layer));
     fprintf(stderr, "\t\t");
-    *(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize);
+    *(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize, adam);
     l.output_layer->batch = batch;

     l.outputs = outputs;
@@ -73,11 +73,11 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
     return l;
 }

-void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_rnn_layer(layer l, update_args a)
 {
-    update_connected_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer(*(l.input_layer), a);
+    update_connected_layer(*(l.self_layer), a);
+    update_connected_layer(*(l.output_layer), a);
 }

 void forward_rnn_layer(layer l, network net)
@@ -187,11 +187,11 @@ void push_rnn_layer(layer l)
     push_connected_layer(*(l.output_layer));
 }

-void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
+void update_rnn_layer_gpu(layer l, update_args a)
 {
-    update_connected_layer_gpu(*(l.input_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.self_layer), batch, learning_rate, momentum, decay);
-    update_connected_layer_gpu(*(l.output_layer), batch, learning_rate, momentum, decay);
+    update_connected_layer_gpu(*(l.input_layer), a);
+    update_connected_layer_gpu(*(l.self_layer), a);
+    update_connected_layer_gpu(*(l.output_layer), a);
 }

 void forward_rnn_layer_gpu(layer l, network net)

View File

@@ -7,16 +7,16 @@
 #include "network.h"

 #define USET

-layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log);
+layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log, int adam);
 void forward_rnn_layer(layer l, network net);
 void backward_rnn_layer(layer l, network net);
-void update_rnn_layer(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_rnn_layer(layer l, update_args a);

 #ifdef GPU
 void forward_rnn_layer_gpu(layer l, network net);
 void backward_rnn_layer_gpu(layer l, network net);
-void update_rnn_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
+void update_rnn_layer_gpu(layer l, update_args a);
 void push_rnn_layer(layer l);
 void pull_rnn_layer(layer l);
 #endif