diff --git a/Makefile b/Makefile
index c9b6ecac..de515d32 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
-GPU=0
-OPENCV=0
+GPU=1
+OPENCV=1
 DEBUG=0
 
 ARCH= --gpu-architecture=compute_20 --gpu-code=compute_20
diff --git a/cfg/rnn.cfg b/cfg/rnn.cfg
index f57ec67c..a67e1fa1 100644
--- a/cfg/rnn.cfg
+++ b/cfg/rnn.cfg
@@ -1,11 +1,11 @@
 [net]
 subdivisions=1
 inputs=256
-batch = 1
-time_steps = 1
+batch = 128
 momentum=0.9
-decay=0.0005
-max_batches = 50000000
+decay=0.001
+max_batches = 50000
+time_steps=900
 learning_rate=0.1
 
 [rnn]
diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu
index d5607daa..99933c86 100644
--- a/src/activation_kernels.cu
+++ b/src/activation_kernels.cu
@@ -9,6 +9,7 @@ extern "C" {
 
 __device__ float linear_activate_kernel(float x){return x;}
 __device__ float logistic_activate_kernel(float x){return 1./(1. + exp(-x));}
+__device__ float loggy_activate_kernel(float x){return 2./(1. + exp(-x)) - 1;}
 __device__ float relu_activate_kernel(float x){return x*(x>0);}
 __device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
 __device__ float relie_activate_kernel(float x){return x*(x>0);}
@@ -24,6 +25,11 @@ __device__ float plse_activate_kernel(float x)
 
 __device__ float linear_gradient_kernel(float x){return 1;}
 __device__ float logistic_gradient_kernel(float x){return (1-x)*x;}
+__device__ float loggy_gradient_kernel(float x)
+{
+    float y = (x+1.)/2.;
+    return 2*(1-y)*y;
+}
 __device__ float relu_gradient_kernel(float x){return (x>0);}
 __device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);}
 __device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01;}
@@ -39,6 +45,8 @@ __device__ float activate_kernel(float x, ACTIVATION a)
             return linear_activate_kernel(x);
         case LOGISTIC:
             return logistic_activate_kernel(x);
+        case LOGGY:
+            return loggy_activate_kernel(x);
         case RELU:
             return relu_activate_kernel(x);
         case ELU:
@@ -64,6 +72,8 @@ __device__ float gradient_kernel(float x, ACTIVATION a)
             return linear_gradient_kernel(x);
         case LOGISTIC:
             return logistic_gradient_kernel(x);
+        case LOGGY:
+            return loggy_gradient_kernel(x);
         case RELU:
             return relu_gradient_kernel(x);
         case ELU:
diff --git a/src/activations.c b/src/activations.c
index 5a62ef51..07e3a459 100644
--- a/src/activations.c
+++ b/src/activations.c
@@ -10,6 +10,8 @@ char *get_activation_string(ACTIVATION a)
     switch(a){
         case LOGISTIC:
             return "logistic";
+        case LOGGY:
+            return "loggy";
         case RELU:
             return "relu";
         case ELU:
@@ -35,6 +37,7 @@ char *get_activation_string(ACTIVATION a)
 ACTIVATION get_activation(char *s)
 {
     if (strcmp(s, "logistic")==0) return LOGISTIC;
+    if (strcmp(s, "loggy")==0) return LOGGY;
     if (strcmp(s, "relu")==0) return RELU;
     if (strcmp(s, "elu")==0) return ELU;
     if (strcmp(s, "relie")==0) return RELIE;
@@ -54,6 +57,8 @@ float activate(float x, ACTIVATION a)
             return linear_activate(x);
         case LOGISTIC:
             return logistic_activate(x);
+        case LOGGY:
+            return loggy_activate(x);
         case RELU:
             return relu_activate(x);
         case ELU:
@@ -87,6 +92,8 @@ float gradient(float x, ACTIVATION a)
             return linear_gradient(x);
         case LOGISTIC:
             return logistic_gradient(x);
+        case LOGGY:
+            return loggy_gradient(x);
         case RELU:
             return relu_gradient(x);
         case ELU:
diff --git a/src/activations.h b/src/activations.h
index d824d1e7..78060258 100644
--- a/src/activations.h
+++ b/src/activations.h
@@ -4,7 +4,7 @@
 #include "math.h"
 
 typedef enum{
-    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU
+    LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY
 }ACTIVATION;
 
 ACTIVATION get_activation(char *s);
@@ -21,6 +21,7 @@ void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta);
 
 static inline float linear_activate(float x){return x;}
 static inline float logistic_activate(float x){return 1./(1. + exp(-x));}
+static inline float loggy_activate(float x){return 2./(1. + exp(-x)) - 1;}
 static inline float relu_activate(float x){return x*(x>0);}
 static inline float elu_activate(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);}
 static inline float relie_activate(float x){return x*(x>0);}
@@ -36,6 +37,11 @@ static inline float plse_activate(float x)
 
 static inline float linear_gradient(float x){return 1;}
 static inline float logistic_gradient(float x){return (1-x)*x;}
+static inline float loggy_gradient(float x)
+{
+    float y = (x+1.)/2.;
+    return 2*(1-y)*y;
+}
 static inline float relu_gradient(float x){return (x>0);}
 static inline float elu_gradient(float x){return (x >= 0) + (x < 0)*(x + 1);}
 static inline float relie_gradient(float x){return (x>0) ? 1 : .01;}
diff --git a/src/darknet.c b/src/darknet.c
index 938609ea..c4006cee 100644
--- a/src/darknet.c
+++ b/src/darknet.c
@@ -206,7 +206,6 @@ int main(int argc, char **argv)
     gpu_index = find_int_arg(argc, argv, "-i", 0);
     if(find_arg(argc, argv, "-nogpu")) {
         gpu_index = -1;
-        printf("nogpu\n");
     }
 
 #ifndef GPU
diff --git a/src/layer.h b/src/layer.h
index fc76234f..91042a21 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -34,6 +34,7 @@ struct layer{
     ACTIVATION activation;
     COST_TYPE cost_type;
     int batch_normalize;
+    int shortcut;
     int batch;
     int forced;
     int flipped;
diff --git a/src/nightmare.c b/src/nightmare.c
index bc2060f2..2b1c76cd 100644
--- a/src/nightmare.c
+++ b/src/nightmare.c
@@ -13,7 +13,7 @@ float abs_mean(float *x, int n)
     int i;
     float sum = 0;
     for (i = 0; i < n; ++i){
-        sum += abs(x[i]);
+        sum += fabs(x[i]);
     }
     return sum/n;
 }
diff --git a/src/parser.c b/src/parser.c
index a48f207c..68a9f049 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -176,8 +176,11 @@ layer parse_rnn(list *options, size_params params)
     char *activation_s = option_find_str(options, "activation", "logistic");
     ACTIVATION activation = get_activation(activation_s);
     int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
+    int logistic = option_find_int_quiet(options, "logistic", 0);
 
-    layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize);
+    layer l = make_rnn_layer(params.batch, params.inputs, hidden, output, params.time_steps, activation, batch_normalize, logistic);
+
+    l.shortcut = option_find_int_quiet(options, "shortcut", 0);
 
     return l;
 }
diff --git a/src/rnn.c b/src/rnn.c
index d3e7e51f..aee53ffb 100644
--- a/src/rnn.c
+++ b/src/rnn.c
@@ -19,6 +19,12 @@ float_pair get_rnn_data(char *text, int len, int batch, int steps)
     int i,j;
     for(i = 0; i < batch; ++i){
         int index = rand() %(len - steps - 1);
+        int done = 1;
+        while(!done){
+            index = rand() %(len - steps - 1);
+            while(index < len-steps-1 && text[index++] != '\n');
+            if (index < len-steps-1) done = 1;
+        }
         for(j = 0; j < steps; ++j){
             x[(j*batch + i)*256 + text[index + j]] = 1;
             y[(j*batch + i)*256 + text[index + j + 1]] = 1;
@@ -48,13 +54,13 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename)
     srand(time(0));
     data_seed = time(0);
     char *base = basecfg(cfgfile);
-    printf("%s\n", base);
+    fprintf(stderr, "%s\n", base);
     float avg_loss = -1;
     network net = parse_network_cfg(cfgfile);
     if(weightfile){
         load_weights(&net, weightfile);
     }
-    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
     int batch = net.batch;
     int steps = net.time_steps;
     int i = (*net.seen)/net.batch;
@@ -71,7 +77,7 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename)
         if (avg_loss < 0) avg_loss = loss;
         avg_loss = avg_loss*.9 + loss*.1;
 
-        printf("%d: %f, %f avg, %f rate, %lf seconds\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time));
+        fprintf(stderr, "%d: %f, %f avg, %f rate, %lf seconds\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time));
         if(i%100==0){
             char buff[256];
             sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
@@ -92,7 +98,7 @@ void test_char_rnn(char *cfgfile, char *weightfile, int num, char *seed, float t
 {
     srand(rseed);
     char *base = basecfg(cfgfile);
-    printf("%s\n", base);
+    fprintf(stderr, "%s\n", base);
 
     network net = parse_network_cfg(cfgfile);
     if(weightfile){
@@ -128,6 +134,74 @@ void test_char_rnn(char *cfgfile, char *weightfile, int num, char *seed, float t
     printf("\n");
 }
 
+/*
+ * Compute the log-probability a trained character RNN assigns to a text file.
+ *
+ * The file is read whole, then fed to the network one byte at a time as a
+ * 256-wide one-hot vector; after each byte the log of the network's output
+ * for the byte that actually follows is accumulated, and the total is
+ * printed to stdout.
+ *
+ * cfgfile:    network configuration handed to parse_network_cfg().
+ * weightfile: weights handed to load_weights(); skipped when NULL.
+ * filename:   path of the text to score.
+ */
+void valid_char_rnn(char *cfgfile, char *weightfile, char *filename)
+{
+    FILE *fp = fopen(filename, "r");
+    if(!fp){
+        fprintf(stderr, "Couldn't open file: %s\n", filename);
+        return;
+    }
+
+    fseek(fp, 0, SEEK_END);
+    long end = ftell(fp);
+    fseek(fp, 0, SEEK_SET);
+    if(end < 0){
+        fprintf(stderr, "Couldn't determine size of file: %s\n", filename);
+        fclose(fp);
+        return;
+    }
+
+    /* Bytes are kept unsigned so values >= 0x80 stay valid one-hot indices. */
+    unsigned char *text = calloc((size_t)end + 1, sizeof(unsigned char));
+    float *input = calloc(256, sizeof(float));
+    if(!text || !input){
+        fprintf(stderr, "Out of memory reading %s\n", filename);
+        free(text);
+        free(input);
+        fclose(fp);
+        return;
+    }
+    /* Trust fread's count, not ftell: a short read must not extend the loop. */
+    size_t size = fread(text, 1, (size_t)end, fp);
+    fclose(fp);
+
+    char *base = basecfg(cfgfile);
+    fprintf(stderr, "%s\n", base);
+
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+
+    size_t i;
+    float sum = 0;
+    /* i + 1 < size (rather than i < size - 1) cannot wrap when size is 0. */
+    for(i = 0; i + 1 < size; ++i){
+        unsigned char c = text[i];
+        input[c] = 1;
+        float *out = network_predict(net, input);
+        input[c] = 0;
+        sum += log(out[text[i+1]]);
+    }
+    printf("Log Probability: %f\n", sum);
+
+    free(input);
+    free(text);
+}
+
+
 void run_char_rnn(int argc, char **argv)
 {
     if(argc < 4){
@@ -143,5 +217,6 @@ void run_char_rnn(int argc, char **argv)
     char *cfg = argv[3];
     char *weights = (argc > 4) ? argv[4] : 0;
     if(0==strcmp(argv[2], "train")) train_char_rnn(cfg, weights, filename);
+    else if(0==strcmp(argv[2], "valid")) valid_char_rnn(cfg, weights, filename);
     else if(0==strcmp(argv[2], "test")) test_char_rnn(cfg, weights, len, seed, temp, rseed);
 }
diff --git a/src/rnn_layer.c b/src/rnn_layer.c
index 63582858..e58e0a4b 100644
--- a/src/rnn_layer.c
+++ b/src/rnn_layer.c
@@ -11,9 +11,9 @@
 #include <string.h>
 
 
-layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize)
+layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log)
 {
-    printf("%d %d\n", batch, steps);
+    fprintf(stderr, "RNN Layer: %d inputs, %d outputs\n", inputs, outputs);
     batch = batch / steps;
     layer l = {0};
     l.batch = batch;
@@ -25,14 +25,17 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
     l.state = calloc(batch*hidden, sizeof(float));
 
     l.input_layer = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
     *(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize);
     l.input_layer->batch = batch;
 
     l.self_layer = malloc(sizeof(layer));
-    *(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, activation, batch_normalize);
+    fprintf(stderr, "\t\t");
+    *(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize);
     l.self_layer->batch = batch;
 
     l.output_layer = malloc(sizeof(layer));
+    fprintf(stderr, "\t\t");
     *(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize);
     l.output_layer->batch = batch;
 
@@ -46,7 +49,6 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps,
     l.delta_gpu = l.output_layer->delta_gpu;
     #endif
 
-    fprintf(stderr, "RNN Layer: %d inputs, %d outputs\n", inputs, outputs);
     return l;
 }
 
diff --git a/src/rnn_layer.h b/src/rnn_layer.h
index 8d4f5854..00dc1be9 100644
--- a/src/rnn_layer.h
+++ b/src/rnn_layer.h
@@ -6,7 +6,7 @@
 #include "layer.h"
 #include "network.h"
 
-layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize);
+layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, ACTIVATION activation, int batch_normalize, int log);
 
 void forward_rnn_layer(layer l, network_state state);
 void backward_rnn_layer(layer l, network_state state);