diff --git a/.gitignore b/.gitignore
index 7913d676..deb3dcc5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 *.o
 *.dSYM
 *.csv
+*.out
+mnist/
 images/
 opencv/
 convnet/
diff --git a/nist.cfg b/nist.cfg
index cc9282c0..946fb8e7 100644
--- a/nist.cfg
+++ b/nist.cfg
@@ -2,7 +2,7 @@
 width=28
 height=28
 channels=1
-filters=4
+filters=6
 size=5
 stride=1
 activation=ramp
@@ -11,7 +11,7 @@ activation=ramp
 stride=2
 
 [conv]
-filters=12
+filters=16
 size=5
 stride=1
 activation=ramp
@@ -20,7 +20,7 @@ activation=ramp
 stride=2
 
 [conv]
-filters=10
+filters=120
 size=3
 stride=1
 activation=ramp
@@ -28,6 +28,10 @@ activation=ramp
 [maxpool]
 stride=2
 
+[conn]
+output = 80
+activation=ramp
+
 [conn]
 output = 10
 activation=ramp
diff --git a/src/activations.c b/src/activations.c
index a255f0fd..b8bb79d9 100644
--- a/src/activations.c
+++ b/src/activations.c
@@ -8,15 +8,16 @@ ACTIVATION get_activation(char *s)
 {
     if (strcmp(s, "sigmoid")==0) return SIGMOID;
     if (strcmp(s, "relu")==0) return RELU;
-    if (strcmp(s, "identity")==0) return IDENTITY;
+    if (strcmp(s, "linear")==0) return LINEAR;
     if (strcmp(s, "ramp")==0) return RAMP;
+    if (strcmp(s, "tanh")==0) return TANH;
     fprintf(stderr, "Couldn't find activation function %s, going with ReLU\n", s);
     return RELU;
 }
 
 double activate(double x, ACTIVATION a){
     switch(a){
-        case IDENTITY:
+        case LINEAR:
             return x;
         case SIGMOID:
             return 1./(1.+exp(-x));
@@ -24,12 +25,14 @@ double activate(double x, ACTIVATION a){
             return x*(x>0);
         case RAMP:
             return x*(x>0) + .1*x;
+        case TANH:
+            return (exp(2*x)-1)/(exp(2*x)+1);
     }
     return 0;
 }
 double gradient(double x, ACTIVATION a){
     switch(a){
-        case IDENTITY:
+        case LINEAR:
             return 1;
         case SIGMOID:
             return (1.-x)*x;
@@ -37,35 +40,9 @@ double gradient(double x, ACTIVATION a){
             return (x>0);
         case RAMP:
             return (x>0) + .1;
+        case TANH:
+            return 1-x*x;
     }
     return 0;
 }
 
-double identity_activation(double x)
-{
-    return x;
-}
-double identity_gradient(double x)
-{
-    return 1;
-}
-
-double relu_activation(double x)
-{
-    return x*(x>0);
-}
-double relu_gradient(double x)
-{
-    return (x>0);
-}
-
-double sigmoid_activation(double x)
-{
-    return 1./(1.+exp(-x));
-}
-
-double sigmoid_gradient(double x)
-{
-    return x*(1.-x);
-}
-
diff --git a/src/activations.h b/src/activations.h
index 15d96d3d..889453f6 100644
--- a/src/activations.h
+++ b/src/activations.h
@@ -2,7 +2,7 @@
 #define ACTIVATIONS_H
 
 typedef enum{
-    SIGMOID, RELU, IDENTITY, RAMP
+    SIGMOID, RELU, LINEAR, RAMP, TANH
 }ACTIVATION;
 
 ACTIVATION get_activation(char *s);
diff --git a/src/connected_layer.c b/src/connected_layer.c
index 99f146b5..d769e1fe 100644
--- a/src/connected_layer.c
+++ b/src/connected_layer.c
@@ -8,7 +8,7 @@
 
 connected_layer *make_connected_layer(int inputs, int outputs, ACTIVATION activation)
 {
-    printf("Connected Layer: %d inputs, %d outputs\n", inputs, outputs);
+    fprintf(stderr, "Connected Layer: %d inputs, %d outputs\n", inputs, outputs);
     int i;
     connected_layer *layer = calloc(1, sizeof(connected_layer));
     layer->inputs = inputs;
@@ -29,7 +29,7 @@ connected_layer *make_connected_layer(int inputs, int outputs, ACTIVATION activa
     layer->biases = calloc(outputs, sizeof(double));
     for(i = 0; i < outputs; ++i)
         //layer->biases[i] = rand_normal()*scale + scale;
-        layer->biases[i] = 1;
+        layer->biases[i] = 0;
 
     layer->activation = activation;
     return layer;
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 6d77700b..45b55b8e 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -39,7 +39,7 @@ convolutional_layer *make_convolutional_layer(int h, int w, int c, int n, int si
     layer->w = w;
     layer->c = c;
     layer->n = n;
-    layer->edge = 0;
+    layer->edge = 1;
     layer->stride = stride;
     layer->kernels = calloc(n, sizeof(image));
     layer->kernel_updates = calloc(n, sizeof(image));
@@ -47,10 +47,10 @@ convolutional_layer *make_convolutional_layer(int h, int w, int c, int n, int si
     layer->biases = calloc(n, sizeof(double));
     layer->bias_updates = calloc(n, sizeof(double));
     layer->bias_momentum = calloc(n, sizeof(double));
-    double scale = 20./(size*size*c);
+    double scale = 2./(size*size);
     for(i = 0; i < n; ++i){
         //layer->biases[i] = rand_normal()*scale + scale;
-        layer->biases[i] = 1;
+        layer->biases[i] = 0;
         layer->kernels[i] = make_random_kernel(size, c, scale);
         layer->kernel_updates[i] = make_random_kernel(size, c, 0);
         layer->kernel_momentum[i] = make_random_kernel(size, c, 0);
@@ -63,7 +63,7 @@ convolutional_layer *make_convolutional_layer(int h, int w, int c, int n, int si
         out_h = (layer->h - layer->size)/layer->stride+1;
         out_w = (layer->h - layer->size)/layer->stride+1;
     }
-    printf("Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
+    fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
     layer->output = calloc(out_h * out_w * n, sizeof(double));
     layer->delta = calloc(out_h * out_w * n, sizeof(double));
     layer->upsampled = make_image(h,w,n);
@@ -124,15 +124,22 @@ void backward_convolutional_layer2(convolutional_layer layer, double *input, dou
     }
 }
 
-void learn_convolutional_layer(convolutional_layer layer, double *input)
+void gradient_delta_convolutional_layer(convolutional_layer layer)
 {
     int i;
-    image in_image = double_to_image(layer.h, layer.w, layer.c, input);
     image out_delta = get_convolutional_delta(layer);
     image out_image = get_convolutional_image(layer);
     for(i = 0; i < out_image.h*out_image.w*out_image.c; ++i){
         out_delta.data[i] *= gradient(out_image.data[i], layer.activation);
     }
+}
+
+void learn_convolutional_layer(convolutional_layer layer, double *input)
+{
+    int i;
+    image in_image = double_to_image(layer.h, layer.w, layer.c, input);
+    image out_delta = get_convolutional_delta(layer);
+    gradient_delta_convolutional_layer(layer);
     for(i = 0; i < layer.n; ++i){
         kernel_update(in_image, layer.kernel_updates[i], layer.stride, i, out_delta, layer.edge);
         layer.bias_updates[i] += avg_image_layer(out_delta, i);
diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c
index 5a82e0b2..ccf9bee0 100644
--- a/src/maxpool_layer.c
+++ b/src/maxpool_layer.c
@@ -19,7 +19,7 @@ image get_maxpool_delta(maxpool_layer layer)
 
 maxpool_layer *make_maxpool_layer(int h, int w, int c, int stride)
 {
-    printf("Maxpool Layer: %d x %d x %d image, %d stride\n", h,w,c,stride);
+    fprintf(stderr, "Maxpool Layer: %d x %d x %d image, %d stride\n", h,w,c,stride);
     maxpool_layer *layer = calloc(1, sizeof(maxpool_layer));
     layer->h = h;
     layer->w = w;
diff --git a/src/network.c b/src/network.c
index cce673c2..faedb8cb 100644
--- a/src/network.c
+++ b/src/network.c
@@ -276,10 +276,10 @@ void print_network(network net)
         }
         double mean = mean_array(output, n);
         double vari = variance_array(output, n);
-        printf("Layer %d - Mean: %f, Variance: %f\n",i,mean, vari);
+        fprintf(stderr, "Layer %d - Mean: %f, Variance: %f\n",i,mean, vari);
         if(n > 100) n = 100;
-        for(j = 0; j < n; ++j) printf("%f, ", output[j]);
-        if(n == 100)printf(".....\n");
-        printf("\n");
+        for(j = 0; j < n; ++j) fprintf(stderr, "%f, ", output[j]);
+        if(n == 100)fprintf(stderr,".....\n");
+        fprintf(stderr, "\n");
     }
 }
diff --git a/src/softmax_layer.c b/src/softmax_layer.c
index 28696b70..b213e5b0 100644
--- a/src/softmax_layer.c
+++ b/src/softmax_layer.c
@@ -5,7 +5,7 @@
 
 softmax_layer *make_softmax_layer(int inputs)
 {
-    printf("Softmax Layer: %d inputs\n", inputs);
+    fprintf(stderr, "Softmax Layer: %d inputs\n", inputs);
     softmax_layer *layer = calloc(1, sizeof(softmax_layer));
     layer->inputs = inputs;
     layer->output = calloc(inputs, sizeof(double));
diff --git a/src/tests.c b/src/tests.c
index 722de1ae..c221042a 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -15,7 +15,6 @@ void test_convolve()
 {
     image dog = load_image("dog.jpg");
-    //show_image_layers(dog, "Dog");
     printf("dog channels %d\n", dog.c);
     image kernel = make_random_image(3,3,dog.c);
     image edge = make_image(dog.h, dog.w, 1);
@@ -88,7 +87,7 @@ void verify_convolutional_layer()
     image out_delta = get_convolutional_delta(layer);
     for(i = 0; i < out.h*out.w*out.c; ++i){
         out_delta.data[i] = 1;
-        backward_convolutional_layer2(layer, test.data, in_delta.data);
+        backward_convolutional_layer(layer, test.data, in_delta.data);
         image partial = copy_image(in_delta);
         jacobian2[i] = partial.data;
         out_delta.data[i] = 0;
@@ -156,7 +155,7 @@ void test_parser()
 
     int count = 0;
     double avgerr = 0;
-    while(1){
+    while(++count < 100000000){
         double v = ((double)rand()/RAND_MAX);
         double truth = v*v;
         input[0] = v;
@@ -165,8 +164,7 @@ void test_parser()
         double *delta = get_network_delta(net);
         double err = pow((out[0]-truth),2.);
         avgerr = .99 * avgerr + .01 * err;
-        //if(++count % 100000 == 0) printf("%f\n", avgerr);
-        if(++count % 1000000 == 0) printf("%f %f :%f AVG %f \n", truth, out[0], err, avgerr);
+        if(count % 1000000 == 0) printf("%f %f :%f AVG %f \n", truth, out[0], err, avgerr);
         delta[0] = truth - out[0];
         learn_network(net, input);
         update_network(net, .001);
@@ -197,15 +195,16 @@ void test_full()
     }
 }
 
-double error_network(network net, matrix m, double *truth)
+double error_network(network net, matrix m, double **truth)
 {
     int i;
     int correct = 0;
+    int k = get_network_output_size(net);
     for(i = 0; i < m.rows; ++i){
         forward_network(net, m.vals[i]);
         double *out = get_network_output(net);
-        double err = truth[i] - out[0];
-        if(fabs(err) < .5) ++correct;
+        int guess = max_index(out, k);
+        if(truth[i][guess]) ++correct;
     }
     return (double)correct/m.rows;
 }
@@ -224,24 +223,35 @@ double **one_hot(double *a, int n, int k)
 
 void test_nist()
 {
+    srand(999999);
     network net = parse_network_cfg("nist.cfg");
-    matrix m = csv_to_matrix("images/nist_train.csv");
-    matrix ho = hold_out_matrix(&m, 3000);
+    matrix m = csv_to_matrix("mnist/mnist_train.csv");
+    matrix test = csv_to_matrix("mnist/mnist_test.csv");
     double *truth_1d = pop_column(&m, 0);
     double **truth = one_hot(truth_1d, m.rows, 10);
-    double *ho_truth_1d = pop_column(&ho, 0);
-    double **ho_truth = one_hot(ho_truth_1d, ho.rows, 10);
+    double *test_truth_1d = pop_column(&test, 0);
+    double **test_truth = one_hot(test_truth_1d, test.rows, 10);
     int i,j;
     clock_t start = clock(), end;
+    for(i = 0; i < test.rows; ++i){
+        normalize_array(test.vals[i], 28*28);
+        //scale_array(m.vals[i], 28*28, 1./255.);
+        //translate_array(m.vals[i], 28*28, -.1);
+    }
+    for(i = 0; i < m.rows; ++i){
+        normalize_array(m.vals[i], 28*28);
+        //scale_array(m.vals[i], 28*28, 1./255.);
+        //translate_array(m.vals[i], 28*28, -.1);
+    }
     int count = 0;
-    double lr = .0001;
-    while(++count <= 3000000){
+    double lr = .0005;
+    while(++count <= 300){
         //lr *= .99;
         int index = 0;
         int correct = 0;
-        for(i = 0; i < 1000; ++i){
+        int number = 1000;
+        for(i = 0; i < number; ++i){
             index = rand()%m.rows;
-            normalize_array(m.vals[index], 28*28);
             forward_network(net, m.vals[index]);
             double *out = get_network_output(net);
             double *delta = get_network_delta(net);
@@ -260,19 +270,29 @@ void test_nist()
         }
         print_network(net);
         image input = double_to_image(28,28,1, m.vals[index]);
-        show_image(input, "Input");
+        //show_image(input, "Input");
         image o = get_network_image(net);
-        show_image_collapsed(o, "Output");
+        //show_image_collapsed(o, "Output");
         visualize_network(net);
-        cvWaitKey(100);
+        cvWaitKey(10);
         //double test_acc = error_network(net, m, truth);
-        double valid_acc = error_network(net, ho, ho_truth);
-        //printf("%f, %f\n", test_acc, valid_acc);
-        fprintf(stderr, "%5d: %f %f\n",count, (double)correct/1000, lr);
-        //if(valid_acc > .70) break;
+        fprintf(stderr, "\n%5d: %f %f\n\n",count, (double)correct/number, lr);
+        if(count % 10 == 0 && 0){
+            double train_acc = error_network(net, m, truth);
+            fprintf(stderr, "\nTRAIN: %f\n", train_acc);
+            double test_acc = error_network(net, test, test_truth);
+            fprintf(stderr, "TEST: %f\n\n", test_acc);
+            printf("%d, %f, %f\n", count, train_acc, test_acc);
+        }
+        if(count % (m.rows/number) == 0) lr /= 2;
     }
+    double train_acc = error_network(net, m, truth);
+    fprintf(stderr, "\nTRAIN: %f\n", train_acc);
+    double test_acc = error_network(net, test, test_truth);
+    fprintf(stderr, "TEST: %f\n\n", test_acc);
+    printf("%d, %f, %f\n", count, train_acc, test_acc);
     end = clock();
-    printf("Neural Net Learning: %lf seconds\n", (double)(end-start)/CLOCKS_PER_SEC);
+    //printf("Neural Net Learning: %lf seconds\n", (double)(end-start)/CLOCKS_PER_SEC);
 }
 
 void test_kernel_update()
@@ -281,14 +301,14 @@
 {
     double delta[] = {.1};
     double input[] = {.3, .5, .3, .5, .5, .5, .5, .0, .5};
     double kernel[] = {1,2,3,4,5,6,7,8,9};
-    convolutional_layer layer = *make_convolutional_layer(3, 3, 1, 1, 3, 1, IDENTITY);
+    convolutional_layer layer = *make_convolutional_layer(3, 3, 1, 1, 3, 1, LINEAR);
     layer.kernels[0].data = kernel;
     layer.delta = delta;
     learn_convolutional_layer(layer, input);
     print_image(layer.kernels[0]);
     print_image(get_convolutional_delta(layer));
     print_image(layer.kernel_updates[0]);
-    
+
 }
 
 void test_random_classify()
@@ -311,15 +331,15 @@ void test_random_classify()
             double *delta = get_network_delta(net);
             //printf("%f\n", out[0]);
             delta[0] = truth[index] - out[0];
-           // printf("%f\n", delta[0]);
+            // printf("%f\n", delta[0]);
             //printf("%f %f\n", truth[index], out[0]);
             learn_network(net, m.vals[index]);
             update_network(net, .00001);
         }
-        double test_acc = error_network(net, m, truth);
-        double valid_acc = error_network(net, ho, ho_truth);
-        printf("%f, %f\n", test_acc, valid_acc);
-        fprintf(stderr, "%5d: %f Valid: %f\n",count, test_acc, valid_acc);
+        //double test_acc = error_network(net, m, truth);
+        //double valid_acc = error_network(net, ho, ho_truth);
+        //printf("%f, %f\n", test_acc, valid_acc);
+        //fprintf(stderr, "%5d: %f Valid: %f\n",count, test_acc, valid_acc);
         //if(valid_acc > .70) break;
     }
     end = clock();
@@ -362,8 +382,8 @@
 int main()
 {
     //test_kernel_update();
-    //test_nist();
-    test_full();
+    test_nist();
+    //test_full();
     //test_random_preprocess();
     //test_random_classify();
     //test_parser();
diff --git a/src/utils.c b/src/utils.c
index 8229b2d0..3b8b5a80 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -180,6 +180,35 @@ void normalize_array(double *a, int n)
     sigma = sqrt(variance_array(a,n));
 }
 
+void translate_array(double *a, int n, double s)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        a[i] += s;
+    }
+}
+
+void scale_array(double *a, int n, double s)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        a[i] *= s;
+    }
+}
+int max_index(double *a, int n)
+{
+    if(n <= 0) return -1;
+    int i, max_i = 0;
+    double max = a[0];
+    for(i = 1; i < n; ++i){
+        if(a[i] > max){
+            max = a[i];
+            max_i = i;
+        }
+    }
+    return max_i;
+}
+
 double rand_normal()
 {
     int i;
diff --git a/src/utils.h b/src/utils.h
index 35217782..04747a4d 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -15,6 +15,9 @@ char *copy_string(char *s);
 int count_fields(char *line);
 double *parse_fields(char *line, int n);
 void normalize_array(double *a, int n);
+void scale_array(double *a, int n, double s);
+void translate_array(double *a, int n, double s);
+int max_index(double *a, int n);
 double constrain(double a, double max);
 double rand_normal();
 double mean_array(double *a, int n);
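
A note on the TANH case added to src/activations.c: activate() evaluates tanh through the identity tanh(x) = (e^(2x) - 1)/(e^(2x) + 1), and gradient() follows the same convention as the existing SIGMOID case in that its argument is the layer's stored output y = tanh(x), not the pre-activation, so "return 1-x*x;" there really computes 1 - y^2 = tanh'(x). Below is a minimal standalone check of that identity against a central-difference numerical derivative; it is illustrative only and not part of the patch.

/* Sanity check for the TANH activation/gradient pair (standalone, not repository code). */
#include <math.h>
#include <stdio.h>

int main()
{
    double eps = 1e-6;
    int i;
    for(i = -4; i <= 4; ++i){
        double x = i/2.;
        double y = (exp(2*x)-1)/(exp(2*x)+1);   /* same formula as activate(x, TANH) */
        double g = 1 - y*y;                     /* same formula as gradient(y, TANH) */
        /* central-difference estimate of d/dx tanh(x) */
        double numeric = ((exp(2*(x+eps))-1)/(exp(2*(x+eps))+1)
                        - (exp(2*(x-eps))-1)/(exp(2*(x-eps))+1))/(2*eps);
        printf("x=%+.2f  analytic=%f  numeric=%f\n", x, g, numeric);
    }
    return 0;
}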
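
Similarly, the reworked error_network() in src/tests.c now measures top-1 classification accuracy instead of thresholding a scalar regression error: truth rows are one-hot vectors built by one_hot(), and a sample counts as correct when the arg-max of the network output, found with the new max_index() helper in src/utils.c, lands on the hot index. Note also the schedule "if(count % (m.rows/number) == 0) lr /= 2;" in test_nist(): each iteration draws number = 1000 random samples, so the learning rate halves every m.rows/number iterations, roughly one epoch's worth of data (every 60 iterations for MNIST's 60000 training rows). A hedged sketch of the scoring step follows, with max_index() copied from the patch and invented output/label values.

#include <stdio.h>

/* max_index() as added to src/utils.c */
int max_index(double *a, int n)
{
    if(n <= 0) return -1;
    int i, max_i = 0;
    double max = a[0];
    for(i = 1; i < n; ++i){
        if(a[i] > max){
            max = a[i];
            max_i = i;
        }
    }
    return max_i;
}

int main()
{
    /* made-up network output and one-hot label for the digit "3" */
    double out[10]   = {.01,.02,.03,.60,.05,.04,.05,.10,.05,.05};
    double truth[10] = {  0,  0,  0,  1,  0,  0,  0,  0,  0,  0};
    int guess = max_index(out, 10);
    printf("guess=%d correct=%d\n", guess, truth[guess] != 0);
    return 0;
}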