diff --git a/.gitignore b/.gitignore
index 7913d676..deb3dcc5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 *.o
 *.dSYM
 *.csv
+*.out
+mnist/
 images/
 opencv/
 convnet/
diff --git a/nist.cfg b/nist.cfg
index cc9282c0..946fb8e7 100644
--- a/nist.cfg
+++ b/nist.cfg
@@ -2,7 +2,7 @@
 width=28
 height=28
 channels=1
-filters=4
+filters=6
 size=5
 stride=1
 activation=ramp
@@ -11,7 +11,7 @@ activation=ramp
 stride=2
 
 [conv]
-filters=12
+filters=16
 size=5
 stride=1
 activation=ramp
@@ -20,7 +20,7 @@ activation=ramp
 stride=2
 
 [conv]
-filters=10
+filters=120
 size=3
 stride=1
 activation=ramp
@@ -28,6 +28,10 @@ activation=ramp
 [maxpool]
 stride=2
 
+[conn]
+output = 80
+activation=ramp
+
 [conn]
 output = 10
 activation=ramp
diff --git a/src/activations.c b/src/activations.c
index a255f0fd..b8bb79d9 100644
--- a/src/activations.c
+++ b/src/activations.c
@@ -8,15 +8,16 @@ ACTIVATION get_activation(char *s)
 {
     if (strcmp(s, "sigmoid")==0) return SIGMOID;
     if (strcmp(s, "relu")==0) return RELU;
-    if (strcmp(s, "identity")==0) return IDENTITY;
+    if (strcmp(s, "linear")==0) return LINEAR;
     if (strcmp(s, "ramp")==0) return RAMP;
+    if (strcmp(s, "tanh")==0) return TANH;
     fprintf(stderr, "Couldn't find activation function %s, going with ReLU\n", s);
     return RELU;
 }
 
 double activate(double x, ACTIVATION a){
     switch(a){
-        case IDENTITY:
+        case LINEAR:
             return x;
         case SIGMOID:
             return 1./(1.+exp(-x));
@@ -24,12 +25,14 @@ double activate(double x, ACTIVATION a){
             return x*(x>0);
         case RAMP:
             return x*(x>0) + .1*x;
+        case TANH:
+            return (exp(2*x)-1)/(exp(2*x)+1);
     }
     return 0;
 }
 double gradient(double x, ACTIVATION a){
     switch(a){
-        case IDENTITY:
+        case LINEAR:
             return 1;
         case SIGMOID:
             return (1.-x)*x;
@@ -37,35 +40,9 @@ double gradient(double x, ACTIVATION a){
             return (x>0);
         case RAMP:
             return (x>0) + .1;
+        case TANH:
+            return 1-x*x;
     }
     return 0;
 }
 
-double identity_activation(double x)
-{
-    return x;
-}
-double identity_gradient(double x)
-{
-    return 1;
-}
-
-double relu_activation(double x)
-{
-    return x*(x>0);
-}
-double relu_gradient(double x)
-{
-    return (x>0);
-}
-
-double sigmoid_activation(double x)
-{
-    return 1./(1.+exp(-x));
-}
-
-double sigmoid_gradient(double x)
-{
-    return x*(1.-x);
-}
-
diff --git a/src/activations.h b/src/activations.h
index 15d96d3d..889453f6 100644
--- a/src/activations.h
+++ b/src/activations.h
@@ -2,7 +2,7 @@
 #define ACTIVATIONS_H
 
 typedef enum{
-    SIGMOID, RELU, IDENTITY, RAMP
+    SIGMOID, RELU, LINEAR, RAMP, TANH
 }ACTIVATION;
 
 ACTIVATION get_activation(char *s);
diff --git a/src/connected_layer.c b/src/connected_layer.c
index 99f146b5..d769e1fe 100644
--- a/src/connected_layer.c
+++ b/src/connected_layer.c
@@ -8,7 +8,7 @@
 
 connected_layer *make_connected_layer(int inputs, int outputs, ACTIVATION activation)
 {
-    printf("Connected Layer: %d inputs, %d outputs\n", inputs, outputs);
+    fprintf(stderr, "Connected Layer: %d inputs, %d outputs\n", inputs, outputs);
     int i;
     connected_layer *layer = calloc(1, sizeof(connected_layer));
     layer->inputs = inputs;
@@ -29,7 +29,7 @@ connected_layer *make_connected_layer(int inputs, int outputs, ACTIVATION activa
     layer->biases = calloc(outputs, sizeof(double));
     for(i = 0; i < outputs; ++i)
         //layer->biases[i] = rand_normal()*scale + scale;
-        layer->biases[i] = 1;
+        layer->biases[i] = 0;
 
     layer->activation = activation;
     return layer;
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 6d77700b..45b55b8e 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -39,7 +39,7 @@ convolutional_layer *make_convolutional_layer(int h, int w, int c, int n, int si
     layer->w = w;
     layer->c = c;
     layer->n = n;
-    layer->edge = 0;
+    layer->edge = 1;
     layer->stride = stride;
     layer->kernels = calloc(n, sizeof(image));
     layer->kernel_updates = calloc(n, sizeof(image));
@@ -47,10 +47,10 @@ convolutional_layer *make_convolutional_layer(int h, int w, int c, int n, int si
     layer->biases = calloc(n, sizeof(double));
     layer->bias_updates = calloc(n, sizeof(double));
     layer->bias_momentum = calloc(n, sizeof(double));
-    double scale = 20./(size*size*c);
+    double scale = 2./(size*size);
     for(i = 0; i < n; ++i){
         //layer->biases[i] = rand_normal()*scale + scale;
-        layer->biases[i] = 1;
+        layer->biases[i] = 0;
         layer->kernels[i] = make_random_kernel(size, c, scale);
         layer->kernel_updates[i] = make_random_kernel(size, c, 0);
         layer->kernel_momentum[i] = make_random_kernel(size, c, 0);
@@ -63,7 +63,7 @@ convolutional_layer *make_convolutional_layer(int h, int w, int c, int n, int si
         out_h = (layer->h - layer->size)/layer->stride+1;
         out_w = (layer->h - layer->size)/layer->stride+1;
     }
-    printf("Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
+    fprintf(stderr, "Convolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
     layer->output = calloc(out_h * out_w * n, sizeof(double));
     layer->delta = calloc(out_h * out_w * n, sizeof(double));
     layer->upsampled = make_image(h,w,n);
@@ -124,15 +124,22 @@ void backward_convolutional_layer2(convolutional_layer layer, double *input, dou
     }
 }
 
-void learn_convolutional_layer(convolutional_layer layer, double *input)
+void gradient_delta_convolutional_layer(convolutional_layer layer)
 {
     int i;
-    image in_image = double_to_image(layer.h, layer.w, layer.c, input);
     image out_delta = get_convolutional_delta(layer);
     image out_image = get_convolutional_image(layer);
     for(i = 0; i < out_image.h*out_image.w*out_image.c; ++i){
         out_delta.data[i] *= gradient(out_image.data[i], layer.activation);
     }
+}
+
+void learn_convolutional_layer(convolutional_layer layer, double *input)
+{
+    int i;
+    image in_image = double_to_image(layer.h, layer.w, layer.c, input);
+    image out_delta = get_convolutional_delta(layer);
+    gradient_delta_convolutional_layer(layer);
     for(i = 0; i < layer.n; ++i){
         kernel_update(in_image, layer.kernel_updates[i], layer.stride, i, out_delta, layer.edge);
         layer.bias_updates[i] += avg_image_layer(out_delta, i);
diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c
index 5a82e0b2..ccf9bee0 100644
--- a/src/maxpool_layer.c
+++ b/src/maxpool_layer.c
@@ -19,7 +19,7 @@ image get_maxpool_delta(maxpool_layer layer)
 
 maxpool_layer *make_maxpool_layer(int h, int w, int c, int stride)
 {
-    printf("Maxpool Layer: %d x %d x %d image, %d stride\n", h,w,c,stride);
+    fprintf(stderr, "Maxpool Layer: %d x %d x %d image, %d stride\n", h,w,c,stride);
     maxpool_layer *layer = calloc(1, sizeof(maxpool_layer));
     layer->h = h;
     layer->w = w;
diff --git a/src/network.c b/src/network.c
index cce673c2..faedb8cb 100644
--- a/src/network.c
+++ b/src/network.c
@@ -276,10 +276,10 @@ void print_network(network net)
         }
         double mean = mean_array(output, n);
         double vari = variance_array(output, n);
-        printf("Layer %d - Mean: %f, Variance: %f\n",i,mean, vari);
+        fprintf(stderr, "Layer %d - Mean: %f, Variance: %f\n",i,mean, vari);
         if(n > 100) n = 100;
-        for(j = 0; j < n; ++j) printf("%f, ", output[j]);
-        if(n == 100)printf(".....\n");
-        printf("\n");
+        for(j = 0; j < n; ++j) fprintf(stderr, "%f, ", output[j]);
+        if(n == 100)fprintf(stderr,".....\n");
+        fprintf(stderr, "\n");
     }
 }
diff --git a/src/softmax_layer.c b/src/softmax_layer.c
index 28696b70..b213e5b0 100644
--- a/src/softmax_layer.c
+++ b/src/softmax_layer.c
@@ -5,7 +5,7 @@
 
 softmax_layer *make_softmax_layer(int inputs)
 {
-    printf("Softmax Layer: %d inputs\n", inputs);
+    fprintf(stderr, "Softmax Layer: %d inputs\n", inputs);
     softmax_layer *layer = calloc(1, sizeof(softmax_layer));
     layer->inputs = inputs;
     layer->output = calloc(inputs, sizeof(double));
diff --git a/src/tests.c b/src/tests.c
index 722de1ae..c221042a 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -15,7 +15,6 @@ void test_convolve()
 {
     image dog = load_image("dog.jpg");
-    //show_image_layers(dog, "Dog");
     printf("dog channels %d\n", dog.c);
     image kernel = make_random_image(3,3,dog.c);
     image edge = make_image(dog.h, dog.w, 1);
@@ -88,7 +87,7 @@ void verify_convolutional_layer()
     image out_delta = get_convolutional_delta(layer);
     for(i = 0; i < out.h*out.w*out.c; ++i){
         out_delta.data[i] = 1;
-        backward_convolutional_layer2(layer, test.data, in_delta.data);
+        backward_convolutional_layer(layer, test.data, in_delta.data);
         image partial = copy_image(in_delta);
         jacobian2[i] = partial.data;
         out_delta.data[i] = 0;
@@ -156,7 +155,7 @@ void test_parser()
 
     int count = 0;
     double avgerr = 0;
-    while(1){
+    while(++count < 100000000){
         double v = ((double)rand()/RAND_MAX);
         double truth = v*v;
         input[0] = v;
@@ -165,8 +164,7 @@ void test_parser()
         double *delta = get_network_delta(net);
         double err = pow((out[0]-truth),2.);
         avgerr = .99 * avgerr + .01 * err;
-        //if(++count % 100000 == 0) printf("%f\n", avgerr);
-        if(++count % 1000000 == 0) printf("%f %f :%f AVG %f \n", truth, out[0], err, avgerr);
+        if(count % 1000000 == 0) printf("%f %f :%f AVG %f \n", truth, out[0], err, avgerr);
         delta[0] = truth - out[0];
         learn_network(net, input);
         update_network(net, .001);
@@ -197,15 +195,16 @@ void test_full()
     }
 }
 
-double error_network(network net, matrix m, double *truth)
+double error_network(network net, matrix m, double **truth)
 {
     int i;
     int correct = 0;
+    int k = get_network_output_size(net);
     for(i = 0; i < m.rows; ++i){
         forward_network(net, m.vals[i]);
         double *out = get_network_output(net);
-        double err = truth[i] - out[0];
-        if(fabs(err) < .5) ++correct;
+        int guess = max_index(out, k);
+        if(truth[i][guess]) ++correct;
     }
     return (double)correct/m.rows;
 }
@@ -224,24 +223,35 @@ double **one_hot(double *a, int n, int k)
 
 void test_nist()
 {
+    srand(999999);
     network net = parse_network_cfg("nist.cfg");
-    matrix m = csv_to_matrix("images/nist_train.csv");
-    matrix ho = hold_out_matrix(&m, 3000);
+    matrix m = csv_to_matrix("mnist/mnist_train.csv");
+    matrix test = csv_to_matrix("mnist/mnist_test.csv");
     double *truth_1d = pop_column(&m, 0);
     double **truth = one_hot(truth_1d, m.rows, 10);
-    double *ho_truth_1d = pop_column(&ho, 0);
-    double **ho_truth = one_hot(ho_truth_1d, ho.rows, 10);
+    double *test_truth_1d = pop_column(&test, 0);
+    double **test_truth = one_hot(test_truth_1d, test.rows, 10);
     int i,j;
     clock_t start = clock(), end;
+    for(i = 0; i < test.rows; ++i){
+        normalize_array(test.vals[i], 28*28);
+        //scale_array(m.vals[i], 28*28, 1./255.);
+        //translate_array(m.vals[i], 28*28, -.1);
+    }
+    for(i = 0; i < m.rows; ++i){
+        normalize_array(m.vals[i], 28*28);
+        //scale_array(m.vals[i], 28*28, 1./255.);
+        //translate_array(m.vals[i], 28*28, -.1);
+    }
     int count = 0;
-    double lr = .0001;
-    while(++count <= 3000000){
+    double lr = .0005;
+    while(++count <= 300){
         //lr *= .99;
         int index = 0;
         int correct = 0;
-        for(i = 0; i < 1000; ++i){
+        int number = 1000;
+        for(i = 0; i < number; ++i){
             index = rand()%m.rows;
-            normalize_array(m.vals[index], 28*28);
             forward_network(net, m.vals[index]);
             double *out = get_network_output(net);
             double *delta = get_network_delta(net);
@@ -260,19 +270,29 @@ void test_nist()
         }
         print_network(net);
         image input = double_to_image(28,28,1, m.vals[index]);
-        show_image(input, "Input");
+        //show_image(input, "Input");
         image o = get_network_image(net);
-        show_image_collapsed(o, "Output");
+        //show_image_collapsed(o, "Output");
         visualize_network(net);
-        cvWaitKey(100);
+        cvWaitKey(10);
         //double test_acc = error_network(net, m, truth);
-        double valid_acc = error_network(net, ho, ho_truth);
-        //printf("%f, %f\n", test_acc, valid_acc);
-        fprintf(stderr, "%5d: %f %f\n",count, (double)correct/1000, lr);
-        //if(valid_acc > .70) break;
+        fprintf(stderr, "\n%5d: %f %f\n\n",count, (double)correct/number, lr);
+        if(count % 10 == 0 && 0){
+            double train_acc = error_network(net, m, truth);
+            fprintf(stderr, "\nTRAIN: %f\n", train_acc);
+            double test_acc = error_network(net, test, test_truth);
+            fprintf(stderr, "TEST: %f\n\n", test_acc);
+            printf("%d, %f, %f\n", count, train_acc, test_acc);
+        }
+        if(count % (m.rows/number) == 0) lr /= 2;
     }
+    double train_acc = error_network(net, m, truth);
+    fprintf(stderr, "\nTRAIN: %f\n", train_acc);
+    double test_acc = error_network(net, test, test_truth);
+    fprintf(stderr, "TEST: %f\n\n", test_acc);
+    printf("%d, %f, %f\n", count, train_acc, test_acc);
     end = clock();
-    printf("Neural Net Learning: %lf seconds\n", (double)(end-start)/CLOCKS_PER_SEC);
+    //printf("Neural Net Learning: %lf seconds\n", (double)(end-start)/CLOCKS_PER_SEC);
 }
 
 void test_kernel_update()
@@ -281,14 +301,14 @@
 {
     double delta[] = {.1};
     double input[] = {.3, .5, .3, .5, .5, .5, .5, .0, .5};
     double kernel[] = {1,2,3,4,5,6,7,8,9};
-    convolutional_layer layer = *make_convolutional_layer(3, 3, 1, 1, 3, 1, IDENTITY);
+    convolutional_layer layer = *make_convolutional_layer(3, 3, 1, 1, 3, 1, LINEAR);
     layer.kernels[0].data = kernel;
     layer.delta = delta;
     learn_convolutional_layer(layer, input);
     print_image(layer.kernels[0]);
     print_image(get_convolutional_delta(layer));
     print_image(layer.kernel_updates[0]);
-    
+
 }
 
 void test_random_classify()
@@ -311,15 +331,15 @@ void test_random_classify()
             double *delta = get_network_delta(net);
             //printf("%f\n", out[0]);
             delta[0] = truth[index] - out[0];
-           // printf("%f\n", delta[0]);
+            // printf("%f\n", delta[0]);
             //printf("%f %f\n", truth[index], out[0]);
             learn_network(net, m.vals[index]);
             update_network(net, .00001);
         }
-        double test_acc = error_network(net, m, truth);
-        double valid_acc = error_network(net, ho, ho_truth);
-        printf("%f, %f\n", test_acc, valid_acc);
-        fprintf(stderr, "%5d: %f Valid: %f\n",count, test_acc, valid_acc);
+        //double test_acc = error_network(net, m, truth);
+        //double valid_acc = error_network(net, ho, ho_truth);
+        //printf("%f, %f\n", test_acc, valid_acc);
+        //fprintf(stderr, "%5d: %f Valid: %f\n",count, test_acc, valid_acc);
         //if(valid_acc > .70) break;
     }
     end = clock();
@@ -362,8 +382,8 @@
 int main()
 {
     //test_kernel_update();
-    //test_nist();
-    test_full();
+    test_nist();
+    //test_full();
     //test_random_preprocess();
     //test_random_classify();
     //test_parser();
diff --git a/src/utils.c b/src/utils.c
index 8229b2d0..3b8b5a80 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -180,6 +180,35 @@ void normalize_array(double *a, int n)
     sigma = sqrt(variance_array(a,n));
 }
 
+void translate_array(double *a, int n, double s)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        a[i] += s;
+    }
+}
+
+void scale_array(double *a, int n, double s)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        a[i] *= s;
+    }
+}
+int max_index(double *a, int n)
+{
+    if(n <= 0) return -1;
+    int i, max_i = 0;
+    double max = a[0];
+    for(i = 1; i < n; ++i){
+        if(a[i] > max){
+            max = a[i];
+            max_i = i;
+        }
+    }
+    return max_i;
+}
+
 double rand_normal()
 {
     int i;
diff --git a/src/utils.h b/src/utils.h
index 35217782..04747a4d 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -15,6 +15,9 @@ char *copy_string(char *s);
 int count_fields(char *line);
 double *parse_fields(char *line, int n);
 void normalize_array(double *a, int n);
+void scale_array(double *a, int n, double s);
+void translate_array(double *a, int n, double s);
+int max_index(double *a, int n);
 double constrain(double a, double max);
 double rand_normal();
 double mean_array(double *a, int n);
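
A note on the TANH case added to src/activations.c: activate() evaluates tanh through the identity tanh(x) = (e^(2x) - 1)/(e^(2x) + 1), and gradient() follows the same convention as the existing SIGMOID case in that its argument is the layer's stored output y = tanh(x), not the pre-activation, so "return 1-x*x;" there really computes 1 - y^2 = tanh'(x). Below is a minimal standalone check of that identity against a central-difference numerical derivative; it is illustrative only and not part of the patch.

/* Sanity check for the TANH activation/gradient pair (standalone, not repository code). */
#include <math.h>
#include <stdio.h>

int main()
{
    double eps = 1e-6;
    int i;
    for(i = -4; i <= 4; ++i){
        double x = i/2.;
        double y = (exp(2*x)-1)/(exp(2*x)+1);   /* same formula as activate(x, TANH) */
        double g = 1 - y*y;                     /* same formula as gradient(y, TANH) */
        /* central-difference estimate of d/dx tanh(x) */
        double numeric = ((exp(2*(x+eps))-1)/(exp(2*(x+eps))+1)
                        - (exp(2*(x-eps))-1)/(exp(2*(x-eps))+1))/(2*eps);
        printf("x=%+.2f  analytic=%f  numeric=%f\n", x, g, numeric);
    }
    return 0;
}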
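
Similarly, the reworked error_network() in src/tests.c now measures top-1 classification accuracy instead of thresholding a scalar regression error: truth rows are one-hot vectors built by one_hot(), and a sample counts as correct when the arg-max of the network output, found with the new max_index() helper in src/utils.c, lands on the hot index. Note also the schedule "if(count % (m.rows/number) == 0) lr /= 2;" in test_nist(): each iteration draws number = 1000 random samples, so the learning rate halves every m.rows/number iterations, roughly one epoch's worth of data (every 60 iterations for MNIST's 60000 training rows). A hedged sketch of the scoring step follows, with max_index() copied from the patch and invented output/label values.

#include <stdio.h>

/* max_index() as added to src/utils.c */
int max_index(double *a, int n)
{
    if(n <= 0) return -1;
    int i, max_i = 0;
    double max = a[0];
    for(i = 1; i < n; ++i){
        if(a[i] > max){
            max = a[i];
            max_i = i;
        }
    }
    return max_i;
}

int main()
{
    /* made-up network output and one-hot label for the digit "3" */
    double out[10]   = {.01,.02,.03,.60,.05,.04,.05,.10,.05,.05};
    double truth[10] = {  0,  0,  0,  1,  0,  0,  0,  0,  0,  0};
    int guess = max_index(out, 10);
    printf("guess=%d correct=%d\n", guess, truth[guess] != 0);
    return 0;
}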