diff --git a/Makefile b/Makefile
index 91164081..04e30f35 100644
--- a/Makefile
+++ b/Makefile
@@ -34,7 +34,7 @@ CFLAGS+= -DGPU
 LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
 endif
 
-OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o
+OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o
 ifeq ($(GPU), 1) 
 OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o avgpool_layer_kernels.o yolo_kernels.o coco_kernels.o
 endif
diff --git a/cfg/msr_152.cfg b/cfg/msr_152.cfg
index 5d5a3b25..b19c999d 100644
--- a/cfg/msr_152.cfg
+++ b/cfg/msr_152.cfg
@@ -1,13 +1,16 @@
 [net]
-batch=256
-subdivisions=16
+batch=128
+subdivisions=8
 height=256
 width=256
 channels=3
 momentum=0.9
-decay=0.0005
+decay=0.0001
 
-learning_rate=0.02
+learning_rate=0.1
+policy=poly
+power=4
+max_batches=500000
 
 [crop]
 crop_height=224
@@ -57,10 +60,22 @@ filters=256
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
+
+[route]
+layers=-4
+
+[convolutional]
+batch_normalize=1
+size=1
+stride=1
+pad=1
+activation=linear
+filters=256
 
 [shortcut]
-from = -4
+from = -3
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -84,11 +99,13 @@ filters=256
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
 [convolutional]
 batch_normalize=1
 filters=64
@@ -111,11 +128,13 @@ filters=256
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
 
 ##### Conv 3_x #####
 
@@ -141,10 +160,24 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
+
+
+[route]
+layers=-4
+
+[convolutional]
+batch_normalize=1
+size=1
+stride=2
+pad=1
+activation=linear
+filters=512
 
 [shortcut]
-from = -4
+from = -3
+activation=leaky
+
 
 [convolutional]
 batch_normalize=1
@@ -168,11 +201,14 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=128
@@ -195,11 +231,14 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=128
@@ -222,11 +261,14 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=128
@@ -249,11 +291,14 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=128
@@ -276,11 +321,14 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=128
@@ -303,11 +351,14 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=128
@@ -330,11 +381,14 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 
 ##### Conv 4_x #####
 
@@ -360,10 +414,24 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
+
+
+[route]
+layers=-4
+
+[convolutional]
+batch_normalize=1
+size=1
+stride=2
+pad=1
+activation=linear
+filters=1024
 
 [shortcut]
-from = -4
+from = -3
+activation=leaky
+
 
 [convolutional]
 batch_normalize=1
@@ -387,11 +455,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -414,11 +485,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -441,11 +515,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -468,11 +545,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -495,11 +575,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -522,11 +605,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -549,11 +635,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -576,11 +665,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -603,11 +695,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -630,11 +725,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -657,11 +755,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -684,11 +785,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -711,11 +815,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -738,11 +845,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -765,11 +875,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -792,11 +905,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -819,11 +935,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -846,11 +965,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -873,11 +995,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -900,11 +1025,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -927,11 +1055,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -954,11 +1085,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -981,11 +1115,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1008,11 +1145,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1035,11 +1175,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1062,11 +1205,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1089,11 +1235,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1116,11 +1265,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1143,11 +1295,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1170,11 +1325,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1197,11 +1355,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1224,11 +1385,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1251,11 +1415,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1278,11 +1445,14 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
+
 [convolutional]
 batch_normalize=1
 filters=256
@@ -1305,11 +1475,13 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
 
 ##### Conv 5_x #####
 
@@ -1335,10 +1507,24 @@ filters=2048
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
+
+
+[route]
+layers=-4
+
+[convolutional]
+batch_normalize=1
+size=1
+stride=2
+pad=1
+activation=linear
+filters=2048
 
 [shortcut]
-from = -4
+from = -3
+activation=leaky
+
 
 [convolutional]
 batch_normalize=1
@@ -1362,11 +1548,13 @@ filters=2048
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
 [convolutional]
 batch_normalize=1
 filters=512
@@ -1389,11 +1577,13 @@ filters=2048
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
 
+activation=leaky
+
 [avgpool]
 
 [connected]
diff --git a/cfg/msr_34.cfg b/cfg/msr_34.cfg
index e561d45c..5ae23cf5 100644
--- a/cfg/msr_34.cfg
+++ b/cfg/msr_34.cfg
@@ -12,11 +12,6 @@ policy=poly
 power=4
 max_batches=500000
 
-#policy=sigmoid
-#gamma=.00008
-#step=100000
-#max_batches=200000
-
 [crop]
 crop_height=224
 crop_width=224
diff --git a/cfg/msr_50.cfg b/cfg/msr_50.cfg
index 31685f21..2edd21c1 100644
--- a/cfg/msr_50.cfg
+++ b/cfg/msr_50.cfg
@@ -1,13 +1,18 @@
 [net]
 batch=128
-subdivisions=4
+subdivisions=8
 height=256
 width=256
 channels=3
 momentum=0.9
-decay=0.0005
+decay=0.0001
+
+learning_rate=0.05
+policy=poly
+power=4
+max_batches=500000
+
 
-learning_rate=0.01
 
 [crop]
 crop_height=224
@@ -57,10 +62,22 @@ filters=256
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
+
+[route]
+layers=-4
+
+[convolutional]
+batch_normalize=1
+size=1
+stride=1
+pad=1
+activation=linear
+filters=256
 
 [shortcut]
-from = -4
+from = -3
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -84,10 +101,11 @@ filters=256
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -111,10 +129,11 @@ filters=256
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 
 ##### Conv 3_x #####
@@ -141,10 +160,22 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
+
+[route]
+layers=-4
+
+[convolutional]
+batch_normalize=1
+size=1
+stride=2
+pad=1
+activation=linear
+filters=512
 
 [shortcut]
-from = -4
+from = -3
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -168,10 +199,11 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -195,10 +227,11 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -222,10 +255,11 @@ filters=512
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 
 ##### Conv 4_x #####
@@ -252,10 +286,23 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
+
+[route]
+layers=-4
+
+[convolutional]
+batch_normalize=1
+size=1
+stride=2
+pad=1
+activation=linear
+filters=1024
 
 [shortcut]
-from = -4
+from = -3
+activation=leaky
+
 
 [convolutional]
 batch_normalize=1
@@ -279,10 +326,11 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -306,10 +354,11 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -333,10 +382,11 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -360,10 +410,11 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -387,10 +438,11 @@ filters=1024
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 
 ##### Conv 5_x #####
@@ -417,10 +469,24 @@ filters=2048
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
+
+
+[route]
+layers=-4
+
+[convolutional]
+batch_normalize=1
+size=1
+stride=2
+pad=1
+activation=linear
+filters=2048
 
 [shortcut]
-from = -4
+from = -3
+activation=leaky
+
 
 [convolutional]
 batch_normalize=1
@@ -444,10 +510,11 @@ filters=2048
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 [convolutional]
 batch_normalize=1
@@ -471,10 +538,11 @@ filters=2048
 size=1
 stride=1
 pad=1
-activation=leaky
+activation=linear
 
 [shortcut]
 from = -4
+activation=leaky
 
 [avgpool]
 
diff --git a/data/dog.jpg b/data/dog.jpg
index deadd795..665a81c5 100644
Binary files a/data/dog.jpg and b/data/dog.jpg differ
diff --git a/src/activation_layer.c b/src/activation_layer.c
new file mode 100644
index 00000000..49e638d4
--- /dev/null
+++ b/src/activation_layer.c
@@ -0,0 +1,58 @@
+#include "activation_layer.h"
+#include "utils.h"
+#include "cuda.h"
+#include "blas.h"
+#include "gemm.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+/* Create a layer that applies `activation` to its input; outputs == inputs per batch item. */
+layer make_activation_layer(int batch, int inputs, ACTIVATION activation)
+{
+    layer l = {0};
+    l.type = ACTIVE;
+
+    l.inputs = inputs;
+    l.outputs = inputs;
+    l.batch=batch;
+    /* These buffers hold floats, so elements are sized by float, not float*. */
+    l.output = calloc(batch*inputs, sizeof(float));
+    l.delta = calloc(batch*inputs, sizeof(float));
+
+#ifdef GPU
+    l.output_gpu = cuda_make_array(l.output, inputs*batch);
+    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
+#endif
+    l.activation = activation;
+    fprintf(stderr, "Activation Layer: %d inputs\n", inputs);
+    return l;
+}
+/* CPU forward pass: copy_cpu moves l.outputs*l.batch values from state.input to l.output, then activate_array runs over l.output. */
+void forward_activation_layer(layer l, network_state state)
+{
+    copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
+    activate_array(l.output, l.outputs*l.batch, l.activation);
+}
+/* CPU backward pass: gradient_array gets l.output and l.delta (presumably scaling delta by the activation gradient), then l.delta goes to state.delta. */
+void backward_activation_layer(layer l, network_state state)
+{
+    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
+    copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1); /* NOTE(review): looks like an overwrite of state.delta, not an accumulation - confirm */
+}
+
+#ifdef GPU
+/* GPU forward pass: same two steps as the CPU version, using copy_ongpu / activate_array_ongpu on l.output_gpu. */
+void forward_activation_layer_gpu(layer l, network_state state)
+{
+    copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
+    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
+}
+/* GPU backward pass: gradient_array_ongpu gets l.output_gpu and l.delta_gpu, then l.delta_gpu is passed to state.delta via copy_ongpu. */
+void backward_activation_layer_gpu(layer l, network_state state)
+{
+    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
+}
+#endif
diff --git a/src/activation_layer.h b/src/activation_layer.h
new file mode 100644
index 00000000..a09756aa
--- /dev/null
+++ b/src/activation_layer.h
@@ -0,0 +1,19 @@
+#ifndef ACTIVATION_LAYER_H
+#define ACTIVATION_LAYER_H
+
+#include "activations.h"
+#include "layer.h"
+#include "network.h"
+
+layer make_activation_layer(int batch, int inputs, ACTIVATION activation);
+
+void forward_activation_layer(layer l, network_state state);
+void backward_activation_layer(layer l, network_state state);
+
+#ifdef GPU
+void forward_activation_layer_gpu(layer l, network_state state);
+void backward_activation_layer_gpu(layer l, network_state state);
+#endif
+
+#endif
+
diff --git a/src/avgpool_layer.c b/src/avgpool_layer.c
index 8eccde62..0feae710 100644
--- a/src/avgpool_layer.c
+++ b/src/avgpool_layer.c
@@ -28,8 +28,9 @@ avgpool_layer make_avgpool_layer(int batch, int w, int h, int c)
 
 void resize_avgpool_layer(avgpool_layer *l, int w, int h)
 {
-    l->h = h;
     l->w = w;
+    l->h = h;
+    l->inputs = h*w*l->c;
 }
 
 void forward_avgpool_layer(const avgpool_layer l, network_state state)
diff --git a/src/blas.c b/src/blas.c
index 556603cf..8769df35 100644
--- a/src/blas.c
+++ b/src/blas.c
@@ -1,15 +1,26 @@
 #include "blas.h"
 #include "math.h"
+#include <assert.h>
 
-void shortcut_cpu(float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2)
+void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
 {
+    int stride = w1/w2;
+    int sample = w2/w1;
+    assert(stride == h1/h2);
+    assert(sample == h2/h1);
+    if(stride < 1) stride = 1;
+    if(sample < 1) sample = 1;
+    int minw = (w1 < w2) ? w1 : w2;
+    int minh = (h1 < h2) ? h1 : h2;
+    int minc = (c1 < c2) ? c1 : c2;
+
     int i,j,k,b;
     for(b = 0; b < batch; ++b){
-        for(k = 0; k < c && k < c2; ++k){
-            for(j = 0; j < h/sample; ++j){
-                for(i = 0; i < w/sample; ++i){
-                    int out_index = i*sample + w*(j*sample + h*(k + c*b));
-                    int add_index = b*w*stride/sample*h*stride/sample*c2 + i*stride + w*stride/sample*(j*stride + h*stride/sample*k);
+        for(k = 0; k < minc; ++k){
+            for(j = 0; j < minh; ++j){
+                for(i = 0; i < minw; ++i){
+                    int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
+                    int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
                     out[out_index] += add[add_index];
                 }
             }
diff --git a/src/blas.h b/src/blas.h
index 208fdaa5..aecdc593 100644
--- a/src/blas.h
+++ b/src/blas.h
@@ -16,7 +16,7 @@ void scal_cpu(int N, float ALPHA, float *X, int INCX);
 void fill_cpu(int N, float ALPHA, float * X, int INCX);
 float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
 void test_gpu_blas();
-void shortcut_cpu(float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2);
+void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
 
 void mean_cpu(float *x, int batch, int filters, int spatial, float *mean);
 void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
@@ -45,6 +45,6 @@ void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *varianc
 
 void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance);
 void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean);
-void shortcut_gpu(float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2);
+void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out);
 #endif
 #endif
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index 8f05eb94..49406db2 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -1,6 +1,7 @@
 #include "cuda_runtime.h"
 #include "curand.h"
 #include "cublas_v2.h"
+#include <assert.h>
 
 extern "C" {
 #include "blas.h"
@@ -374,26 +375,37 @@ extern "C" void fill_ongpu(int N, float ALPHA, float * X, int INCX)
     check_error(cudaPeekAtLastError());
 }
 
-__global__ void shortcut_kernel(int size, float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2, int min_c)
+__global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
 {
     int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (id >= size) return;
-    int i = id % (w/sample);
-    id /= (w/sample);
-    int j = id % (h/sample);
-    id /= (h/sample);
-    int k = id % min_c;
-    id /= min_c;
-    int b = id;
-    int out_index = i*sample + w*(j*sample + h*(k + c*b));
-    int add_index = b*w*stride/sample*h*stride/sample*c2 + i*stride + w*stride/sample*(j*stride + h*stride/sample*k);
+    int i = id % minw;
+    id /= minw;
+    int j = id % minh;
+    id /= minh;
+    int k = id % minc;
+    id /= minc;
+    int b = id % batch;
+
+    int out_index = i*sample + w2*(j*sample + h2*(k + c2*b));
+    int add_index = i*stride + w1*(j*stride + h1*(k + c1*b));
     out[out_index] += add[add_index];
 }
 
-extern "C" void shortcut_gpu(float *out, int w, int h, int c, int batch, int sample, float *add, int stride, int c2)
+extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
 {
-    int min_c = (c < c2) ? c : c2;
-    int size = batch * w/sample * h/sample * min_c;
-    shortcut_kernel<<<cuda_gridsize(size), BLOCK>>>(size, out, w, h, c, batch, sample, add, stride, c2, min_c);
+    int minw = (w1 < w2) ? w1 : w2;
+    int minh = (h1 < h2) ? h1 : h2;
+    int minc = (c1 < c2) ? c1 : c2;
+
+    int stride = w1/w2;
+    int sample = w2/w1;
+    assert(stride == h1/h2);
+    assert(sample == h2/h1);
+    if(stride < 1) stride = 1;
+    if(sample < 1) sample = 1;
+
+    int size = batch * minw * minh * minc;
+    shortcut_kernel<<<cuda_gridsize(size), BLOCK>>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, out);
     check_error(cudaPeekAtLastError());
 }
diff --git a/src/classifier.c b/src/classifier.c
index ddd88b19..9924c371 100644
--- a/src/classifier.c
+++ b/src/classifier.c
@@ -2,6 +2,7 @@
 #include "utils.h"
 #include "parser.h"
 #include "option_list.h"
+#include "blas.h"
 
 #ifdef OPENCV
 #include "opencv2/highgui/highgui_c.h"
@@ -183,6 +184,145 @@ void validate_classifier(char *datacfg, char *filename, char *weightfile)
     }
 }
 
+void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
+{   /* Validation that sums 10 network predictions per image (5 crops, then 5 more after a flip) before ranking classes. */
+    int i, j;
+    network net = parse_network_cfg(filename);
+    set_batch_network(&net, 1);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *valid_list = option_find_str(options, "valid", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+    int topk = option_find_int(options, "top", 1);
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(valid_list);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+    /* Hit counters; they are divided by the number of images seen when printed. */
+    float avg_acc = 0;
+    float avg_topk = 0;
+    int *indexes = calloc(topk, sizeof(int)); /* NOTE(review): not freed anywhere in this function */
+
+    for(i = 0; i < m; ++i){
+        int class = -1; /* stays -1 when no label string occurs in the path */
+        char *path = paths[i];
+        for(j = 0; j < classes; ++j){
+            if(strstr(path, labels[j])){
+                class = j;
+                break;
+            }
+        }
+        image im = load_image_color(paths[i], 256, 256);
+        image images[10]; /* slots 0-4: crop_image at five offsets; slots 5-9: same offsets after flip_image(im) */
+        images[0] = crop_image(im, -16, -16, 256, 256);
+        images[1] = crop_image(im, 16, -16, 256, 256);
+        images[2] = crop_image(im, 0, 0, 256, 256);
+        images[3] = crop_image(im, -16, 16, 256, 256);
+        images[4] = crop_image(im, 16, 16, 256, 256);
+        flip_image(im);
+        images[5] = crop_image(im, -16, -16, 256, 256);
+        images[6] = crop_image(im, 16, -16, 256, 256);
+        images[7] = crop_image(im, 0, 0, 256, 256);
+        images[8] = crop_image(im, -16, 16, 256, 256);
+        images[9] = crop_image(im, 16, 16, 256, 256);
+        float *pred = calloc(classes, sizeof(float)); /* zeroed accumulator, one slot per class */
+        for(j = 0; j < 10; ++j){
+            float *p = network_predict(net, images[j].data);
+            axpy_cpu(classes, 1, p, 1, pred, 1); /* presumably pred += p (BLAS-style axpy with alpha 1) */
+            free_image(images[j]);
+        }
+        free_image(im);
+        top_k(pred, classes, topk, indexes);
+        free(pred);
+        if(indexes[0] == class) avg_acc += 1;
+        for(j = 0; j < topk; ++j){
+            if(indexes[j] == class) avg_topk += 1;
+        }
+        /* Running top-1 / top-k rates over the i+1 images processed so far. */
+        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
+    }
+}
+/* Validation that sums predictions over several image scales, each predicted before and after flip_image, then ranks classes. */
+void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
+{
+    int i, j;
+    network net = parse_network_cfg(filename);
+    set_batch_network(&net, 1);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    srand(time(0));
+
+    list *options = read_data_cfg(datacfg);
+
+    char *label_list = option_find_str(options, "labels", "data/labels.list");
+    char *valid_list = option_find_str(options, "valid", "data/train.list");
+    int classes = option_find_int(options, "classes", 2);
+    int topk = option_find_int(options, "top", 1);
+
+    char **labels = get_labels(label_list);
+    list *plist = get_paths(valid_list);
+    int scales[] = {224, 256, 384, 480, 640}; /* each value becomes the length of the shorter image side */
+    int nscales = sizeof(scales)/sizeof(scales[0]);
+
+    char **paths = (char **)list_to_array(plist);
+    int m = plist->size;
+    free_list(plist);
+    /* Hit counters; they are divided by the number of images seen when printed. */
+    float avg_acc = 0;
+    float avg_topk = 0;
+    int *indexes = calloc(topk, sizeof(int)); /* NOTE(review): not freed anywhere in this function */
+
+    for(i = 0; i < m; ++i){
+        int class = -1; /* stays -1 when no label string occurs in the path */
+        char *path = paths[i];
+        for(j = 0; j < classes; ++j){
+            if(strstr(path, labels[j])){
+                class = j;
+                break;
+            }
+        }
+        float *pred = calloc(classes, sizeof(float)); /* zeroed accumulator, one slot per class */
+        image im = load_image_color(paths[i], 0, 0); /* NOTE(review): 0,0 presumably keeps the file's own size - confirm */
+        for(j = 0; j < nscales; ++j){
+            int w, h;
+            if(im.w < im.h){
+                w = scales[j];
+                h = (im.h*w)/im.w; /* other side follows the aspect ratio, in integer math */
+            } else {
+                h = scales[j];
+                w = (im.w * h) / im.h;
+            }
+            resize_network(&net, w, h); /* the network is resized to this w x h before each pair of predictions */
+            image r = resize_image(im, w, h);
+            float *p = network_predict(net, r.data);
+            axpy_cpu(classes, 1, p, 1, pred, 1); /* presumably pred += p (BLAS-style axpy with alpha 1) */
+            flip_image(r);
+            p = network_predict(net, r.data);
+            axpy_cpu(classes, 1, p, 1, pred, 1);
+            free_image(r);
+        }
+        free_image(im);
+        top_k(pred, classes, topk, indexes);
+        free(pred);
+        if(indexes[0] == class) avg_acc += 1;
+        for(j = 0; j < topk; ++j){
+            if(indexes[j] == class) avg_topk += 1;
+        }
+        /* Running top-1 / top-k rates over the i+1 images processed so far. */
+        printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
+    }
+}
+
 void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename)
 {
     network net = parse_network_cfg(cfgfile);
@@ -296,7 +436,7 @@ void test_classifier(char *datacfg, char *cfgfile, char *weightfile, int target_
 
         free_matrix(pred);
 
-        fprintf(stderr, "%lf seconds, %d images\n", sec(clock()-time), val.X.rows);
+        fprintf(stderr, "%lf seconds, %d images, %d total\n", sec(clock()-time), val.X.rows, curr);
         free_data(val);
     }
 }
@@ -319,6 +459,8 @@ void run_classifier(int argc, char **argv)
     else if(0==strcmp(argv[2], "train")) train_classifier(data, cfg, weights);
     else if(0==strcmp(argv[2], "test")) test_classifier(data, cfg, weights, layer);
     else if(0==strcmp(argv[2], "valid")) validate_classifier(data, cfg, weights);
+    else if(0==strcmp(argv[2], "valid10")) validate_classifier_10(data, cfg, weights);
+    else if(0==strcmp(argv[2], "validmulti")) validate_classifier_multi(data, cfg, weights);
 }
 
 
diff --git a/src/connected_layer.c b/src/connected_layer.c
index 2d83dd91..c0a9d8b3 100644
--- a/src/connected_layer.c
+++ b/src/connected_layer.c
@@ -32,7 +32,7 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
     //float scale = 1./sqrt(inputs);
     float scale = sqrt(2./inputs);
     for(i = 0; i < outputs*inputs; ++i){
-        l.weights[i] = 2*scale*rand_uniform() - scale;
+        l.weights[i] = scale*rand_uniform(-1, 1);
     }
 
     for(i = 0; i < outputs; ++i){
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index e97b00d7..871a84e1 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -65,7 +65,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
 
     // float scale = 1./sqrt(size*size*c);
     float scale = sqrt(2./(size*size*c));
-    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = 2*scale*rand_uniform() - scale;
+    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1, 1);
     int out_h = convolutional_out_height(l);
     int out_w = convolutional_out_width(l);
     l.out_h = out_h;
diff --git a/src/crop_layer.c b/src/crop_layer.c
index 7b340841..66f11ebc 100644
--- a/src/crop_layer.c
+++ b/src/crop_layer.c
@@ -19,33 +19,51 @@ crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int
     l.h = h;
     l.w = w;
     l.c = c;
+    l.scale = (float)crop_height / h;
     l.flip = flip;
     l.angle = angle;
     l.saturation = saturation;
     l.exposure = exposure;
-    l.crop_width = crop_width;
-    l.crop_height = crop_height;
     l.out_w = crop_width;
     l.out_h = crop_height;
     l.out_c = c;
     l.inputs = l.w * l.h * l.c;
     l.outputs = l.out_w * l.out_h * l.out_c;
-    l.output = calloc(crop_width*crop_height * c*batch, sizeof(float));
+    l.output = calloc(l.outputs*batch, sizeof(float));
     #ifdef GPU
-    l.output_gpu = cuda_make_array(l.output, crop_width*crop_height*c*batch);
+    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
     l.rand_gpu   = cuda_make_array(0, l.batch*8);
     #endif
     return l;
 }
 
+void resize_crop_layer(layer *l, int w, int h)
+{
+    /* Output extent is the new input extent times l->scale, converted to the integer out_w/out_h fields. */
+    l->out_w = l->scale*w;
+    l->out_h = l->scale*h;
+    l->w = w;
+    l->h = h;
+    l->inputs = w * h * l->c;
+    l->outputs = l->out_h * l->out_w * l->out_c;
+    /* The host buffer is resized with realloc; the GPU buffer is freed and recreated from it. */
+    int count = l->batch*l->outputs;
+    l->output = realloc(l->output, count*sizeof(float));
+    #ifdef GPU
+    cuda_free(l->output_gpu);
+    l->output_gpu = cuda_make_array(l->output, count);
+    #endif
+}
+
+
 void forward_crop_layer(const crop_layer l, network_state state)
 {
     int i,j,c,b,row,col;
     int index;
     int count = 0;
     int flip = (l.flip && rand()%2);
-    int dh = rand()%(l.h - l.crop_height + 1);
-    int dw = rand()%(l.w - l.crop_width + 1);
+    int dh = rand()%(l.h - l.out_h + 1);
+    int dw = rand()%(l.w - l.out_w + 1);
     float scale = 2;
     float trans = -1;
     if(l.noadjust){
@@ -54,13 +72,13 @@ void forward_crop_layer(const crop_layer l, network_state state)
     }
     if(!state.train){
         flip = 0;
-        dh = (l.h - l.crop_height)/2;
-        dw = (l.w - l.crop_width)/2;
+        dh = (l.h - l.out_h)/2;
+        dw = (l.w - l.out_w)/2;
     }
     for(b = 0; b < l.batch; ++b){
         for(c = 0; c < l.c; ++c){
-            for(i = 0; i < l.crop_height; ++i){
-                for(j = 0; j < l.crop_width; ++j){
+            for(i = 0; i < l.out_h; ++i){
+                for(j = 0; j < l.out_w; ++j){
                     if(flip){
                         col = l.w - dw - j - 1;    
                     }else{
diff --git a/src/crop_layer.h b/src/crop_layer.h
index b4093510..12112f02 100644
--- a/src/crop_layer.h
+++ b/src/crop_layer.h
@@ -11,6 +11,7 @@ typedef layer crop_layer;
 image get_crop_image(crop_layer l);
 crop_layer make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip, float angle, float saturation, float exposure);
 void forward_crop_layer(const crop_layer l, network_state state);
+void resize_crop_layer(layer *l, int w, int h);
 
 #ifdef GPU
 void forward_crop_layer_gpu(crop_layer l, network_state state);
diff --git a/src/crop_layer_kernels.cu b/src/crop_layer_kernels.cu
index 8891030a..da635a6f 100644
--- a/src/crop_layer_kernels.cu
+++ b/src/crop_layer_kernels.cu
@@ -198,9 +198,9 @@ extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state)
     levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, layer.batch, layer.w, layer.h, state.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
     check_error(cudaPeekAtLastError());
 
-    size = layer.batch*layer.c*layer.crop_width*layer.crop_height;
+    size = layer.batch*layer.c*layer.out_w*layer.out_h;
 
-    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.crop_height, layer.crop_width, state.train, layer.flip, radians, layer.output_gpu);
+    forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.out_h, layer.out_w, state.train, layer.flip, radians, layer.output_gpu);
     check_error(cudaPeekAtLastError());
 
 /*
diff --git a/src/data.c b/src/data.c
index 8d762765..88c89917 100644
--- a/src/data.c
+++ b/src/data.c
@@ -427,10 +427,10 @@ data load_data_region(int n, char **paths, int m, int w, int h, int size, int cl
         int dw = (ow*jitter);
         int dh = (oh*jitter);
 
-        int pleft  = (rand_uniform() * 2*dw - dw);
-        int pright = (rand_uniform() * 2*dw - dw);
-        int ptop   = (rand_uniform() * 2*dh - dh);
-        int pbot   = (rand_uniform() * 2*dh - dh);
+        int pleft  = rand_uniform(-dw, dw);
+        int pright = rand_uniform(-dw, dw);
+        int ptop   = rand_uniform(-dh, dh);
+        int pbot   = rand_uniform(-dh, dh);
 
         int swidth =  ow - pleft - pright;
         int sheight = oh - ptop - pbot;
@@ -543,10 +543,10 @@ data load_data_swag(char **paths, int n, int classes, float jitter)
     int dw = w*jitter;
     int dh = h*jitter;
 
-    int pleft  = (rand_uniform() * 2*dw - dw);
-    int pright = (rand_uniform() * 2*dw - dw);
-    int ptop   = (rand_uniform() * 2*dh - dh);
-    int pbot   = (rand_uniform() * 2*dh - dh);
+    int pleft  = rand_uniform(-dw, dw);
+    int pright = rand_uniform(-dw, dw);
+    int ptop   = rand_uniform(-dh, dh);
+    int pbot   = rand_uniform(-dh, dh);
 
     int swidth =  w - pleft - pright;
     int sheight = h - ptop - pbot;
@@ -594,10 +594,10 @@ data load_data_detection(int n, char **paths, int m, int classes, int w, int h,
         int dw = ow/10;
         int dh = oh/10;
 
-        int pleft  = (rand_uniform() * 2*dw - dw);
-        int pright = (rand_uniform() * 2*dw - dw);
-        int ptop   = (rand_uniform() * 2*dh - dh);
-        int pbot   = (rand_uniform() * 2*dh - dh);
+        int pleft  = rand_uniform(-dw, dw);
+        int pright = rand_uniform(-dw, dw);
+        int ptop   = rand_uniform(-dh, dh);
+        int pbot   = rand_uniform(-dh, dh);
 
         int swidth =  ow - pleft - pright;
         int sheight = oh - ptop - pbot;
diff --git a/src/dropout_layer.c b/src/dropout_layer.c
index 97dd47f5..bb410dcf 100644
--- a/src/dropout_layer.c
+++ b/src/dropout_layer.c
@@ -37,7 +37,7 @@ void forward_dropout_layer(dropout_layer l, network_state state)
     int i;
     if (!state.train) return;
     for(i = 0; i < l.batch * l.inputs; ++i){
-        float r = rand_uniform();
+        float r = rand_uniform(0, 1);
         l.rand[i] = r;
         if(r < l.probability) state.input[i] = 0;
         else state.input[i] *= l.scale;
diff --git a/src/image.c b/src/image.c
index 51871b55..d7d57d56 100644
--- a/src/image.c
+++ b/src/image.c
@@ -390,6 +390,17 @@ image make_image(int w, int h, int c)
     return out;
 }
 
+image make_random_image(int w, int h, int c)
+{   /* Builds a w*h*c image whose every sample is rand_normal()*.25 + .5. */
+    int k, total = h*w*c;
+    image noise = make_empty_image(w,h,c);
+    noise.data = calloc(total, sizeof(float));
+    for(k = 0; k < total; ++k){
+        noise.data[k] = (rand_normal() * .25) + .5;
+    }
+    return noise;
+}
+
 image float_to_image(int w, int h, int c, float *data)
 {
     image out = make_empty_image(w,h,c);
@@ -692,6 +703,8 @@ image resize_image(image im, int w, int h)
     return resized;
 }
 
+#include "cuda.h"
+
 void test_resize(char *filename)
 {
     image im = load_image(filename, 0,0, 3);
@@ -709,14 +722,27 @@ void test_resize(char *filename)
     image exp5 = copy_image(im);
     exposure_image(exp5, .5);
 
-    image r = resize_image(im, im.w/2, im.h/2);
+    #ifdef GPU
+    image r = resize_image(im, im.w, im.h);
+    image black = make_image(im.w*2 + 3, im.h*2 + 3, 9);
+    image black2 = make_image(im.w, im.h, 3);
 
-    image black = make_image(im.w, im.h, im.c);
-    shortcut_cpu(black.data, im.w, im.h, im.c, 1, 2, r.data, 1, r.c);
+    float *r_gpu = cuda_make_array(r.data, r.w*r.h*r.c);
+    float *black_gpu = cuda_make_array(black.data, black.w*black.h*black.c);
+    float *black2_gpu = cuda_make_array(black2.data, black2.w*black2.h*black2.c);
+    shortcut_gpu(3, r.w, r.h, 1, r_gpu, black.w, black.h, 3, black_gpu);
+    //flip_image(r);
+    //shortcut_gpu(3, r.w, r.h, 1, r.data, black.w, black.h, 3, black.data);
+
+    shortcut_gpu(3, black.w, black.h, 3, black_gpu, black2.w, black2.h, 1, black2_gpu);
+    cuda_pull_array(black_gpu, black.data, black.w*black.h*black.c);
+    cuda_pull_array(black2_gpu, black2.data, black2.w*black2.h*black2.c);
+    show_image_layers(black, "Black");
+    show_image(black2, "Recreate");
+    #endif
 
     show_image(im, "Original");
     show_image(gray, "Gray");
-    show_image(black, "Black");
     show_image(sat2, "Saturation-2");
     show_image(sat5, "Saturation-.5");
     show_image(exp2, "Exposure-2");
diff --git a/src/image.h b/src/image.h
index c3e1a78b..4846bc19 100644
--- a/src/image.h
+++ b/src/image.h
@@ -58,6 +58,7 @@ void save_image_jpg(image p, char *name);
 void print_image(image m);
 
 image make_image(int w, int h, int c);
+image make_random_image(int w, int h, int c);
 image make_empty_image(int w, int h, int c);
 image float_to_image(int w, int h, int c, float *data);
 image copy_image(image p);
diff --git a/src/imagenet.c b/src/imagenet.c
index dece9528..4c4d2bd6 100644
--- a/src/imagenet.c
+++ b/src/imagenet.c
@@ -21,7 +21,7 @@ void train_imagenet(char *cfgfile, char *weightfile)
     printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
     int imgs = 1024;
     char **labels = get_labels("data/inet.labels.list");
-    list *plist = get_paths("/data/imagenet/cls.train.list");
+    list *plist = get_paths("data/inet.train.list");
     char **paths = (char **)list_to_array(plist);
     printf("%d\n", plist->size);
     int N = plist->size;
@@ -62,6 +62,11 @@ void train_imagenet(char *cfgfile, char *weightfile)
             sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
             save_weights(net, buff);
         }
+        if(*net.seen%1000 == 0){
+            char buff[256];
+            sprintf(buff, "%s/%s.backup",backup_directory,base);
+            save_weights(net, buff);
+        }
     }
     char buff[256];
     sprintf(buff, "%s/%s.weights", backup_directory, base);
diff --git a/src/layer.h b/src/layer.h
index 1b120096..d8af6e40 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -20,7 +20,8 @@ typedef enum {
     NORMALIZATION,
     AVGPOOL,
     LOCAL,
-    SHORTCUT
+    SHORTCUT,
+    ACTIVE
 } LAYER_TYPE;
 
 typedef enum{
@@ -46,8 +47,6 @@ struct layer{
     int side;
     int stride;
     int pad;
-    int crop_width;
-    int crop_height;
     int sqrt;
     int flip;
     int index;
diff --git a/src/local_layer.c b/src/local_layer.c
index c0f52cbc..9a5750fc 100644
--- a/src/local_layer.c
+++ b/src/local_layer.c
@@ -55,7 +55,7 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in
 
     // float scale = 1./sqrt(size*size*c);
     float scale = sqrt(2./(size*size*c));
-    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = 2*scale*rand_uniform() - scale;
+    for(i = 0; i < c*n*size*size; ++i) l.filters[i] = scale*rand_uniform(-1,1);
 
     l.col_image = calloc(out_h*out_w*size*size*c, sizeof(float));
     l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c
index 20176277..7af49546 100644
--- a/src/maxpool_layer.c
+++ b/src/maxpool_layer.c
@@ -51,6 +51,7 @@ void resize_maxpool_layer(maxpool_layer *l, int w, int h)
     int stride = l->stride;
     l->h = h;
     l->w = w;
+    l->inputs = h*w*l->c;
 
     l->out_w = (w-1)/stride + 1;
     l->out_h = (h-1)/stride + 1;
diff --git a/src/network.c b/src/network.c
index 8dee8cce..79579f10 100644
--- a/src/network.c
+++ b/src/network.c
@@ -10,6 +10,7 @@
 #include "connected_layer.h"
 #include "local_layer.h"
 #include "convolutional_layer.h"
+#include "activation_layer.h"
 #include "deconvolutional_layer.h"
 #include "detection_layer.h"
 #include "normalization_layer.h"
@@ -73,6 +74,8 @@ char *get_layer_string(LAYER_TYPE a)
     switch(a){
         case CONVOLUTIONAL:
             return "convolutional";
+        case ACTIVE:
+            return "activation";
         case LOCAL:
             return "local";
         case DECONVOLUTIONAL:
@@ -131,6 +134,8 @@ void forward_network(network net, network_state state)
             forward_convolutional_layer(l, state);
         } else if(l.type == DECONVOLUTIONAL){
             forward_deconvolutional_layer(l, state);
+        } else if(l.type == ACTIVE){
+            forward_activation_layer(l, state);
         } else if(l.type == LOCAL){
             forward_local_layer(l, state);
         } else if(l.type == NORMALIZATION){
@@ -231,6 +236,8 @@ void backward_network(network net, network_state state)
             backward_convolutional_layer(l, state);
         } else if(l.type == DECONVOLUTIONAL){
             backward_deconvolutional_layer(l, state);
+        } else if(l.type == ACTIVE){
+            backward_activation_layer(l, state);
         } else if(l.type == NORMALIZATION){
             backward_normalization_layer(l, state);
         } else if(l.type == MAXPOOL){
@@ -360,11 +367,12 @@ int resize_network(network *net, int w, int h)
         layer l = net->layers[i];
         if(l.type == CONVOLUTIONAL){
             resize_convolutional_layer(&l, w, h);
+        }else if(l.type == CROP){
+            resize_crop_layer(&l, w, h);
         }else if(l.type == MAXPOOL){
             resize_maxpool_layer(&l, w, h);
         }else if(l.type == AVGPOOL){
             resize_avgpool_layer(&l, w, h);
-            break;
         }else if(l.type == NORMALIZATION){
             resize_normalization_layer(&l, w, h);
         }else if(l.type == COST){
@@ -376,6 +384,7 @@ int resize_network(network *net, int w, int h)
         net->layers[i] = l;
         w = l.out_w;
         h = l.out_h;
+        if(l.type == AVGPOOL) break;
     }
     //fprintf(stderr, " Done!\n");
     return 0;
diff --git a/src/network_kernels.cu b/src/network_kernels.cu
index 0b506477..a83293da 100644
--- a/src/network_kernels.cu
+++ b/src/network_kernels.cu
@@ -18,6 +18,7 @@ extern "C" {
 #include "connected_layer.h"
 #include "detection_layer.h"
 #include "convolutional_layer.h"
+#include "activation_layer.h"
 #include "deconvolutional_layer.h"
 #include "maxpool_layer.h"
 #include "avgpool_layer.h"
@@ -48,6 +49,8 @@ void forward_network_gpu(network net, network_state state)
             forward_convolutional_layer_gpu(l, state);
         } else if(l.type == DECONVOLUTIONAL){
             forward_deconvolutional_layer_gpu(l, state);
+        } else if(l.type == ACTIVE){
+            forward_activation_layer_gpu(l, state);
         } else if(l.type == LOCAL){
             forward_local_layer_gpu(l, state);
         } else if(l.type == DETECTION){
@@ -97,6 +100,8 @@ void backward_network_gpu(network net, network_state state)
             backward_convolutional_layer_gpu(l, state);
         } else if(l.type == DECONVOLUTIONAL){
             backward_deconvolutional_layer_gpu(l, state);
+        } else if(l.type == ACTIVE){
+            backward_activation_layer_gpu(l, state);
         } else if(l.type == LOCAL){
             backward_local_layer_gpu(l, state);
         } else if(l.type == MAXPOOL){
diff --git a/src/nightmare.c b/src/nightmare.c
index 1a78dd5a..ccc75255 100644
--- a/src/nightmare.c
+++ b/src/nightmare.c
@@ -108,6 +108,69 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa
 
 }
 
+void smooth(image recon, image update, float lambda, int num)
+{   /* For each pixel, adds lambda*(neighbor - pixel) to update for every in-bounds neighbor within num pixels. */
+    int x, y, ch, nx, ny;
+    for(ch = 0; ch < recon.c; ++ch){
+        for(y = 0; y < recon.h; ++y){
+            for(x = 0; x < recon.w; ++x){
+                int center = x + recon.w*(y + recon.h*ch);
+                // Start each window axis at 0 when it would otherwise begin off-image.
+                int y_lo = (y - num < 0) ? 0 : y - num;
+                int x_lo = (x - num < 0) ? 0 : x - num;
+                for(ny = y_lo; ny <= y + num && ny < recon.h; ++ny){
+                    for(nx = x_lo; nx <= x + num && nx < recon.w; ++nx){
+                        int neighbor = nx + recon.w*(ny + recon.h*ch);
+                        update.data[center] += lambda * (recon.data[neighbor] - recon.data[center]);
+                    }
+                }
+            }
+        }
+    }
+}
+
+void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size)
+{   /* One update step on recon: runs net forward/backward with `features` as state.truth, then applies update to recon. */
+    scale_image(recon, 2);
+    translate_image(recon, -1);
+    /* delta is the buffer handed to the network as state.delta (directly on CPU, via a device copy on GPU). */
+    image delta = make_image(recon.w, recon.h, recon.c);
+
+    network_state state = {0};
+#ifdef GPU
+    state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
+    state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
+    state.truth = cuda_make_array(features, get_network_output_size(net));
+
+    forward_network_gpu(net, state);
+    backward_network_gpu(net, state);
+    /* Copy the device-side delta into delta.data before the temporary device arrays are released. */
+    cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);
+
+    cuda_free(state.input);
+    cuda_free(state.delta);
+    cuda_free(state.truth);
+#else
+    state.input = recon.data;
+    state.delta = delta.data;
+    state.truth = features;
+
+    forward_network(net, state);
+    backward_network(net, state);
+#endif
+    /* Fold delta into update, then let smooth() add its neighbor-difference term scaled by lambda. */
+    axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1);
+    smooth(recon, update, lambda, smooth_size);
+    /* NOTE(review): presumably BLAS-style axpy/scal, i.e. recon += rate*update and update *= momentum -- confirm in blas.c. */
+    axpy_cpu(recon.w*recon.h*recon.c, rate, update.data, 1, recon.data, 1);
+    scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);
+    /* Mirrors the scale_image(2)/translate_image(-1) calls at the top, in reverse order. */
+    translate_image(recon, 1);
+    scale_image(recon, .5);
+    constrain_image(recon);
+    free_image(delta);
+}
+
 
 void run_nightmare(int argc, char **argv)
 {
@@ -131,7 +194,11 @@ void run_nightmare(int argc, char **argv)
     float rate = find_float_arg(argc, argv, "-rate", .04);
     float thresh = find_float_arg(argc, argv, "-thresh", 1.);
     float rotate = find_float_arg(argc, argv, "-rotate", 0);
+    float momentum = find_float_arg(argc, argv, "-momentum", .9);
+    float lambda = find_float_arg(argc, argv, "-lambda", .01);
     char *prefix = find_char_arg(argc, argv, "-prefix", 0);
+    int reconstruct = find_arg(argc, argv, "-reconstruct");
+    int smooth_size = find_int_arg(argc, argv, "-smooth", 1);
 
     network net = parse_network_cfg(cfg);
     load_weights(&net, weights);
@@ -151,17 +218,38 @@ void run_nightmare(int argc, char **argv)
         im = resized;
     }
 
+    float *features;
+    image update;
+    if (reconstruct){
+        resize_network(&net, im.w, im.h);
+        int size = get_network_output_size(net);
+        features = calloc(size, sizeof(float));
+        float *out = network_predict(net, im.data);
+        copy_cpu(size, out, 1, features, 1);
+        free_image(im);
+        im = make_random_image(im.w, im.h, im.c);
+        update = make_image(im.w, im.h, im.c);
+    }
+
     int e;
     int n;
     for(e = 0; e < rounds; ++e){
-            fprintf(stderr, "Iteration: ");
-            fflush(stderr);
+        fprintf(stderr, "Iteration: ");
+        fflush(stderr);
         for(n = 0; n < iters; ++n){  
             fprintf(stderr, "%d, ", n);
             fflush(stderr);
-            int layer = max_layer + rand()%range - range/2;
-            int octave = rand()%octaves;
-            optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm);
+            if(reconstruct){
+                reconstruct_picture(net, features, im, update, rate, momentum, lambda, smooth_size);
+                show_image(im, "reconstruction");
+                #ifdef OPENCV
+                cvWaitKey(10);
+                #endif
+            }else{
+                int layer = max_layer + rand()%range - range/2;
+                int octave = rand()%octaves;
+                optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm);
+            }
         }
         fprintf(stderr, "done\n");
         if(0){
diff --git a/src/parser.c b/src/parser.c
index 8efafad8..218fd27a 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -7,6 +7,7 @@
 #include "crop_layer.h"
 #include "cost_layer.h"
 #include "convolutional_layer.h"
+#include "activation_layer.h"
 #include "normalization_layer.h"
 #include "deconvolutional_layer.h"
 #include "connected_layer.h"
@@ -29,6 +30,7 @@ typedef struct{
 
 int is_network(section *s);
 int is_convolutional(section *s);
+int is_activation(section *s);
 int is_local(section *s);
 int is_deconvolutional(section *s);
 int is_connected(section *s);
@@ -301,10 +303,31 @@ layer parse_shortcut(list *options, size_params params, network net)
     layer from = net.layers[index];
 
     layer s = make_shortcut_layer(batch, index, params.w, params.h, params.c, from.out_w, from.out_h, from.out_c);
+
+    char *activation_s = option_find_str(options, "activation", "linear");
+    ACTIVATION activation = get_activation(activation_s);
+    s.activation = activation;
     return s;
 }
 
 
+layer parse_activation(list *options, size_params params)
+{
+    /* The "activation" option names the function to apply; it defaults to "linear". */
+    ACTIVATION kind = get_activation(option_find_str(options, "activation", "linear"));
+    layer act = make_activation_layer(params.batch, params.inputs, kind);
+
+    /* Input and output dimensions are both copied from params. */
+    act.h = params.h;
+    act.w = params.w;
+    act.c = params.c;
+    act.out_h = params.h;
+    act.out_w = params.w;
+    act.out_c = params.c;
+
+    return act;
+}
+
 route_layer parse_route(list *options, size_params params, network net)
 {
     char *l = option_find(options, "layers");   
@@ -447,6 +470,8 @@ network parse_network_cfg(char *filename)
             l = parse_convolutional(options, params);
         }else if(is_local(s)){
             l = parse_local(options, params);
+        }else if(is_activation(s)){
+            l = parse_activation(options, params);
         }else if(is_deconvolutional(s)){
             l = parse_deconvolutional(options, params);
         }else if(is_connected(s)){
@@ -530,6 +555,10 @@ int is_convolutional(section *s)
     return (strcmp(s->type, "[conv]")==0
             || strcmp(s->type, "[convolutional]")==0);
 }
+int is_activation(section *s)
+{
+    return !strcmp(s->type, "[activation]"); /* 1 only when the section type is exactly "[activation]" */
+}
 int is_network(section *s)
 {
     return (strcmp(s->type, "[net]")==0
diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c
index ff1d50f0..bf455162 100644
--- a/src/shortcut_layer.c
+++ b/src/shortcut_layer.c
@@ -10,22 +10,15 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int
     layer l = {0};
     l.type = SHORTCUT;
     l.batch = batch;
-    l.w = w;
-    l.h = h;
-    l.c = c;
+    l.w = w2;
+    l.h = h2;
+    l.c = c2;
     l.out_w = w;
     l.out_h = h;
     l.out_c = c;
     l.outputs = w*h*c;
-    l.inputs = w*h*c;
-    int stride = w2 / w;
+    l.inputs = l.outputs;
 
-    assert(stride * w == w2);
-    assert(stride * h == h2);
-    assert(c >= c2);
-
-    l.stride = stride;
-    l.n = c2;
     l.index = index;
 
     l.delta =  calloc(l.outputs*batch, sizeof(float));
@@ -40,25 +33,29 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int
 void forward_shortcut_layer(const layer l, network_state state)
 {
     copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
-    shortcut_cpu(l.output, l.w, l.h, l.c, l.batch, 1, state.net.layers[l.index].output, l.stride, l.n);
+    shortcut_cpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output);
+    activate_array(l.output, l.outputs*l.batch, l.activation);
 }
 
 void backward_shortcut_layer(const layer l, network_state state)
 {
-    copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
-    shortcut_cpu(state.net.layers[l.index].delta, l.w*l.stride, l.h*l.stride, l.n, l.batch, l.stride, l.delta, 1, l.c);
+    gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
+    axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, state.delta, 1);
+    shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, state.net.layers[l.index].delta);
 }
 
 #ifdef GPU
 void forward_shortcut_layer_gpu(const layer l, network_state state)
 {
     copy_ongpu(l.outputs*l.batch, state.input, 1, l.output_gpu, 1);
-    shortcut_gpu(l.output_gpu, l.w, l.h, l.c, l.batch, 1, state.net.layers[l.index].output_gpu, l.stride, l.n);
+    shortcut_gpu(l.batch, l.w, l.h, l.c, state.net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu);
+    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
 }
 
 void backward_shortcut_layer_gpu(const layer l, network_state state)
 {
-    copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
-    shortcut_gpu(state.net.layers[l.index].delta_gpu, l.w*l.stride, l.h*l.stride, l.n, l.batch, l.stride, l.delta_gpu, 1, l.c);
+    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
+    axpy_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1, state.delta, 1);
+    shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, state.net.layers[l.index].delta_gpu);
 }
 #endif
diff --git a/src/utils.c b/src/utils.c
index 3ad09329..d49d0ce9 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -485,9 +485,9 @@ float rand_normal()
    }
  */
 
-float rand_uniform()
+float rand_uniform(float min, float max)
 {
-    return (float)rand()/RAND_MAX;
+    return ((float)rand()/RAND_MAX * (max - min)) + min;
 }
 
 float **one_hot_encode(float *a, int n, int k)
diff --git a/src/utils.h b/src/utils.h
index 7e13e86f..96bd6cfe 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -34,7 +34,7 @@ int max_index(float *a, int n);
 float constrain(float min, float max, float a);
 float mse_array(float *a, int n);
 float rand_normal();
-float rand_uniform();
+float rand_uniform(float min, float max);
 float sum_array(float *a, int n);
 float mean_array(float *a, int n);
 void mean_arrays(float **a, int n, int els, float *avg);
diff --git a/src/yolo.c b/src/yolo.c
index a6c1e784..6bd4e6b2 100644
--- a/src/yolo.c
+++ b/src/yolo.c
@@ -14,7 +14,7 @@ image voc_labels[20];
 
 void train_yolo(char *cfgfile, char *weightfile)
 {
-    char *train_images = "data/voc.0712.trainval";
+    char *train_images = "/data/voc/train.txt";
     char *backup_directory = "/home/pjreddie/backup/";
     srand(time(0));
     data_seed = time(0);