mirror of
https://github.com/pjreddie/darknet.git
synced 2023-08-10 21:13:14 +03:00
:eyeofthetiger::noseofthetiger::eyeofthetiger2:
This commit is contained in:
parent
b61bcf544e
commit
60e952ba69
6
Makefile
6
Makefile
@ -10,7 +10,7 @@ ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \
|
||||
-gencode arch=compute_52,code=[sm_52,compute_52]
|
||||
|
||||
# This is what I use, uncomment if you know your arch and want to specify
|
||||
# ARCH= -gencode arch=compute_52,code=compute_52
|
||||
ARCH= -gencode arch=compute_52,code=compute_52
|
||||
|
||||
VPATH=./src/
|
||||
EXEC=darknet
|
||||
@ -48,10 +48,10 @@ CFLAGS+= -DCUDNN
|
||||
LDFLAGS+= -lcudnn
|
||||
endif
|
||||
|
||||
OBJ=gemm.o utils.o cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o super.o voxel.o tree.o
|
||||
OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o regressor.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o lsd.o super.o voxel.o tree.o
|
||||
ifeq ($(GPU), 1)
|
||||
LDFLAGS+= -lstdc++
|
||||
OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o
|
||||
OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o
|
||||
endif
|
||||
|
||||
OBJS = $(addprefix $(OBJDIR), $(OBJ))
|
||||
|
@ -1,7 +1,7 @@
|
||||
classes= 80
|
||||
train = /home/pjreddie/data/coco/trainvalno5k.txt
|
||||
#valid = coco_testdev
|
||||
valid = data/coco_val_5k.list
|
||||
valid = coco_testdev
|
||||
#valid = data/coco_val_5k.list
|
||||
names = data/coco.names
|
||||
backup = /home/pjreddie/backup/
|
||||
eval=coco
|
||||
|
135
cfg/go.cfg
Normal file
135
cfg/go.cfg
Normal file
@ -0,0 +1,135 @@
|
||||
[net]
|
||||
batch=512
|
||||
subdivisions=1
|
||||
height=19
|
||||
width=19
|
||||
channels=1
|
||||
momentum=0.9
|
||||
decay=0.0005
|
||||
|
||||
burn_in=1000
|
||||
learning_rate=0.1
|
||||
policy=poly
|
||||
power=4
|
||||
max_batches=10000000
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=1
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=linear
|
||||
|
||||
[reorg]
|
||||
extra=1
|
||||
stride=1
|
||||
|
||||
[softmax]
|
||||
|
||||
[cost]
|
||||
type=sse
|
||||
|
@ -7,13 +7,13 @@ channels=1
|
||||
momentum=0.9
|
||||
decay=0.0005
|
||||
|
||||
learning_rate=0.1
|
||||
learning_rate=0.01
|
||||
policy=poly
|
||||
power=4
|
||||
max_batches=400000
|
||||
max_batches=100000
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -21,7 +21,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -29,7 +29,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -37,7 +37,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -45,7 +45,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -53,7 +53,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -61,7 +61,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -69,7 +69,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -77,7 +77,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -85,7 +85,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -93,7 +93,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -101,7 +101,7 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
@ -109,14 +109,13 @@ activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
[convolutional]
|
||||
filters=192
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=relu
|
||||
batch_normalize=1
|
||||
|
||||
|
||||
[convolutional]
|
||||
filters=1
|
||||
size=1
|
||||
@ -124,6 +123,10 @@ stride=1
|
||||
pad=1
|
||||
activation=linear
|
||||
|
||||
[reorg]
|
||||
extra=1
|
||||
stride=1
|
||||
|
||||
[softmax]
|
||||
|
||||
[cost]
|
||||
|
@ -12,7 +12,7 @@ exposure = 1.5
|
||||
hue=.1
|
||||
|
||||
learning_rate=0.001
|
||||
max_batches = 40100
|
||||
max_batches = 40200
|
||||
policy=steps
|
||||
steps=-1,100,20000,30000
|
||||
scales=.1,10,.1,.1
|
||||
|
244
cfg/yolo-voc.2.0.cfg
Normal file
244
cfg/yolo-voc.2.0.cfg
Normal file
@ -0,0 +1,244 @@
|
||||
[net]
|
||||
batch=64
|
||||
subdivisions=8
|
||||
height=416
|
||||
width=416
|
||||
channels=3
|
||||
momentum=0.9
|
||||
decay=0.0005
|
||||
angle=0
|
||||
saturation = 1.5
|
||||
exposure = 1.5
|
||||
hue=.1
|
||||
|
||||
learning_rate=0.0001
|
||||
max_batches = 45000
|
||||
policy=steps
|
||||
steps=100,25000,35000
|
||||
scales=10,.1,.1
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=32
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[maxpool]
|
||||
size=2
|
||||
stride=2
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=64
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[maxpool]
|
||||
size=2
|
||||
stride=2
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=128
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=64
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=128
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[maxpool]
|
||||
size=2
|
||||
stride=2
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=128
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[maxpool]
|
||||
size=2
|
||||
stride=2
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=512
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=256
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=512
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=256
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=512
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[maxpool]
|
||||
size=2
|
||||
stride=2
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=1024
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=512
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=1024
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=512
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=1024
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
|
||||
#######
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
filters=1024
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
filters=1024
|
||||
activation=leaky
|
||||
|
||||
[route]
|
||||
layers=-9
|
||||
|
||||
[reorg]
|
||||
stride=2
|
||||
|
||||
[route]
|
||||
layers=-1,-3
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
filters=1024
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
filters=125
|
||||
activation=linear
|
||||
|
||||
[region]
|
||||
anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52
|
||||
bias_match=1
|
||||
classes=20
|
||||
coords=4
|
||||
num=5
|
||||
softmax=1
|
||||
jitter=.2
|
||||
rescore=1
|
||||
|
||||
object_scale=5
|
||||
noobject_scale=1
|
||||
class_scale=1
|
||||
coord_scale=1
|
||||
|
||||
absolute=1
|
||||
thresh = .6
|
||||
random=0
|
@ -11,11 +11,12 @@ saturation = 1.5
|
||||
exposure = 1.5
|
||||
hue=.1
|
||||
|
||||
learning_rate=0.0001
|
||||
max_batches = 45000
|
||||
learning_rate=0.001
|
||||
burn_in=1000
|
||||
max_batches = 80200
|
||||
policy=steps
|
||||
steps=100,25000,35000
|
||||
scales=10,.1,.1
|
||||
steps=40000,60000
|
||||
scales=.1,.1
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
@ -203,11 +204,19 @@ activation=leaky
|
||||
[route]
|
||||
layers=-9
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
filters=64
|
||||
activation=leaky
|
||||
|
||||
[reorg]
|
||||
stride=2
|
||||
|
||||
[route]
|
||||
layers=-1,-3
|
||||
layers=-1,-4
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
@ -224,14 +233,15 @@ pad=1
|
||||
filters=125
|
||||
activation=linear
|
||||
|
||||
|
||||
[region]
|
||||
anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52
|
||||
anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071
|
||||
bias_match=1
|
||||
classes=20
|
||||
coords=4
|
||||
num=5
|
||||
softmax=1
|
||||
jitter=.2
|
||||
jitter=.3
|
||||
rescore=1
|
||||
|
||||
object_scale=5
|
||||
@ -241,4 +251,4 @@ coord_scale=1
|
||||
|
||||
absolute=1
|
||||
thresh = .6
|
||||
random=0
|
||||
random=1
|
||||
|
244
cfg/yolo.2.0.cfg
Normal file
244
cfg/yolo.2.0.cfg
Normal file
@ -0,0 +1,244 @@
|
||||
[net]
|
||||
batch=1
|
||||
subdivisions=1
|
||||
width=416
|
||||
height=416
|
||||
channels=3
|
||||
momentum=0.9
|
||||
decay=0.0005
|
||||
angle=0
|
||||
saturation = 1.5
|
||||
exposure = 1.5
|
||||
hue=.1
|
||||
|
||||
learning_rate=0.001
|
||||
max_batches = 120000
|
||||
policy=steps
|
||||
steps=-1,100,80000,100000
|
||||
scales=.1,10,.1,.1
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=32
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[maxpool]
|
||||
size=2
|
||||
stride=2
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=64
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[maxpool]
|
||||
size=2
|
||||
stride=2
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=128
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=64
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=128
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[maxpool]
|
||||
size=2
|
||||
stride=2
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=128
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=256
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[maxpool]
|
||||
size=2
|
||||
stride=2
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=512
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=256
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=512
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=256
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=512
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[maxpool]
|
||||
size=2
|
||||
stride=2
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=1024
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=512
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=1024
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=512
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
filters=1024
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
activation=leaky
|
||||
|
||||
|
||||
#######
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
filters=1024
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
filters=1024
|
||||
activation=leaky
|
||||
|
||||
[route]
|
||||
layers=-9
|
||||
|
||||
[reorg]
|
||||
stride=2
|
||||
|
||||
[route]
|
||||
layers=-1,-3
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
size=3
|
||||
stride=1
|
||||
pad=1
|
||||
filters=1024
|
||||
activation=leaky
|
||||
|
||||
[convolutional]
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
filters=425
|
||||
activation=linear
|
||||
|
||||
[region]
|
||||
anchors = 0.738768,0.874946, 2.42204,2.65704, 4.30971,7.04493, 10.246,4.59428, 12.6868,11.8741
|
||||
bias_match=1
|
||||
classes=80
|
||||
coords=4
|
||||
num=5
|
||||
softmax=1
|
||||
jitter=.2
|
||||
rescore=1
|
||||
|
||||
object_scale=5
|
||||
noobject_scale=1
|
||||
class_scale=1
|
||||
coord_scale=1
|
||||
|
||||
absolute=1
|
||||
thresh = .6
|
||||
random=0
|
32
cfg/yolo.cfg
32
cfg/yolo.cfg
@ -1,8 +1,8 @@
|
||||
[net]
|
||||
batch=1
|
||||
subdivisions=1
|
||||
width=416
|
||||
height=416
|
||||
batch=64
|
||||
subdivisions=8
|
||||
height=608
|
||||
width=608
|
||||
channels=3
|
||||
momentum=0.9
|
||||
decay=0.0005
|
||||
@ -12,10 +12,11 @@ exposure = 1.5
|
||||
hue=.1
|
||||
|
||||
learning_rate=0.001
|
||||
max_batches = 120000
|
||||
burn_in=1000
|
||||
max_batches = 500200
|
||||
policy=steps
|
||||
steps=-1,100,80000,100000
|
||||
scales=.1,10,.1,.1
|
||||
steps=400000,450000
|
||||
scales=.1,.1
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
@ -203,11 +204,19 @@ activation=leaky
|
||||
[route]
|
||||
layers=-9
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
size=1
|
||||
stride=1
|
||||
pad=1
|
||||
filters=64
|
||||
activation=leaky
|
||||
|
||||
[reorg]
|
||||
stride=2
|
||||
|
||||
[route]
|
||||
layers=-1,-3
|
||||
layers=-1,-4
|
||||
|
||||
[convolutional]
|
||||
batch_normalize=1
|
||||
@ -224,14 +233,15 @@ pad=1
|
||||
filters=425
|
||||
activation=linear
|
||||
|
||||
|
||||
[region]
|
||||
anchors = 0.738768,0.874946, 2.42204,2.65704, 4.30971,7.04493, 10.246,4.59428, 12.6868,11.8741
|
||||
anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
|
||||
bias_match=1
|
||||
classes=80
|
||||
coords=4
|
||||
num=5
|
||||
softmax=1
|
||||
jitter=.2
|
||||
jitter=.3
|
||||
rescore=1
|
||||
|
||||
object_scale=5
|
||||
@ -241,4 +251,4 @@ coord_scale=1
|
||||
|
||||
absolute=1
|
||||
thresh = .6
|
||||
random=0
|
||||
random=1
|
||||
|
@ -10,10 +10,10 @@ classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat"
|
||||
|
||||
|
||||
def convert(size, box):
|
||||
dw = 1./size[0]
|
||||
dh = 1./size[1]
|
||||
x = (box[0] + box[1])/2.0
|
||||
y = (box[2] + box[3])/2.0
|
||||
dw = 1./(size[0])
|
||||
dh = 1./(size[1])
|
||||
x = (box[0] + box[1])/2.0 - 1
|
||||
y = (box[2] + box[3])/2.0 - 1
|
||||
w = box[1] - box[0]
|
||||
h = box[3] - box[2]
|
||||
x = x*dw
|
||||
@ -34,7 +34,7 @@ def convert_annotation(year, image_id):
|
||||
for obj in root.iter('object'):
|
||||
difficult = obj.find('difficult').text
|
||||
cls = obj.find('name').text
|
||||
if cls not in classes or int(difficult) == 1:
|
||||
if cls not in classes or int(difficult)==1:
|
||||
continue
|
||||
cls_id = classes.index(cls)
|
||||
xmlbox = obj.find('bndbox')
|
||||
@ -54,3 +54,6 @@ for year, image_set in sets:
|
||||
convert_annotation(year, image_id)
|
||||
list_file.close()
|
||||
|
||||
os.system("cat 2007_train.txt 2007_val.txt 2012_train.txt 2012_val.txt > train.txt")
|
||||
os.system("cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt > train.all.txt")
|
||||
|
||||
|
@ -1,3 +1,4 @@
|
||||
#include "convolutional_layer.h"
|
||||
#include "batchnorm_layer.h"
|
||||
#include "blas.h"
|
||||
#include <stdio.h>
|
||||
@ -5,55 +6,67 @@
|
||||
layer make_batchnorm_layer(int batch, int w, int h, int c)
|
||||
{
|
||||
fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c);
|
||||
layer layer = {0};
|
||||
layer.type = BATCHNORM;
|
||||
layer.batch = batch;
|
||||
layer.h = layer.out_h = h;
|
||||
layer.w = layer.out_w = w;
|
||||
layer.c = layer.out_c = c;
|
||||
layer.output = calloc(h * w * c * batch, sizeof(float));
|
||||
layer.delta = calloc(h * w * c * batch, sizeof(float));
|
||||
layer.inputs = w*h*c;
|
||||
layer.outputs = layer.inputs;
|
||||
layer l = {0};
|
||||
l.type = BATCHNORM;
|
||||
l.batch = batch;
|
||||
l.h = l.out_h = h;
|
||||
l.w = l.out_w = w;
|
||||
l.c = l.out_c = c;
|
||||
l.output = calloc(h * w * c * batch, sizeof(float));
|
||||
l.delta = calloc(h * w * c * batch, sizeof(float));
|
||||
l.inputs = w*h*c;
|
||||
l.outputs = l.inputs;
|
||||
|
||||
layer.scales = calloc(c, sizeof(float));
|
||||
layer.scale_updates = calloc(c, sizeof(float));
|
||||
l.scales = calloc(c, sizeof(float));
|
||||
l.scale_updates = calloc(c, sizeof(float));
|
||||
l.biases = calloc(c, sizeof(float));
|
||||
l.bias_updates = calloc(c, sizeof(float));
|
||||
int i;
|
||||
for(i = 0; i < c; ++i){
|
||||
layer.scales[i] = 1;
|
||||
l.scales[i] = 1;
|
||||
}
|
||||
|
||||
layer.mean = calloc(c, sizeof(float));
|
||||
layer.variance = calloc(c, sizeof(float));
|
||||
l.mean = calloc(c, sizeof(float));
|
||||
l.variance = calloc(c, sizeof(float));
|
||||
|
||||
layer.rolling_mean = calloc(c, sizeof(float));
|
||||
layer.rolling_variance = calloc(c, sizeof(float));
|
||||
l.rolling_mean = calloc(c, sizeof(float));
|
||||
l.rolling_variance = calloc(c, sizeof(float));
|
||||
|
||||
layer.forward = forward_batchnorm_layer;
|
||||
layer.backward = backward_batchnorm_layer;
|
||||
l.forward = forward_batchnorm_layer;
|
||||
l.backward = backward_batchnorm_layer;
|
||||
#ifdef GPU
|
||||
layer.forward_gpu = forward_batchnorm_layer_gpu;
|
||||
layer.backward_gpu = backward_batchnorm_layer_gpu;
|
||||
l.forward_gpu = forward_batchnorm_layer_gpu;
|
||||
l.backward_gpu = backward_batchnorm_layer_gpu;
|
||||
|
||||
layer.output_gpu = cuda_make_array(layer.output, h * w * c * batch);
|
||||
layer.delta_gpu = cuda_make_array(layer.delta, h * w * c * batch);
|
||||
l.output_gpu = cuda_make_array(l.output, h * w * c * batch);
|
||||
l.delta_gpu = cuda_make_array(l.delta, h * w * c * batch);
|
||||
|
||||
layer.scales_gpu = cuda_make_array(layer.scales, c);
|
||||
layer.scale_updates_gpu = cuda_make_array(layer.scale_updates, c);
|
||||
l.biases_gpu = cuda_make_array(l.biases, c);
|
||||
l.bias_updates_gpu = cuda_make_array(l.bias_updates, c);
|
||||
|
||||
layer.mean_gpu = cuda_make_array(layer.mean, c);
|
||||
layer.variance_gpu = cuda_make_array(layer.variance, c);
|
||||
l.scales_gpu = cuda_make_array(l.scales, c);
|
||||
l.scale_updates_gpu = cuda_make_array(l.scale_updates, c);
|
||||
|
||||
layer.rolling_mean_gpu = cuda_make_array(layer.mean, c);
|
||||
layer.rolling_variance_gpu = cuda_make_array(layer.variance, c);
|
||||
l.mean_gpu = cuda_make_array(l.mean, c);
|
||||
l.variance_gpu = cuda_make_array(l.variance, c);
|
||||
|
||||
layer.mean_delta_gpu = cuda_make_array(layer.mean, c);
|
||||
layer.variance_delta_gpu = cuda_make_array(layer.variance, c);
|
||||
l.rolling_mean_gpu = cuda_make_array(l.mean, c);
|
||||
l.rolling_variance_gpu = cuda_make_array(l.variance, c);
|
||||
|
||||
layer.x_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
|
||||
layer.x_norm_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs);
|
||||
l.mean_delta_gpu = cuda_make_array(l.mean, c);
|
||||
l.variance_delta_gpu = cuda_make_array(l.variance, c);
|
||||
|
||||
l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs);
|
||||
l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs);
|
||||
#ifdef CUDNN
|
||||
cudnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
|
||||
cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
|
||||
|
||||
#endif
|
||||
#endif
|
||||
return layer;
|
||||
return l;
|
||||
}
|
||||
|
||||
void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates)
|
||||
@ -108,7 +121,7 @@ void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_del
|
||||
for(f = 0; f < filters; ++f){
|
||||
for(k = 0; k < spatial; ++k){
|
||||
int index = j*filters*spatial + f*spatial + k;
|
||||
delta[index] = delta[index] * 1./(sqrt(variance[f]) + .00001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
|
||||
delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -130,10 +143,10 @@ void forward_batchnorm_layer(layer l, network_state state)
|
||||
mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
|
||||
variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);
|
||||
|
||||
scal_cpu(l.out_c, .9, l.rolling_mean, 1);
|
||||
axpy_cpu(l.out_c, .1, l.mean, 1, l.rolling_mean, 1);
|
||||
scal_cpu(l.out_c, .9, l.rolling_variance, 1);
|
||||
axpy_cpu(l.out_c, .1, l.variance, 1, l.rolling_variance, 1);
|
||||
scal_cpu(l.out_c, .99, l.rolling_mean, 1);
|
||||
axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1);
|
||||
scal_cpu(l.out_c, .99, l.rolling_variance, 1);
|
||||
axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1);
|
||||
|
||||
copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
|
||||
normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
@ -142,10 +155,12 @@ void forward_batchnorm_layer(layer l, network_state state)
|
||||
normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
}
|
||||
scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
}
|
||||
|
||||
void backward_batchnorm_layer(const layer l, network_state state)
|
||||
{
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h);
|
||||
backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);
|
||||
|
||||
scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
@ -179,6 +194,28 @@ void forward_batchnorm_layer_gpu(layer l, network_state state)
|
||||
l.out_h = l.out_w = 1;
|
||||
}
|
||||
if (state.train) {
|
||||
#ifdef CUDNN
|
||||
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
|
||||
float one = 1;
|
||||
float zero = 0;
|
||||
cudnnBatchNormalizationForwardTraining(cudnn_handle(),
|
||||
CUDNN_BATCHNORM_SPATIAL,
|
||||
&one,
|
||||
&zero,
|
||||
l.dstTensorDesc,
|
||||
l.x_gpu,
|
||||
l.dstTensorDesc,
|
||||
l.output_gpu,
|
||||
l.normTensorDesc,
|
||||
l.scales_gpu,
|
||||
l.biases_gpu,
|
||||
.01,
|
||||
l.rolling_mean_gpu,
|
||||
l.rolling_variance_gpu,
|
||||
.00001,
|
||||
l.mean_gpu,
|
||||
l.variance_gpu);
|
||||
#else
|
||||
fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu);
|
||||
fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu);
|
||||
|
||||
@ -190,15 +227,45 @@ void forward_batchnorm_layer_gpu(layer l, network_state state)
|
||||
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1);
|
||||
normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1);
|
||||
|
||||
scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
|
||||
#endif
|
||||
} else {
|
||||
normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h);
|
||||
}
|
||||
|
||||
scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
}
|
||||
|
||||
void backward_batchnorm_layer_gpu(const layer l, network_state state)
|
||||
{
|
||||
#ifdef CUDNN
|
||||
float one = 1;
|
||||
float zero = 0;
|
||||
cudnnBatchNormalizationBackward(cudnn_handle(),
|
||||
CUDNN_BATCHNORM_SPATIAL,
|
||||
&one,
|
||||
&zero,
|
||||
&one,
|
||||
&one,
|
||||
l.dstTensorDesc,
|
||||
l.x_gpu,
|
||||
l.dstTensorDesc,
|
||||
l.delta_gpu,
|
||||
l.dstTensorDesc,
|
||||
l.x_norm_gpu,
|
||||
l.normTensorDesc,
|
||||
l.scales_gpu,
|
||||
l.scale_updates_gpu,
|
||||
l.bias_updates_gpu,
|
||||
.00001,
|
||||
l.mean_gpu,
|
||||
l.variance_gpu);
|
||||
copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1);
|
||||
#else
|
||||
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h);
|
||||
backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates_gpu);
|
||||
|
||||
scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w);
|
||||
@ -206,6 +273,7 @@ void backward_batchnorm_layer_gpu(const layer l, network_state state)
|
||||
fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu);
|
||||
fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu);
|
||||
normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu);
|
||||
#endif
|
||||
if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1);
|
||||
}
|
||||
#endif
|
||||
|
33
src/blas.c
33
src/blas.c
@ -179,11 +179,21 @@ void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
}
|
||||
else {
|
||||
error[i] = 2*abs_val - 1;
|
||||
delta[i] = (diff < 0) ? -1 : 1;
|
||||
delta[i] = (diff < 0) ? 1 : -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
float diff = truth[i] - pred[i];
|
||||
error[i] = fabs(diff);
|
||||
delta[i] = diff > 0 ? 1 : -1;
|
||||
}
|
||||
}
|
||||
|
||||
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
{
|
||||
int i;
|
||||
@ -202,21 +212,32 @@ float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
|
||||
return dot;
|
||||
}
|
||||
|
||||
void softmax(float *input, int n, float temp, float *output)
|
||||
void softmax(float *input, int n, float temp, int stride, float *output)
|
||||
{
|
||||
int i;
|
||||
float sum = 0;
|
||||
float largest = -FLT_MAX;
|
||||
for(i = 0; i < n; ++i){
|
||||
if(input[i] > largest) largest = input[i];
|
||||
if(input[i*stride] > largest) largest = input[i*stride];
|
||||
}
|
||||
for(i = 0; i < n; ++i){
|
||||
float e = exp(input[i]/temp - largest/temp);
|
||||
float e = exp(input[i*stride]/temp - largest/temp);
|
||||
sum += e;
|
||||
output[i] = e;
|
||||
output[i*stride] = e;
|
||||
}
|
||||
for(i = 0; i < n; ++i){
|
||||
output[i] /= sum;
|
||||
output[i*stride] /= sum;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
|
||||
{
|
||||
int g, b;
|
||||
for(b = 0; b < batch; ++b){
|
||||
for(g = 0; g < groups; ++g){
|
||||
softmax(input + b*batch_offset + g*group_offset, n, temp, stride, output + b*batch_offset + g*group_offset);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -33,9 +33,11 @@ void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_del
|
||||
|
||||
void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
void l1_cpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);
|
||||
|
||||
void softmax(float *input, int n, float temp, float *output);
|
||||
void softmax(float *input, int n, float temp, int stride, float *output);
|
||||
void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
|
||||
|
||||
#ifdef GPU
|
||||
#include "cuda.h"
|
||||
@ -45,6 +47,7 @@ void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float
|
||||
void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
|
||||
void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
|
||||
void scal_ongpu(int N, float ALPHA, float * X, int INCX);
|
||||
void add_ongpu(int N, float ALPHA, float * X, int INCX);
|
||||
void supp_ongpu(int N, float ALPHA, float * X, int INCX);
|
||||
void mask_ongpu(int N, float * X, float mask_num, float * mask);
|
||||
void const_ongpu(int N, float ALPHA, float *X, int INCX);
|
||||
@ -72,13 +75,14 @@ void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int
|
||||
|
||||
void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
void l2_gpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
void l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
|
||||
void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc);
|
||||
void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c);
|
||||
void mult_add_into_gpu(int num, float *a, float *b, float *c);
|
||||
|
||||
void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
|
||||
|
||||
void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output);
|
||||
void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
|
||||
void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);
|
||||
|
||||
void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);
|
||||
|
@ -161,7 +161,7 @@ __global__ void normalize_kernel(int N, float *x, float *mean, float *variance,
|
||||
if (index >= N) return;
|
||||
int f = (index/spatial)%filters;
|
||||
|
||||
x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f);
|
||||
x[index] = (x[index] - mean[f])/(sqrt(variance[f] + .00001f));
|
||||
}
|
||||
|
||||
__global__ void normalize_delta_kernel(int N, float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
|
||||
@ -170,7 +170,7 @@ __global__ void normalize_delta_kernel(int N, float *x, float *mean, float *vari
|
||||
if (index >= N) return;
|
||||
int f = (index/spatial)%filters;
|
||||
|
||||
delta[index] = delta[index] * 1./(sqrt(variance[f]) + .000001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
|
||||
delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch);
|
||||
}
|
||||
|
||||
extern "C" void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta)
|
||||
@ -192,7 +192,7 @@ __global__ void variance_delta_kernel(float *x, float *delta, float *mean, floa
|
||||
variance_delta[i] += delta[index]*(x[index] - mean[i]);
|
||||
}
|
||||
}
|
||||
variance_delta[i] *= -.5 * pow(variance[i] + .000001f, (float)(-3./2.));
|
||||
variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.));
|
||||
}
|
||||
|
||||
__global__ void accumulate_kernel(float *x, int n, int groups, float *sum)
|
||||
@ -224,12 +224,14 @@ __global__ void fast_mean_delta_kernel(float *delta, float *variance, int batch,
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if(id == 0){
|
||||
mean_delta[filter] = 0;
|
||||
for(i = 0; i < threads; ++i){
|
||||
mean_delta[filter] += local[i];
|
||||
}
|
||||
mean_delta[filter] *= (-1./sqrt(variance[filter] + .000001f));
|
||||
mean_delta[filter] *= (-1./sqrt(variance[filter] + .00001f));
|
||||
}
|
||||
}
|
||||
|
||||
@ -252,12 +254,14 @@ __global__ void fast_variance_delta_kernel(float *x, float *delta, float *mean,
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if(id == 0){
|
||||
variance_delta[filter] = 0;
|
||||
for(i = 0; i < threads; ++i){
|
||||
variance_delta[filter] += local[i];
|
||||
}
|
||||
variance_delta[filter] *= -.5 * pow(variance[filter] + .000001f, (float)(-3./2.));
|
||||
variance_delta[filter] *= -.5 * pow(variance[filter] + .00001f, (float)(-3./2.));
|
||||
}
|
||||
}
|
||||
|
||||
@ -274,7 +278,7 @@ __global__ void mean_delta_kernel(float *delta, float *variance, int batch, int
|
||||
mean_delta[i] += delta[index];
|
||||
}
|
||||
}
|
||||
mean_delta[i] *= (-1./sqrt(variance[i] + .000001f));
|
||||
mean_delta[i] *= (-1./sqrt(variance[i] + .00001f));
|
||||
}
|
||||
|
||||
extern "C" void mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta)
|
||||
@ -391,6 +395,12 @@ __global__ void supp_kernel(int N, float ALPHA, float *X, int INCX)
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void add_kernel(int N, float ALPHA, float *X, int INCX)
|
||||
{
|
||||
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(i < N) X[i*INCX] += ALPHA;
|
||||
}
|
||||
|
||||
__global__ void scal_kernel(int N, float ALPHA, float *X, int INCX)
|
||||
{
|
||||
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
@ -447,6 +457,8 @@ __global__ void fast_mean_kernel(float *x, int batch, int filters, int spatial,
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if(id == 0){
|
||||
mean[filter] = 0;
|
||||
for(i = 0; i < threads; ++i){
|
||||
@ -475,6 +487,8 @@ __global__ void fast_variance_kernel(float *x, float *mean, int batch, int filt
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if(id == 0){
|
||||
variance[filter] = 0;
|
||||
for(i = 0; i < threads; ++i){
|
||||
@ -593,6 +607,12 @@ extern "C" void constrain_ongpu(int N, float ALPHA, float * X, int INCX)
|
||||
}
|
||||
|
||||
|
||||
extern "C" void add_ongpu(int N, float ALPHA, float * X, int INCX)
|
||||
{
|
||||
add_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX)
|
||||
{
|
||||
scal_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
|
||||
@ -658,7 +678,7 @@ __global__ void smooth_l1_kernel(int n, float *pred, float *truth, float *delta,
|
||||
}
|
||||
else {
|
||||
error[i] = 2*abs_val - 1;
|
||||
delta[i] = (diff < 0) ? -1 : 1;
|
||||
delta[i] = (diff > 0) ? 1 : -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -685,6 +705,23 @@ extern "C" void l2_gpu(int n, float *pred, float *truth, float *delta, float *er
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
__global__ void l1_kernel(int n, float *pred, float *truth, float *delta, float *error)
|
||||
{
|
||||
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(i < n){
|
||||
float diff = truth[i] - pred[i];
|
||||
error[i] = abs(diff);
|
||||
delta[i] = (diff > 0) ? 1 : -1;
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void l1_gpu(int n, float *pred, float *truth, float *delta, float *error)
|
||||
{
|
||||
l1_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float *c)
|
||||
@ -732,36 +769,36 @@ extern "C" void mult_add_into_gpu(int num, float *a, float *b, float *c)
|
||||
}
|
||||
|
||||
|
||||
__device__ void softmax_device(int n, float *input, float temp, float *output)
|
||||
__device__ void softmax_device(float *input, int n, float temp, int stride, float *output)
|
||||
{
|
||||
int i;
|
||||
float sum = 0;
|
||||
float largest = -INFINITY;
|
||||
for(i = 0; i < n; ++i){
|
||||
int val = input[i];
|
||||
int val = input[i*stride];
|
||||
largest = (val>largest) ? val : largest;
|
||||
}
|
||||
for(i = 0; i < n; ++i){
|
||||
float e = exp(input[i]/temp - largest/temp);
|
||||
float e = exp(input[i*stride]/temp - largest/temp);
|
||||
sum += e;
|
||||
output[i] = e;
|
||||
output[i*stride] = e;
|
||||
}
|
||||
for(i = 0; i < n; ++i){
|
||||
output[i] /= sum;
|
||||
output[i*stride] /= sum;
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void softmax_kernel(int n, int offset, int batch, float *input, float temp, float *output)
|
||||
__global__ void softmax_kernel(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
|
||||
{
|
||||
int b = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(b >= batch) return;
|
||||
softmax_device(n, input + b*offset, temp, output + b*offset);
|
||||
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if (id >= batch*groups) return;
|
||||
int b = id / groups;
|
||||
int g = id % groups;
|
||||
softmax_device(input + b*batch_offset + g*group_offset, n, temp, stride, output + b*batch_offset + g*group_offset);
|
||||
}
|
||||
|
||||
extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output)
|
||||
extern "C" void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
|
||||
{
|
||||
int inputs = n;
|
||||
int batch = groups;
|
||||
softmax_kernel<<<cuda_gridsize(batch), BLOCK>>>(inputs, offset, batch, input, temp, output);
|
||||
softmax_kernel<<<cuda_gridsize(batch*groups), BLOCK>>>(input, n, batch, batch_offset, groups, group_offset, stride, temp, output);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
@ -3,13 +3,13 @@
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
box float_to_box(float *f)
|
||||
box float_to_box(float *f, int stride)
|
||||
{
|
||||
box b;
|
||||
b.x = f[0];
|
||||
b.y = f[1];
|
||||
b.w = f[2];
|
||||
b.h = f[3];
|
||||
b.y = f[1*stride];
|
||||
b.w = f[2*stride];
|
||||
b.h = f[3*stride];
|
||||
return b;
|
||||
}
|
||||
|
||||
|
@ -9,7 +9,7 @@ typedef struct{
|
||||
float dx, dy, dw, dh;
|
||||
} dbox;
|
||||
|
||||
box float_to_box(float *f);
|
||||
box float_to_box(float *f, int stride);
|
||||
float box_iou(box a, box b);
|
||||
float box_rmse(box a, box b);
|
||||
dbox diou(box a, box b);
|
||||
|
@ -379,7 +379,7 @@ void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
|
||||
float *pred = calloc(classes, sizeof(float));
|
||||
for(j = 0; j < 10; ++j){
|
||||
float *p = network_predict(net, images[j].data);
|
||||
if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1);
|
||||
if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1, 1);
|
||||
axpy_cpu(classes, 1, p, 1, pred, 1);
|
||||
free_image(images[j]);
|
||||
}
|
||||
@ -440,7 +440,7 @@ void validate_classifier_full(char *datacfg, char *filename, char *weightfile)
|
||||
//show_image(crop, "cropped");
|
||||
//cvWaitKey(0);
|
||||
float *pred = network_predict(net, resized.data);
|
||||
if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1);
|
||||
if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1, 1);
|
||||
|
||||
free_image(im);
|
||||
free_image(resized);
|
||||
@ -502,7 +502,7 @@ void validate_classifier_single(char *datacfg, char *filename, char *weightfile)
|
||||
//show_image(crop, "cropped");
|
||||
//cvWaitKey(0);
|
||||
float *pred = network_predict(net, crop.data);
|
||||
if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1);
|
||||
if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1, 1);
|
||||
|
||||
if(resized.data != im.data) free_image(resized);
|
||||
free_image(im);
|
||||
@ -563,7 +563,7 @@ void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
|
||||
image r = resize_min(im, scales[j]);
|
||||
resize_network(&net, r.w, r.h);
|
||||
float *p = network_predict(net, r.data);
|
||||
if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1);
|
||||
if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1 , 1);
|
||||
axpy_cpu(classes, 1, p, 1, pred, 1);
|
||||
flip_image(r);
|
||||
p = network_predict(net, r.data);
|
||||
@ -703,7 +703,7 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
|
||||
float *X = r.data;
|
||||
time=clock();
|
||||
float *predictions = network_predict(net, X);
|
||||
if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 0);
|
||||
if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 0, 1);
|
||||
top_k(predictions, net.outputs, top, indexes);
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
|
||||
for(i = 0; i < top; ++i){
|
||||
@ -1084,7 +1084,7 @@ void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_ind
|
||||
show_image(in, "Classifier");
|
||||
|
||||
float *predictions = network_predict(net, in_s.data);
|
||||
if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 1);
|
||||
if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 1, 1);
|
||||
top_predictions(net, top, indexes);
|
||||
|
||||
printf("\033[2J");
|
||||
|
@ -117,26 +117,70 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
|
||||
|
||||
if (l.batch_normalize) {
|
||||
forward_batchnorm_layer_gpu(l, state);
|
||||
} else {
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
}
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
|
||||
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||
//if(l.dot > 0) dot_error_gpu(l);
|
||||
if(l.binary || l.xnor) swap_binary(&l);
|
||||
}
|
||||
|
||||
__global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, float rate, float *delta)
|
||||
{
|
||||
int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
|
||||
if(id >= n) return;
|
||||
|
||||
int j = id % w;
|
||||
id /= w;
|
||||
int i = id % h;
|
||||
id /= h;
|
||||
int k = id % c;
|
||||
id /= c;
|
||||
int b = id;
|
||||
|
||||
int w_offset = -(size/2.);
|
||||
int h_offset = -(size/2.);
|
||||
|
||||
int out_index = j + w*(i + h*(k + c*b));
|
||||
int l, m;
|
||||
for(l = 0; l < size; ++l){
|
||||
for(m = 0; m < size; ++m){
|
||||
int cur_h = h_offset + i + l;
|
||||
int cur_w = w_offset + j + m;
|
||||
int index = cur_w + w*(cur_h + h*(k + b*c));
|
||||
int valid = (cur_h >= 0 && cur_h < h &&
|
||||
cur_w >= 0 && cur_w < w);
|
||||
delta[out_index] += valid ? rate*(x[index] - x[out_index]) : 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void smooth_layer(layer l, int size, float rate)
|
||||
{
|
||||
int h = l.out_h;
|
||||
int w = l.out_w;
|
||||
int c = l.out_c;
|
||||
|
||||
size_t n = h*w*c*l.batch;
|
||||
|
||||
smooth_kernel<<<cuda_gridsize(n), BLOCK>>>(l.output_gpu, n, l.w, l.h, l.c, size, rate, l.delta_gpu);
|
||||
check_error(cudaPeekAtLastError());
|
||||
}
|
||||
|
||||
void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
|
||||
{
|
||||
if(l.smooth){
|
||||
smooth_layer(l, 5, l.smooth);
|
||||
}
|
||||
//constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
|
||||
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
|
||||
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
|
||||
if(l.batch_normalize){
|
||||
backward_batchnorm_layer_gpu(l, state);
|
||||
//axpy_ongpu(l.outputs*l.batch, -state.net.decay, l.x_gpu, 1, l.delta_gpu, 1);
|
||||
} else {
|
||||
//axpy_ongpu(l.outputs*l.batch, -state.net.decay, l.output_gpu, 1, l.delta_gpu, 1);
|
||||
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
}
|
||||
float *original_input = state.input;
|
||||
|
||||
|
@ -23,11 +23,11 @@ void swap_binary(convolutional_layer *l)
|
||||
l->weights = l->binary_weights;
|
||||
l->binary_weights = swap;
|
||||
|
||||
#ifdef GPU
|
||||
#ifdef GPU
|
||||
swap = l->weights_gpu;
|
||||
l->weights_gpu = l->binary_weights_gpu;
|
||||
l->binary_weights_gpu = swap;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
void binarize_weights(float *weights, int n, int size, float *binary)
|
||||
@ -80,23 +80,15 @@ int convolutional_out_width(convolutional_layer l)
|
||||
|
||||
image get_convolutional_image(convolutional_layer l)
|
||||
{
|
||||
int h,w,c;
|
||||
h = convolutional_out_height(l);
|
||||
w = convolutional_out_width(l);
|
||||
c = l.n;
|
||||
return float_to_image(w,h,c,l.output);
|
||||
return float_to_image(l.out_w,l.out_h,l.out_c,l.output);
|
||||
}
|
||||
|
||||
image get_convolutional_delta(convolutional_layer l)
|
||||
{
|
||||
int h,w,c;
|
||||
h = convolutional_out_height(l);
|
||||
w = convolutional_out_width(l);
|
||||
c = l.n;
|
||||
return float_to_image(w,h,c,l.delta);
|
||||
return float_to_image(l.out_w,l.out_h,l.out_c,l.delta);
|
||||
}
|
||||
|
||||
size_t get_workspace_size(layer l){
|
||||
static size_t get_workspace_size(layer l){
|
||||
#ifdef CUDNN
|
||||
if(gpu_index >= 0){
|
||||
size_t most = 0;
|
||||
@ -127,7 +119,7 @@ size_t get_workspace_size(layer l){
|
||||
if (s > most) most = s;
|
||||
return most;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float);
|
||||
}
|
||||
|
||||
@ -141,6 +133,7 @@ void cudnn_convolutional_setup(layer *l)
|
||||
|
||||
cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
|
||||
cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
|
||||
cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);
|
||||
cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
|
||||
cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
|
||||
@ -198,8 +191,8 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
|
||||
// float scale = 1./sqrt(size*size*c);
|
||||
float scale = sqrt(2./(size*size*c));
|
||||
for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1);
|
||||
int out_h = convolutional_out_height(l);
|
||||
int out_w = convolutional_out_width(l);
|
||||
int out_h = convolutional_out_height(l);
|
||||
l.out_h = out_h;
|
||||
l.out_w = out_w;
|
||||
l.out_c = n;
|
||||
@ -291,6 +284,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int
|
||||
l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
|
||||
}
|
||||
#ifdef CUDNN
|
||||
cudnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.srcTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
cudnnCreateFilterDescriptor(&l.weightDesc);
|
||||
@ -426,8 +420,8 @@ void backward_bias(float *bias_updates, float *delta, int batch, int n, int size
|
||||
|
||||
void forward_convolutional_layer(convolutional_layer l, network_state state)
|
||||
{
|
||||
int out_h = convolutional_out_height(l);
|
||||
int out_w = convolutional_out_width(l);
|
||||
int out_h = l.out_h;
|
||||
int out_w = l.out_w;
|
||||
int i;
|
||||
|
||||
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
|
||||
@ -458,8 +452,9 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
|
||||
|
||||
if(l.batch_normalize){
|
||||
forward_batchnorm_layer(l, state);
|
||||
} else {
|
||||
add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
|
||||
}
|
||||
add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
|
||||
|
||||
activate_array(l.output, m*n*l.batch, l.activation);
|
||||
if(l.binary || l.xnor) swap_binary(&l);
|
||||
@ -470,14 +465,14 @@ void backward_convolutional_layer(convolutional_layer l, network_state state)
|
||||
int i;
|
||||
int m = l.n;
|
||||
int n = l.size*l.size*l.c;
|
||||
int k = convolutional_out_height(l)*
|
||||
convolutional_out_width(l);
|
||||
int k = l.out_w*l.out_h;
|
||||
|
||||
gradient_array(l.output, m*k*l.batch, l.activation, l.delta);
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
|
||||
|
||||
if(l.batch_normalize){
|
||||
backward_batchnorm_layer(l, state);
|
||||
} else {
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.n, k);
|
||||
}
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
|
@ -12,6 +12,7 @@ COST_TYPE get_cost_type(char *s)
|
||||
if (strcmp(s, "sse")==0) return SSE;
|
||||
if (strcmp(s, "masked")==0) return MASKED;
|
||||
if (strcmp(s, "smooth")==0) return SMOOTH;
|
||||
if (strcmp(s, "L1")==0) return L1;
|
||||
fprintf(stderr, "Couldn't find cost type %s, going with SSE\n", s);
|
||||
return SSE;
|
||||
}
|
||||
@ -25,6 +26,8 @@ char *get_cost_string(COST_TYPE a)
|
||||
return "masked";
|
||||
case SMOOTH:
|
||||
return "smooth";
|
||||
case L1:
|
||||
return "L1";
|
||||
}
|
||||
return "sse";
|
||||
}
|
||||
@ -81,6 +84,8 @@ void forward_cost_layer(cost_layer l, network_state state)
|
||||
}
|
||||
if(l.cost_type == SMOOTH){
|
||||
smooth_l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
|
||||
}else if(l.cost_type == L1){
|
||||
l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
|
||||
} else {
|
||||
l2_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output);
|
||||
}
|
||||
@ -116,12 +121,18 @@ int float_abs_compare (const void * a, const void * b)
|
||||
void forward_cost_layer_gpu(cost_layer l, network_state state)
|
||||
{
|
||||
if (!state.truth) return;
|
||||
if(l.smooth){
|
||||
scal_ongpu(l.batch*l.inputs, (1-l.smooth), state.truth, 1);
|
||||
add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, state.truth, 1);
|
||||
}
|
||||
if (l.cost_type == MASKED) {
|
||||
mask_ongpu(l.batch*l.inputs, state.input, SECRET_NUM, state.truth);
|
||||
}
|
||||
|
||||
if(l.cost_type == SMOOTH){
|
||||
smooth_l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
|
||||
} else if (l.cost_type == L1){
|
||||
l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
|
||||
} else {
|
||||
l2_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu);
|
||||
}
|
||||
@ -136,6 +147,10 @@ void forward_cost_layer_gpu(cost_layer l, network_state state)
|
||||
supp_ongpu(l.batch*l.inputs, thresh, l.delta_gpu, 1);
|
||||
}
|
||||
|
||||
if(l.thresh){
|
||||
supp_ongpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1);
|
||||
}
|
||||
|
||||
cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs);
|
||||
l.cost[0] = sum_array(l.output, l.batch*l.inputs);
|
||||
}
|
||||
|
@ -157,4 +157,13 @@ void cuda_pull_array(float *x_gpu, float *x, size_t n)
|
||||
check_error(status);
|
||||
}
|
||||
|
||||
float cuda_mag_array(float *x_gpu, size_t n)
|
||||
{
|
||||
float *temp = calloc(n, sizeof(float));
|
||||
cuda_pull_array(x_gpu, temp, n);
|
||||
float m = mag_array(temp, n);
|
||||
free(temp);
|
||||
return m;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
@ -26,6 +26,7 @@ void cuda_free(float *x_gpu);
|
||||
void cuda_random(float *x_gpu, size_t n);
|
||||
float cuda_compare(float *x_gpu, float *x, size_t n, char *s);
|
||||
dim3 cuda_gridsize(size_t n);
|
||||
float cuda_mag_array(float *x_gpu, size_t n);
|
||||
|
||||
#ifdef CUDNN
|
||||
cudnnHandle_t cudnn_handle();
|
||||
|
@ -24,6 +24,7 @@ extern void run_nightmare(int argc, char **argv);
|
||||
extern void run_dice(int argc, char **argv);
|
||||
extern void run_compare(int argc, char **argv);
|
||||
extern void run_classifier(int argc, char **argv);
|
||||
extern void run_regressor(int argc, char **argv);
|
||||
extern void run_char_rnn(int argc, char **argv);
|
||||
extern void run_vid_rnn(int argc, char **argv);
|
||||
extern void run_tag(int argc, char **argv);
|
||||
@ -31,6 +32,7 @@ extern void run_cifar(int argc, char **argv);
|
||||
extern void run_go(int argc, char **argv);
|
||||
extern void run_art(int argc, char **argv);
|
||||
extern void run_super(int argc, char **argv);
|
||||
extern void run_lsd(int argc, char **argv);
|
||||
|
||||
void average(int argc, char *argv[])
|
||||
{
|
||||
@ -95,7 +97,7 @@ void speed(char *cfgfile, int tics)
|
||||
set_batch_network(&net, 1);
|
||||
int i;
|
||||
time_t start = time(0);
|
||||
image im = make_image(net.w, net.h, net.c);
|
||||
image im = make_image(net.w, net.h, net.c*net.batch);
|
||||
for(i = 0; i < tics; ++i){
|
||||
network_predict(net, im.data);
|
||||
}
|
||||
@ -150,12 +152,24 @@ void oneoff(char *cfgfile, char *weightfile, char *outfile)
|
||||
save_weights(net, outfile);
|
||||
}
|
||||
|
||||
void oneoff2(char *cfgfile, char *weightfile, char *outfile, int l)
|
||||
{
|
||||
gpu_index = -1;
|
||||
network net = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
load_weights_upto(&net, weightfile, 0, net.n);
|
||||
load_weights_upto(&net, weightfile, l, net.n);
|
||||
}
|
||||
*net.seen = 0;
|
||||
save_weights_upto(net, outfile, net.n);
|
||||
}
|
||||
|
||||
void partial(char *cfgfile, char *weightfile, char *outfile, int max)
|
||||
{
|
||||
gpu_index = -1;
|
||||
network net = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
load_weights_upto(&net, weightfile, max);
|
||||
load_weights_upto(&net, weightfile, 0, max);
|
||||
}
|
||||
*net.seen = 0;
|
||||
save_weights_upto(net, outfile, max);
|
||||
@ -380,6 +394,8 @@ int main(int argc, char **argv)
|
||||
run_voxel(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "super")){
|
||||
run_super(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "lsd")){
|
||||
run_lsd(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "detector")){
|
||||
run_detector(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "detect")){
|
||||
@ -400,6 +416,8 @@ int main(int argc, char **argv)
|
||||
predict_classifier("cfg/imagenet1k.data", argv[2], argv[3], argv[4], 5);
|
||||
} else if (0 == strcmp(argv[1], "classifier")){
|
||||
run_classifier(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "regressor")){
|
||||
run_regressor(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "art")){
|
||||
run_art(argc, argv);
|
||||
} else if (0 == strcmp(argv[1], "tag")){
|
||||
@ -436,6 +454,8 @@ int main(int argc, char **argv)
|
||||
speed(argv[2], (argc > 3 && argv[3]) ? atoi(argv[3]) : 0);
|
||||
} else if (0 == strcmp(argv[1], "oneoff")){
|
||||
oneoff(argv[2], argv[3], argv[4]);
|
||||
} else if (0 == strcmp(argv[1], "oneoff2")){
|
||||
oneoff2(argv[2], argv[3], argv[4], atoi(argv[5]));
|
||||
} else if (0 == strcmp(argv[1], "partial")){
|
||||
partial(argv[2], argv[3], argv[4], atoi(argv[5]));
|
||||
} else if (0 == strcmp(argv[1], "average")){
|
||||
|
100
src/data.c
100
src/data.c
@ -317,7 +317,7 @@ void fill_truth_detection(char *path, int num_boxes, float *truth, int classes,
|
||||
h = boxes[i].h;
|
||||
id = boxes[i].id;
|
||||
|
||||
if ((w < .005 || h < .005)) continue;
|
||||
if ((w < .001 || h < .001)) continue;
|
||||
|
||||
truth[i*5+0] = x;
|
||||
truth[i*5+1] = y;
|
||||
@ -393,7 +393,7 @@ void fill_truth(char *path, char **labels, int k, float *truth)
|
||||
++count;
|
||||
}
|
||||
}
|
||||
if(count != 1) printf("Too many or too few labels: %d, %s\n", count, path);
|
||||
if(count != 1 && (k != 1 || count != 0)) printf("Too many or too few labels: %d, %s\n", count, path);
|
||||
}
|
||||
|
||||
void fill_hierarchy(float *truth, int k, tree *hierarchy)
|
||||
@ -428,6 +428,24 @@ void fill_hierarchy(float *truth, int k, tree *hierarchy)
|
||||
}
|
||||
}
|
||||
|
||||
matrix load_regression_labels_paths(char **paths, int n)
|
||||
{
|
||||
matrix y = make_matrix(n, 1);
|
||||
int i;
|
||||
for(i = 0; i < n; ++i){
|
||||
char labelpath[4096];
|
||||
find_replace(paths[i], "images", "targets", labelpath);
|
||||
find_replace(labelpath, "JPEGImages", "targets", labelpath);
|
||||
find_replace(labelpath, ".jpg", ".txt", labelpath);
|
||||
find_replace(labelpath, ".png", ".txt", labelpath);
|
||||
|
||||
FILE *file = fopen(labelpath, "r");
|
||||
fscanf(file, "%f", &(y.vals[i][0]));
|
||||
fclose(file);
|
||||
}
|
||||
return y;
|
||||
}
|
||||
|
||||
matrix load_labels_paths(char **paths, int n, char **labels, int k, tree *hierarchy)
|
||||
{
|
||||
matrix y = make_matrix(n, k);
|
||||
@ -673,45 +691,44 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, in
|
||||
d.y = make_matrix(n, 5*boxes);
|
||||
for(i = 0; i < n; ++i){
|
||||
image orig = load_image_color(random_paths[i], 0, 0);
|
||||
image sized = make_image(w, h, orig.c);
|
||||
fill_image(sized, .5);
|
||||
|
||||
float dw = jitter * orig.w;
|
||||
float dh = jitter * orig.h;
|
||||
|
||||
int oh = orig.h;
|
||||
int ow = orig.w;
|
||||
float new_ar = (orig.w + rand_uniform(-dw, dw)) / (orig.h + rand_uniform(-dh, dh));
|
||||
float scale = rand_uniform(.25, 2);
|
||||
|
||||
int dw = (ow*jitter);
|
||||
int dh = (oh*jitter);
|
||||
float nw, nh;
|
||||
|
||||
if(new_ar < 1){
|
||||
nh = scale * h;
|
||||
nw = nh * new_ar;
|
||||
} else {
|
||||
nw = scale * w;
|
||||
nh = nw / new_ar;
|
||||
}
|
||||
|
||||
int pleft = rand_uniform(-dw, dw);
|
||||
int pright = rand_uniform(-dw, dw);
|
||||
int ptop = rand_uniform(-dh, dh);
|
||||
int pbot = rand_uniform(-dh, dh);
|
||||
float dx = rand_uniform(0, w - nw);
|
||||
float dy = rand_uniform(0, h - nh);
|
||||
|
||||
int swidth = ow - pleft - pright;
|
||||
int sheight = oh - ptop - pbot;
|
||||
place_image(orig, nw, nh, dx, dy, sized);
|
||||
|
||||
float sx = (float)swidth / ow;
|
||||
float sy = (float)sheight / oh;
|
||||
|
||||
int flip = rand()%2;
|
||||
image cropped = crop_image(orig, pleft, ptop, swidth, sheight);
|
||||
|
||||
float dx = ((float)pleft/ow)/sx;
|
||||
float dy = ((float)ptop /oh)/sy;
|
||||
|
||||
image sized = resize_image(cropped, w, h);
|
||||
if(flip) flip_image(sized);
|
||||
random_distort_image(sized, hue, saturation, exposure);
|
||||
int flip = rand()%2;
|
||||
if(flip) flip_image(sized);
|
||||
d.X.vals[i] = sized.data;
|
||||
|
||||
fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, dx, dy, 1./sx, 1./sy);
|
||||
|
||||
fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, -dx/w, -dy/h, nw/w, nh/h);
|
||||
|
||||
free_image(orig);
|
||||
free_image(cropped);
|
||||
}
|
||||
free(random_paths);
|
||||
return d;
|
||||
}
|
||||
|
||||
|
||||
void *load_thread(void *ptr)
|
||||
{
|
||||
//printf("Loading data: %d\n", rand());
|
||||
@ -722,6 +739,8 @@ void *load_thread(void *ptr)
|
||||
|
||||
if (a.type == OLD_CLASSIFICATION_DATA){
|
||||
*a.d = load_data_old(a.paths, a.n, a.m, a.labels, a.classes, a.w, a.h);
|
||||
} else if (a.type == REGRESSION_DATA){
|
||||
*a.d = load_data_regression(a.paths, a.n, a.m, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
|
||||
} else if (a.type == CLASSIFICATION_DATA){
|
||||
*a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
|
||||
} else if (a.type == SUPER_DATA){
|
||||
@ -739,6 +758,9 @@ void *load_thread(void *ptr)
|
||||
} else if (a.type == IMAGE_DATA){
|
||||
*(a.im) = load_image_color(a.path, 0, 0);
|
||||
*(a.resized) = resize_image(*(a.im), a.w, a.h);
|
||||
} else if (a.type == LETTERBOX_DATA){
|
||||
*(a.im) = load_image_color(a.path, 0, 0);
|
||||
*(a.resized) = letterbox_image(*(a.im), a.w, a.h);
|
||||
} else if (a.type == TAG_DATA){
|
||||
*a.d = load_data_tag(a.paths, a.n, a.m, a.classes, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
|
||||
}
|
||||
@ -863,6 +885,17 @@ data load_data_super(char **paths, int n, int m, int w, int h, int scale)
|
||||
return d;
|
||||
}
|
||||
|
||||
data load_data_regression(char **paths, int n, int m, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
|
||||
{
|
||||
if(m) paths = get_random_paths(paths, n, m);
|
||||
data d = {0};
|
||||
d.shallow = 0;
|
||||
d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure);
|
||||
d.y = load_regression_labels_paths(paths, n);
|
||||
if(m) free(paths);
|
||||
return d;
|
||||
}
|
||||
|
||||
data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
|
||||
{
|
||||
if(m) paths = get_random_paths(paths, n, m);
|
||||
@ -962,7 +995,6 @@ data load_cifar10_data(char *filename)
|
||||
X.vals[i][j] = (double)bytes[j+1];
|
||||
}
|
||||
}
|
||||
//translate_data_rows(d, -128);
|
||||
scale_data_rows(d, 1./255);
|
||||
//normalize_data_rows(d);
|
||||
fclose(fp);
|
||||
@ -1029,7 +1061,6 @@ data load_all_cifar10()
|
||||
fclose(fp);
|
||||
}
|
||||
//normalize_data_rows(d);
|
||||
//translate_data_rows(d, -128);
|
||||
scale_data_rows(d, 1./255);
|
||||
smooth_data(d);
|
||||
return d;
|
||||
@ -1113,6 +1144,19 @@ void translate_data_rows(data d, float s)
|
||||
}
|
||||
}
|
||||
|
||||
data copy_data(data d)
|
||||
{
|
||||
data c = {0};
|
||||
c.w = d.w;
|
||||
c.h = d.h;
|
||||
c.shallow = 0;
|
||||
c.num_boxes = d.num_boxes;
|
||||
c.boxes = d.boxes;
|
||||
c.X = copy_matrix(d.X);
|
||||
c.y = copy_matrix(d.y);
|
||||
return c;
|
||||
}
|
||||
|
||||
void normalize_data_rows(data d)
|
||||
{
|
||||
int i;
|
||||
|
@ -28,7 +28,7 @@ typedef struct{
|
||||
} data;
|
||||
|
||||
typedef enum {
|
||||
CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA
|
||||
CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA, LETTERBOX_DATA, REGRESSION_DATA
|
||||
} data_type;
|
||||
|
||||
typedef struct load_args{
|
||||
@ -83,6 +83,7 @@ data load_data_tag(char **paths, int n, int m, int k, int min, int max, int size
|
||||
matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
|
||||
data load_data_super(char **paths, int n, int m, int w, int h, int scale);
|
||||
data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
|
||||
data load_data_regression(char **paths, int n, int m, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
|
||||
data load_go(char *filename);
|
||||
|
||||
box_label *read_boxes(char *filename, int *n);
|
||||
@ -106,5 +107,6 @@ data *split_data(data d, int part, int total);
|
||||
data concat_data(data d1, data d2);
|
||||
data concat_datas(data *d, int n);
|
||||
void fill_truth(char *path, char **labels, int k, float *truth);
|
||||
data copy_data(data d);
|
||||
|
||||
#endif
|
||||
|
@ -5,6 +5,7 @@
|
||||
extern "C" {
|
||||
#include "convolutional_layer.h"
|
||||
#include "deconvolutional_layer.h"
|
||||
#include "batchnorm_layer.h"
|
||||
#include "gemm.h"
|
||||
#include "blas.h"
|
||||
#include "im2col.h"
|
||||
@ -13,97 +14,119 @@ extern "C" {
|
||||
#include "cuda.h"
|
||||
}
|
||||
|
||||
extern "C" void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state)
|
||||
extern "C" void forward_deconvolutional_layer_gpu(layer l, network_state state)
|
||||
{
|
||||
int i;
|
||||
int out_h = deconvolutional_out_height(layer);
|
||||
int out_w = deconvolutional_out_width(layer);
|
||||
int out_h = l.out_h;
|
||||
int out_w = l.out_w;
|
||||
int size = out_h*out_w;
|
||||
|
||||
int m = layer.size*layer.size*layer.n;
|
||||
int n = layer.h*layer.w;
|
||||
int k = layer.c;
|
||||
int m = l.size*l.size*l.n;
|
||||
int n = l.h*l.w;
|
||||
int k = l.c;
|
||||
|
||||
fill_ongpu(layer.outputs*layer.batch, 0, layer.output_gpu, 1);
|
||||
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
||||
|
||||
for(i = 0; i < layer.batch; ++i){
|
||||
float *a = layer.weights_gpu;
|
||||
float *b = state.input + i*layer.c*layer.h*layer.w;
|
||||
float *c = layer.col_image_gpu;
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
float *a = l.weights_gpu;
|
||||
float *b = state.input + i*l.c*l.h*l.w;
|
||||
float *c = state.workspace;
|
||||
|
||||
gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
|
||||
|
||||
col2im_ongpu(c, layer.n, out_h, out_w, layer.size, layer.stride, 0, layer.output_gpu+i*layer.n*size);
|
||||
col2im_ongpu(c, l.n, out_h, out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.n*size);
|
||||
}
|
||||
add_bias_gpu(layer.output_gpu, layer.biases_gpu, layer.batch, layer.n, size);
|
||||
activate_array(layer.output_gpu, layer.batch*layer.n*size, layer.activation);
|
||||
if (l.batch_normalize) {
|
||||
forward_batchnorm_layer_gpu(l, state);
|
||||
} else {
|
||||
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
}
|
||||
activate_array_ongpu(l.output_gpu, l.batch*l.n*size, l.activation);
|
||||
}
|
||||
|
||||
extern "C" void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state)
|
||||
extern "C" void backward_deconvolutional_layer_gpu(layer l, network_state state)
|
||||
{
|
||||
float alpha = 1./layer.batch;
|
||||
int out_h = deconvolutional_out_height(layer);
|
||||
int out_w = deconvolutional_out_width(layer);
|
||||
int out_h = l.out_h;
|
||||
int out_w = l.out_w;
|
||||
int size = out_h*out_w;
|
||||
int i;
|
||||
|
||||
gradient_array(layer.output_gpu, size*layer.n*layer.batch, layer.activation, layer.delta_gpu);
|
||||
backward_bias(layer.bias_updates_gpu, layer.delta, layer.batch, layer.n, size);
|
||||
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||
|
||||
if(state.delta) memset(state.delta, 0, layer.batch*layer.h*layer.w*layer.c*sizeof(float));
|
||||
if(l.batch_normalize){
|
||||
backward_batchnorm_layer_gpu(l, state);
|
||||
} else {
|
||||
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
|
||||
}
|
||||
|
||||
for(i = 0; i < layer.batch; ++i){
|
||||
int m = layer.c;
|
||||
int n = layer.size*layer.size*layer.n;
|
||||
int k = layer.h*layer.w;
|
||||
//if(state.delta) memset(state.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float));
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
int m = l.c;
|
||||
int n = l.size*l.size*l.n;
|
||||
int k = l.h*l.w;
|
||||
|
||||
float *a = state.input + i*m*n;
|
||||
float *b = layer.col_image_gpu;
|
||||
float *c = layer.weight_updates_gpu;
|
||||
float *b = state.workspace;
|
||||
float *c = l.weight_updates_gpu;
|
||||
|
||||
im2col_ongpu(layer.delta_gpu + i*layer.n*size, layer.n, out_h, out_w,
|
||||
layer.size, layer.stride, 0, b);
|
||||
gemm_ongpu(0,1,m,n,k,alpha,a,k,b,k,1,c,n);
|
||||
im2col_ongpu(l.delta_gpu + i*l.n*size, l.n, out_h, out_w,
|
||||
l.size, l.stride, l.pad, b);
|
||||
gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||
|
||||
if(state.delta){
|
||||
int m = layer.c;
|
||||
int n = layer.h*layer.w;
|
||||
int k = layer.size*layer.size*layer.n;
|
||||
int m = l.c;
|
||||
int n = l.h*l.w;
|
||||
int k = l.size*l.size*l.n;
|
||||
|
||||
float *a = layer.weights_gpu;
|
||||
float *b = layer.col_image_gpu;
|
||||
float *a = l.weights_gpu;
|
||||
float *b = state.workspace;
|
||||
float *c = state.delta + i*n*m;
|
||||
|
||||
gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void pull_deconvolutional_layer(deconvolutional_layer layer)
|
||||
extern "C" void pull_deconvolutional_layer(layer l)
|
||||
{
|
||||
cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
|
||||
cuda_pull_array(layer.biases_gpu, layer.biases, layer.n);
|
||||
cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
|
||||
cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
|
||||
cuda_pull_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size);
|
||||
cuda_pull_array(l.biases_gpu, l.biases, l.n);
|
||||
cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size);
|
||||
cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n);
|
||||
if (l.batch_normalize){
|
||||
cuda_pull_array(l.scales_gpu, l.scales, l.n);
|
||||
cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
|
||||
cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void push_deconvolutional_layer(deconvolutional_layer layer)
|
||||
extern "C" void push_deconvolutional_layer(layer l)
|
||||
{
|
||||
cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
|
||||
cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
|
||||
cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
|
||||
cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
|
||||
cuda_push_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size);
|
||||
cuda_push_array(l.biases_gpu, l.biases, l.n);
|
||||
cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size);
|
||||
cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
|
||||
if (l.batch_normalize){
|
||||
cuda_push_array(l.scales_gpu, l.scales, l.n);
|
||||
cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
|
||||
cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" void update_deconvolutional_layer_gpu(deconvolutional_layer layer, float learning_rate, float momentum, float decay)
|
||||
void update_deconvolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
|
||||
{
|
||||
int size = layer.size*layer.size*layer.c*layer.n;
|
||||
int size = l.size*l.size*l.c*l.n;
|
||||
axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
|
||||
scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1);
|
||||
|
||||
axpy_ongpu(layer.n, learning_rate, layer.bias_updates_gpu, 1, layer.biases_gpu, 1);
|
||||
scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1);
|
||||
if(l.scales_gpu){
|
||||
axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1);
|
||||
scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1);
|
||||
}
|
||||
|
||||
axpy_ongpu(size, -decay, layer.weights_gpu, 1, layer.weight_updates_gpu, 1);
|
||||
axpy_ongpu(size, learning_rate, layer.weight_updates_gpu, 1, layer.weights_gpu, 1);
|
||||
scal_ongpu(size, momentum, layer.weight_updates_gpu, 1);
|
||||
axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
|
||||
axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
|
||||
scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
#include "deconvolutional_layer.h"
|
||||
#include "convolutional_layer.h"
|
||||
#include "batchnorm_layer.h"
|
||||
#include "utils.h"
|
||||
#include "im2col.h"
|
||||
#include "col2im.h"
|
||||
@ -8,45 +9,25 @@
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
|
||||
int deconvolutional_out_height(deconvolutional_layer l)
|
||||
{
|
||||
int h = l.stride*(l.h - 1) + l.size;
|
||||
return h;
|
||||
|
||||
static size_t get_workspace_size(layer l){
|
||||
return (size_t)l.h*l.w*l.size*l.size*l.c*sizeof(float);
|
||||
}
|
||||
|
||||
int deconvolutional_out_width(deconvolutional_layer l)
|
||||
int deconvolutional_out_height(layer l)
|
||||
{
|
||||
int w = l.stride*(l.w - 1) + l.size;
|
||||
return w;
|
||||
return (l.h) * l.stride + l.size/2 - l.pad;
|
||||
}
|
||||
|
||||
int deconvolutional_out_size(deconvolutional_layer l)
|
||||
int deconvolutional_out_width(layer l)
|
||||
{
|
||||
return deconvolutional_out_height(l) * deconvolutional_out_width(l);
|
||||
return (l.w) * l.stride + l.size/2 - l.pad;
|
||||
}
|
||||
|
||||
image get_deconvolutional_image(deconvolutional_layer l)
|
||||
{
|
||||
int h,w,c;
|
||||
h = deconvolutional_out_height(l);
|
||||
w = deconvolutional_out_width(l);
|
||||
c = l.n;
|
||||
return float_to_image(w,h,c,l.output);
|
||||
}
|
||||
|
||||
image get_deconvolutional_delta(deconvolutional_layer l)
|
||||
{
|
||||
int h,w,c;
|
||||
h = deconvolutional_out_height(l);
|
||||
w = deconvolutional_out_width(l);
|
||||
c = l.n;
|
||||
return float_to_image(w,h,c,l.delta);
|
||||
}
|
||||
|
||||
deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation)
|
||||
layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation, int batch_normalize)
|
||||
{
|
||||
int i;
|
||||
deconvolutional_layer l = {0};
|
||||
layer l = {0};
|
||||
l.type = DECONVOLUTIONAL;
|
||||
|
||||
l.h = h;
|
||||
@ -67,71 +48,135 @@ deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c,
|
||||
for(i = 0; i < n; ++i){
|
||||
l.biases[i] = scale;
|
||||
}
|
||||
int out_h = deconvolutional_out_height(l);
|
||||
int out_w = deconvolutional_out_width(l);
|
||||
l.pad = l.size/2;
|
||||
|
||||
l.out_h = out_h;
|
||||
l.out_w = out_w;
|
||||
l.out_h = (l.h) * l.stride + l.size/2 - l.pad;
|
||||
l.out_w = (l.w) * l.stride + l.size/2 - l.pad;
|
||||
l.out_c = n;
|
||||
l.outputs = l.out_w * l.out_h * l.out_c;
|
||||
l.inputs = l.w * l.h * l.c;
|
||||
|
||||
l.col_image = calloc(h*w*size*size*n, sizeof(float));
|
||||
l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
|
||||
l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float));
|
||||
l.output = calloc(l.batch*l.out_h * l.out_w * n, sizeof(float));
|
||||
l.delta = calloc(l.batch*l.out_h * l.out_w * n, sizeof(float));
|
||||
|
||||
l.forward = forward_deconvolutional_layer;
|
||||
l.backward = backward_deconvolutional_layer;
|
||||
l.update = update_deconvolutional_layer;
|
||||
|
||||
#ifdef GPU
|
||||
l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
|
||||
l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
|
||||
l.batch_normalize = batch_normalize;
|
||||
|
||||
l.biases_gpu = cuda_make_array(l.biases, n);
|
||||
l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
|
||||
if(batch_normalize){
|
||||
l.scales = calloc(n, sizeof(float));
|
||||
l.scale_updates = calloc(n, sizeof(float));
|
||||
for(i = 0; i < n; ++i){
|
||||
l.scales[i] = 1;
|
||||
}
|
||||
|
||||
l.col_image_gpu = cuda_make_array(l.col_image, h*w*size*size*n);
|
||||
l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
|
||||
l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
|
||||
l.mean = calloc(n, sizeof(float));
|
||||
l.variance = calloc(n, sizeof(float));
|
||||
|
||||
l.mean_delta = calloc(n, sizeof(float));
|
||||
l.variance_delta = calloc(n, sizeof(float));
|
||||
|
||||
l.rolling_mean = calloc(n, sizeof(float));
|
||||
l.rolling_variance = calloc(n, sizeof(float));
|
||||
l.x = calloc(l.batch*l.outputs, sizeof(float));
|
||||
l.x_norm = calloc(l.batch*l.outputs, sizeof(float));
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
l.forward_gpu = forward_deconvolutional_layer_gpu;
|
||||
l.backward_gpu = backward_deconvolutional_layer_gpu;
|
||||
l.update_gpu = update_deconvolutional_layer_gpu;
|
||||
|
||||
if(gpu_index >= 0){
|
||||
|
||||
l.weights_gpu = cuda_make_array(l.weights, c*n*size*size);
|
||||
l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size);
|
||||
|
||||
l.biases_gpu = cuda_make_array(l.biases, n);
|
||||
l.bias_updates_gpu = cuda_make_array(l.bias_updates, n);
|
||||
|
||||
l.delta_gpu = cuda_make_array(l.delta, l.batch*l.out_h*l.out_w*n);
|
||||
l.output_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n);
|
||||
|
||||
if(batch_normalize){
|
||||
l.mean_gpu = cuda_make_array(l.mean, n);
|
||||
l.variance_gpu = cuda_make_array(l.variance, n);
|
||||
|
||||
l.rolling_mean_gpu = cuda_make_array(l.mean, n);
|
||||
l.rolling_variance_gpu = cuda_make_array(l.variance, n);
|
||||
|
||||
l.mean_delta_gpu = cuda_make_array(l.mean, n);
|
||||
l.variance_delta_gpu = cuda_make_array(l.variance, n);
|
||||
|
||||
l.scales_gpu = cuda_make_array(l.scales, n);
|
||||
l.scale_updates_gpu = cuda_make_array(l.scale_updates, n);
|
||||
|
||||
l.x_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n);
|
||||
l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n);
|
||||
}
|
||||
}
|
||||
#ifdef CUDNN
|
||||
cudnnCreateTensorDescriptor(&l.dstTensorDesc);
|
||||
cudnnCreateTensorDescriptor(&l.normTensorDesc);
|
||||
cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w);
|
||||
cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
l.activation = activation;
|
||||
l.workspace_size = get_workspace_size(l);
|
||||
|
||||
fprintf(stderr, "Deconvolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n);
|
||||
fprintf(stderr, "deconv%5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c);
|
||||
|
||||
return l;
|
||||
}
|
||||
|
||||
void resize_deconvolutional_layer(deconvolutional_layer *l, int h, int w)
|
||||
void resize_deconvolutional_layer(layer *l, int h, int w)
|
||||
{
|
||||
l->h = h;
|
||||
l->w = w;
|
||||
int out_h = deconvolutional_out_height(*l);
|
||||
int out_w = deconvolutional_out_width(*l);
|
||||
l->out_h = (l->h) * l->stride + l->size/2 - l->pad;
|
||||
l->out_w = (l->w) * l->stride + l->size/2 - l->pad;
|
||||
|
||||
l->col_image = realloc(l->col_image,
|
||||
out_h*out_w*l->size*l->size*l->c*sizeof(float));
|
||||
l->output = realloc(l->output,
|
||||
l->batch*out_h * out_w * l->n*sizeof(float));
|
||||
l->delta = realloc(l->delta,
|
||||
l->batch*out_h * out_w * l->n*sizeof(float));
|
||||
#ifdef GPU
|
||||
cuda_free(l->col_image_gpu);
|
||||
l->outputs = l->out_h * l->out_w * l->out_c;
|
||||
l->inputs = l->w * l->h * l->c;
|
||||
|
||||
l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
|
||||
l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float));
|
||||
if(l->batch_normalize){
|
||||
l->x = realloc(l->x, l->batch*l->outputs*sizeof(float));
|
||||
l->x_norm = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
cuda_free(l->delta_gpu);
|
||||
cuda_free(l->output_gpu);
|
||||
|
||||
l->col_image_gpu = cuda_make_array(l->col_image, out_h*out_w*l->size*l->size*l->c);
|
||||
l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n);
|
||||
l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n);
|
||||
l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
|
||||
l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
|
||||
if(l->batch_normalize){
|
||||
cuda_free(l->x_gpu);
|
||||
cuda_free(l->x_norm_gpu);
|
||||
|
||||
l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
|
||||
}
|
||||
#ifdef CUDNN
|
||||
cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
|
||||
cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
|
||||
#endif
|
||||
#endif
|
||||
l->workspace_size = get_workspace_size(*l);
|
||||
}
|
||||
|
||||
void forward_deconvolutional_layer(const deconvolutional_layer l, network_state state)
|
||||
void forward_deconvolutional_layer(const layer l, network_state state)
|
||||
{
|
||||
int i;
|
||||
int out_h = deconvolutional_out_height(l);
|
||||
int out_w = deconvolutional_out_width(l);
|
||||
int out_h = l.out_h;
|
||||
int out_w = l.out_w;
|
||||
int size = out_h*out_w;
|
||||
|
||||
int m = l.size*l.size*l.n;
|
||||
@ -143,17 +188,22 @@ void forward_deconvolutional_layer(const deconvolutional_layer l, network_state
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
float *a = l.weights;
|
||||
float *b = state.input + i*l.c*l.h*l.w;
|
||||
float *c = l.col_image;
|
||||
float *c = state.workspace;
|
||||
|
||||
gemm(1,0,m,n,k,1,a,m,b,n,0,c,n);
|
||||
|
||||
col2im_cpu(c, l.n, out_h, out_w, l.size, l.stride, 0, l.output+i*l.n*size);
|
||||
}
|
||||
add_bias(l.output, l.biases, l.batch, l.n, size);
|
||||
|
||||
if(l.batch_normalize){
|
||||
forward_batchnorm_layer(l, state);
|
||||
} else {
|
||||
add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
|
||||
}
|
||||
activate_array(l.output, l.batch*l.n*size, l.activation);
|
||||
}
|
||||
|
||||
void backward_deconvolutional_layer(deconvolutional_layer l, network_state state)
|
||||
void backward_deconvolutional_layer(layer l, network_state state)
|
||||
{
|
||||
float alpha = 1./l.batch;
|
||||
int out_h = deconvolutional_out_height(l);
|
||||
@ -162,7 +212,11 @@ void backward_deconvolutional_layer(deconvolutional_layer l, network_state state
|
||||
int i;
|
||||
|
||||
gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta);
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.n, size);
|
||||
if(l.batch_normalize){
|
||||
backward_batchnorm_layer(l, state);
|
||||
} else {
|
||||
backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h);
|
||||
}
|
||||
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
int m = l.c;
|
||||
@ -170,7 +224,7 @@ void backward_deconvolutional_layer(deconvolutional_layer l, network_state state
|
||||
int k = l.h*l.w;
|
||||
|
||||
float *a = state.input + i*m*n;
|
||||
float *b = l.col_image;
|
||||
float *b = state.workspace;
|
||||
float *c = l.weight_updates;
|
||||
|
||||
im2col_cpu(l.delta + i*l.n*size, l.n, out_h, out_w,
|
||||
@ -183,7 +237,7 @@ void backward_deconvolutional_layer(deconvolutional_layer l, network_state state
|
||||
int k = l.size*l.size*l.n;
|
||||
|
||||
float *a = l.weights;
|
||||
float *b = l.col_image;
|
||||
float *b = state.workspace;
|
||||
float *c = state.delta + i*n*m;
|
||||
|
||||
gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
|
||||
@ -191,14 +245,19 @@ void backward_deconvolutional_layer(deconvolutional_layer l, network_state state
|
||||
}
|
||||
}
|
||||
|
||||
void update_deconvolutional_layer(deconvolutional_layer l, float learning_rate, float momentum, float decay)
|
||||
void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay)
|
||||
{
|
||||
int size = l.size*l.size*l.c*l.n;
|
||||
axpy_cpu(l.n, learning_rate, l.bias_updates, 1, l.biases, 1);
|
||||
axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1);
|
||||
scal_cpu(l.n, momentum, l.bias_updates, 1);
|
||||
|
||||
axpy_cpu(size, -decay, l.weights, 1, l.weight_updates, 1);
|
||||
axpy_cpu(size, learning_rate, l.weight_updates, 1, l.weights, 1);
|
||||
if(l.scales){
|
||||
axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1);
|
||||
scal_cpu(l.n, momentum, l.scale_updates, 1);
|
||||
}
|
||||
|
||||
axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1);
|
||||
axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1);
|
||||
scal_cpu(size, momentum, l.weight_updates, 1);
|
||||
}
|
||||
|
||||
|
@ -7,28 +7,19 @@
|
||||
#include "layer.h"
|
||||
#include "network.h"
|
||||
|
||||
typedef layer deconvolutional_layer;
|
||||
|
||||
#ifdef GPU
|
||||
void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state);
|
||||
void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state);
|
||||
void update_deconvolutional_layer_gpu(deconvolutional_layer layer, float learning_rate, float momentum, float decay);
|
||||
void push_deconvolutional_layer(deconvolutional_layer layer);
|
||||
void pull_deconvolutional_layer(deconvolutional_layer layer);
|
||||
void forward_deconvolutional_layer_gpu(layer l, network_state state);
|
||||
void backward_deconvolutional_layer_gpu(layer l, network_state state);
|
||||
void update_deconvolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay);
|
||||
void push_deconvolutional_layer(layer l);
|
||||
void pull_deconvolutional_layer(layer l);
|
||||
#endif
|
||||
|
||||
deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation);
|
||||
void resize_deconvolutional_layer(deconvolutional_layer *layer, int h, int w);
|
||||
void forward_deconvolutional_layer(const deconvolutional_layer layer, network_state state);
|
||||
void update_deconvolutional_layer(deconvolutional_layer layer, float learning_rate, float momentum, float decay);
|
||||
void backward_deconvolutional_layer(deconvolutional_layer layer, network_state state);
|
||||
|
||||
image get_deconvolutional_image(deconvolutional_layer layer);
|
||||
image get_deconvolutional_delta(deconvolutional_layer layer);
|
||||
image get_deconvolutional_filter(deconvolutional_layer layer, int i);
|
||||
|
||||
int deconvolutional_out_height(deconvolutional_layer layer);
|
||||
int deconvolutional_out_width(deconvolutional_layer layer);
|
||||
layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation, int batch_normalize);
|
||||
void resize_deconvolutional_layer(layer *l, int h, int w);
|
||||
void forward_deconvolutional_layer(const layer l, network_state state);
|
||||
void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay);
|
||||
void backward_deconvolutional_layer(layer l, network_state state);
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -58,7 +58,7 @@ void forward_detection_layer(const detection_layer l, network_state state)
|
||||
int index = b*l.inputs;
|
||||
for (i = 0; i < locations; ++i) {
|
||||
int offset = i*l.classes;
|
||||
softmax(l.output + index + offset, l.classes, 1,
|
||||
softmax(l.output + index + offset, l.classes, 1, 1,
|
||||
l.output + index + offset);
|
||||
}
|
||||
}
|
||||
@ -101,13 +101,13 @@ void forward_detection_layer(const detection_layer l, network_state state)
|
||||
avg_allcat += l.output[class_index+j];
|
||||
}
|
||||
|
||||
box truth = float_to_box(state.truth + truth_index + 1 + l.classes);
|
||||
box truth = float_to_box(state.truth + truth_index + 1 + l.classes, 1);
|
||||
truth.x /= l.side;
|
||||
truth.y /= l.side;
|
||||
|
||||
for(j = 0; j < l.n; ++j){
|
||||
int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
|
||||
box out = float_to_box(l.output + box_index);
|
||||
box out = float_to_box(l.output + box_index, 1);
|
||||
out.x /= l.side;
|
||||
out.y /= l.side;
|
||||
|
||||
@ -146,7 +146,7 @@ void forward_detection_layer(const detection_layer l, network_state state)
|
||||
int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
|
||||
int tbox_index = truth_index + 1 + l.classes;
|
||||
|
||||
box out = float_to_box(l.output + box_index);
|
||||
box out = float_to_box(l.output + box_index, 1);
|
||||
out.x /= l.side;
|
||||
out.y /= l.side;
|
||||
if (l.sqrt) {
|
||||
|
202
src/detector.c
202
src/detector.c
@ -6,6 +6,7 @@
|
||||
#include "box.h"
|
||||
#include "demo.h"
|
||||
#include "option_list.h"
|
||||
#include "blas.h"
|
||||
|
||||
#ifdef OPENCV
|
||||
#include "opencv2/highgui/highgui_c.h"
|
||||
@ -103,21 +104,28 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
||||
load_thread = load_data(args);
|
||||
|
||||
/*
|
||||
int k;
|
||||
for(k = 0; k < l.max_boxes; ++k){
|
||||
box b = float_to_box(train.y.vals[10] + 1 + k*5);
|
||||
if(!b.x) break;
|
||||
printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h);
|
||||
}
|
||||
image im = float_to_image(448, 448, 3, train.X.vals[10]);
|
||||
int k;
|
||||
for(k = 0; k < l.max_boxes; ++k){
|
||||
box b = float_to_box(train.y.vals[10] + 1 + k*5);
|
||||
printf("%d %d %d %d\n", truth.x, truth.y, truth.w, truth.h);
|
||||
draw_bbox(im, b, 8, 1,0,0);
|
||||
}
|
||||
save_image(im, "truth11");
|
||||
*/
|
||||
int k;
|
||||
for(k = 0; k < l.max_boxes; ++k){
|
||||
box b = float_to_box(train.y.vals[10] + 1 + k*5);
|
||||
if(!b.x) break;
|
||||
printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h);
|
||||
}
|
||||
*/
|
||||
/*
|
||||
int zz;
|
||||
for(zz = 0; zz < train.X.cols; ++zz){
|
||||
image im = float_to_image(net.w, net.h, 3, train.X.vals[zz]);
|
||||
int k;
|
||||
for(k = 0; k < l.max_boxes; ++k){
|
||||
box b = float_to_box(train.y.vals[zz] + k*5);
|
||||
printf("%f %f %f %f\n", b.x, b.y, b.w, b.h);
|
||||
draw_bbox(im, b, 1, 1,0,0);
|
||||
}
|
||||
show_image(im, "truth11");
|
||||
cvWaitKey(0);
|
||||
save_image(im, "truth11");
|
||||
}
|
||||
*/
|
||||
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
|
||||
@ -192,13 +200,13 @@ void print_detector_detections(FILE **fps, char *id, box *boxes, float **probs,
|
||||
{
|
||||
int i, j;
|
||||
for(i = 0; i < total; ++i){
|
||||
float xmin = boxes[i].x - boxes[i].w/2.;
|
||||
float xmax = boxes[i].x + boxes[i].w/2.;
|
||||
float ymin = boxes[i].y - boxes[i].h/2.;
|
||||
float ymax = boxes[i].y + boxes[i].h/2.;
|
||||
float xmin = boxes[i].x - boxes[i].w/2. + 1;
|
||||
float xmax = boxes[i].x + boxes[i].w/2. + 1;
|
||||
float ymin = boxes[i].y - boxes[i].h/2. + 1;
|
||||
float ymax = boxes[i].y + boxes[i].h/2. + 1;
|
||||
|
||||
if (xmin < 0) xmin = 0;
|
||||
if (ymin < 0) ymin = 0;
|
||||
if (xmin < 1) xmin = 1;
|
||||
if (ymin < 1) ymin = 1;
|
||||
if (xmax > w) xmax = w;
|
||||
if (ymax > h) ymax = h;
|
||||
|
||||
@ -231,6 +239,142 @@ void print_imagenet_detections(FILE *fp, int id, box *boxes, float **probs, int
|
||||
}
|
||||
}
|
||||
|
||||
void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
|
||||
{
|
||||
int j;
|
||||
list *options = read_data_cfg(datacfg);
|
||||
char *valid_images = option_find_str(options, "valid", "data/train.list");
|
||||
char *name_list = option_find_str(options, "names", "data/names.list");
|
||||
char *prefix = option_find_str(options, "results", "results");
|
||||
char **names = get_labels(name_list);
|
||||
char *mapf = option_find_str(options, "map", 0);
|
||||
int *map = 0;
|
||||
if (mapf) map = read_map(mapf);
|
||||
|
||||
network net = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
load_weights(&net, weightfile);
|
||||
}
|
||||
set_batch_network(&net, 2);
|
||||
fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
||||
srand(time(0));
|
||||
|
||||
list *plist = get_paths(valid_images);
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
|
||||
layer l = net.layers[net.n-1];
|
||||
int classes = l.classes;
|
||||
|
||||
char buff[1024];
|
||||
char *type = option_find_str(options, "eval", "voc");
|
||||
FILE *fp = 0;
|
||||
FILE **fps = 0;
|
||||
int coco = 0;
|
||||
int imagenet = 0;
|
||||
if(0==strcmp(type, "coco")){
|
||||
if(!outfile) outfile = "coco_results";
|
||||
snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
|
||||
fp = fopen(buff, "w");
|
||||
fprintf(fp, "[\n");
|
||||
coco = 1;
|
||||
} else if(0==strcmp(type, "imagenet")){
|
||||
if(!outfile) outfile = "imagenet-detection";
|
||||
snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
|
||||
fp = fopen(buff, "w");
|
||||
imagenet = 1;
|
||||
classes = 200;
|
||||
} else {
|
||||
if(!outfile) outfile = "comp4_det_test_";
|
||||
fps = calloc(classes, sizeof(FILE *));
|
||||
for(j = 0; j < classes; ++j){
|
||||
snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
|
||||
fps[j] = fopen(buff, "w");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
|
||||
float **probs = calloc(l.w*l.h*l.n, sizeof(float *));
|
||||
for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(classes, sizeof(float *));
|
||||
|
||||
int m = plist->size;
|
||||
int i=0;
|
||||
int t;
|
||||
|
||||
float thresh = .005;
|
||||
float nms = .45;
|
||||
|
||||
int nthreads = 4;
|
||||
image *val = calloc(nthreads, sizeof(image));
|
||||
image *val_resized = calloc(nthreads, sizeof(image));
|
||||
image *buf = calloc(nthreads, sizeof(image));
|
||||
image *buf_resized = calloc(nthreads, sizeof(image));
|
||||
pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
|
||||
|
||||
image input = make_image(net.w, net.h, net.c*2);
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net.w;
|
||||
args.h = net.h;
|
||||
//args.type = IMAGE_DATA;
|
||||
args.type = LETTERBOX_DATA;
|
||||
|
||||
for(t = 0; t < nthreads; ++t){
|
||||
args.path = paths[i+t];
|
||||
args.im = &buf[t];
|
||||
args.resized = &buf_resized[t];
|
||||
thr[t] = load_data_in_thread(args);
|
||||
}
|
||||
time_t start = time(0);
|
||||
for(i = nthreads; i < m+nthreads; i += nthreads){
|
||||
fprintf(stderr, "%d\n", i);
|
||||
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
|
||||
pthread_join(thr[t], 0);
|
||||
val[t] = buf[t];
|
||||
val_resized[t] = buf_resized[t];
|
||||
}
|
||||
for(t = 0; t < nthreads && i+t < m; ++t){
|
||||
args.path = paths[i+t];
|
||||
args.im = &buf[t];
|
||||
args.resized = &buf_resized[t];
|
||||
thr[t] = load_data_in_thread(args);
|
||||
}
|
||||
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
|
||||
char *path = paths[i+t-nthreads];
|
||||
char *id = basecfg(path);
|
||||
copy_cpu(net.w*net.h*net.c, val_resized[t].data, 1, input.data, 1);
|
||||
flip_image(val_resized[t]);
|
||||
copy_cpu(net.w*net.h*net.c, val_resized[t].data, 1, input.data + net.w*net.h*net.c, 1);
|
||||
|
||||
network_predict(net, input.data);
|
||||
int w = val[t].w;
|
||||
int h = val[t].h;
|
||||
get_region_boxes(l, w, h, thresh, probs, boxes, 0, map, .5);
|
||||
if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, classes, nms);
|
||||
if (coco){
|
||||
print_cocos(fp, path, boxes, probs, l.w*l.h*l.n, classes, w, h);
|
||||
} else if (imagenet){
|
||||
print_imagenet_detections(fp, i+t-nthreads+1, boxes, probs, l.w*l.h*l.n, classes, w, h);
|
||||
} else {
|
||||
print_detector_detections(fps, id, boxes, probs, l.w*l.h*l.n, classes, w, h);
|
||||
}
|
||||
free(id);
|
||||
free_image(val[t]);
|
||||
free_image(val_resized[t]);
|
||||
}
|
||||
}
|
||||
for(j = 0; j < classes; ++j){
|
||||
if(fps) fclose(fps[j]);
|
||||
}
|
||||
if(coco){
|
||||
fseek(fp, -2, SEEK_CUR);
|
||||
fprintf(fp, "\n]\n");
|
||||
fclose(fp);
|
||||
}
|
||||
fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start));
|
||||
}
|
||||
|
||||
|
||||
void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
|
||||
{
|
||||
int j;
|
||||
@ -306,7 +450,8 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out
|
||||
load_args args = {0};
|
||||
args.w = net.w;
|
||||
args.h = net.h;
|
||||
args.type = IMAGE_DATA;
|
||||
//args.type = IMAGE_DATA;
|
||||
args.type = LETTERBOX_DATA;
|
||||
|
||||
for(t = 0; t < nthreads; ++t){
|
||||
args.path = paths[i+t];
|
||||
@ -467,7 +612,11 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam
|
||||
strtok(input, "\n");
|
||||
}
|
||||
image im = load_image_color(input,0,0);
|
||||
image sized = resize_image(im, net.w, net.h);
|
||||
image sized = letterbox_image(im, net.w, net.h);
|
||||
//image sized = resize_image(im, net.w, net.h);
|
||||
//image sized2 = resize_max(im, net.w);
|
||||
//image sized = crop_image(sized2, -((net.w - sized2.w)/2), -((net.h - sized2.h)/2), net.w, net.h);
|
||||
//resize_network(&net, sized.w, sized.h);
|
||||
layer l = net.layers[net.n-1];
|
||||
|
||||
box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
|
||||
@ -481,9 +630,9 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam
|
||||
get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0, hier_thresh);
|
||||
if (l.softmax_tree && nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms);
|
||||
else if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms);
|
||||
draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes);
|
||||
save_image(im, "predictions");
|
||||
show_image(im, "predictions");
|
||||
draw_detections(sized, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes);
|
||||
save_image(sized, "predictions");
|
||||
show_image(sized, "predictions");
|
||||
|
||||
free_image(im);
|
||||
free_image(sized);
|
||||
@ -541,6 +690,7 @@ void run_detector(int argc, char **argv)
|
||||
if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh);
|
||||
else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
|
||||
else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
|
||||
else if(0==strcmp(argv[2], "valid2")) validate_detector_flip(datacfg, cfg, weights, outfile);
|
||||
else if(0==strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights);
|
||||
else if(0==strcmp(argv[2], "demo")) {
|
||||
list *options = read_data_cfg(datacfg);
|
||||
|
326
src/go.c
326
src/go.c
@ -3,6 +3,8 @@
|
||||
#include "parser.h"
|
||||
#include "option_list.h"
|
||||
#include "blas.h"
|
||||
#include "data.h"
|
||||
#include <unistd.h>
|
||||
|
||||
#ifdef OPENCV
|
||||
#include "opencv2/highgui/highgui_c.h"
|
||||
@ -10,7 +12,7 @@
|
||||
|
||||
int inverted = 1;
|
||||
int noi = 1;
|
||||
static const int nind = 5;
|
||||
static const int nind = 2;
|
||||
|
||||
typedef struct {
|
||||
char **data;
|
||||
@ -88,22 +90,30 @@ void board_to_string(char *s, float *board)
|
||||
}
|
||||
}
|
||||
|
||||
void random_go_moves(moves m, float *boards, float *labels, int n)
|
||||
data random_go_moves(moves m, int n)
|
||||
{
|
||||
data d = {0};
|
||||
d.X = make_matrix(n, 19*19);
|
||||
d.y = make_matrix(n, 19*19+1);
|
||||
int i;
|
||||
memset(labels, 0, 19*19*n*sizeof(float));
|
||||
for(i = 0; i < n; ++i){
|
||||
float *board = d.X.vals[i];
|
||||
float *label = d.y.vals[i];
|
||||
char *b = m.data[rand()%m.n];
|
||||
int row = b[0];
|
||||
int col = b[1];
|
||||
labels[col + 19*(row + i*19)] = 1;
|
||||
string_to_board(b+2, boards+i*19*19);
|
||||
boards[col + 19*(row + i*19)] = 0;
|
||||
if(row >= 19 || col >= 19){
|
||||
label[19*19] = 1;
|
||||
} else {
|
||||
label[col + 19*row] = 1;
|
||||
string_to_board(b+2, board);
|
||||
if(board[col + 19*row]) printf("hey\n");
|
||||
}
|
||||
|
||||
int flip = rand()%2;
|
||||
int rotate = rand()%4;
|
||||
image in = float_to_image(19, 19, 1, boards+i*19*19);
|
||||
image out = float_to_image(19, 19, 1, labels+i*19*19);
|
||||
image in = float_to_image(19, 19, 1, board);
|
||||
image out = float_to_image(19, 19, 1, label);
|
||||
if(flip){
|
||||
flip_image(in);
|
||||
flip_image(out);
|
||||
@ -111,36 +121,60 @@ void random_go_moves(moves m, float *boards, float *labels, int n)
|
||||
rotate_image_cw(in, rotate);
|
||||
rotate_image_cw(out, rotate);
|
||||
}
|
||||
return d;
|
||||
}
|
||||
|
||||
|
||||
void train_go(char *cfgfile, char *weightfile)
|
||||
void train_go(char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
|
||||
{
|
||||
srand(time(0));
|
||||
int i;
|
||||
float avg_loss = -1;
|
||||
char *base = basecfg(cfgfile);
|
||||
printf("%s\n", base);
|
||||
network net = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
load_weights(&net, weightfile);
|
||||
printf("%d\n", ngpus);
|
||||
network *nets = calloc(ngpus, sizeof(network));
|
||||
|
||||
srand(time(0));
|
||||
int seed = rand();
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
srand(seed);
|
||||
#ifdef GPU
|
||||
cuda_set_device(gpus[i]);
|
||||
#endif
|
||||
nets[i] = load_network(cfgfile, weightfile, clear);
|
||||
nets[i].learning_rate *= ngpus;
|
||||
}
|
||||
network net = nets[0];
|
||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
||||
|
||||
char *backup_directory = "/home/pjreddie/backup/";
|
||||
|
||||
char buff[256];
|
||||
float *board = calloc(19*19*net.batch, sizeof(float));
|
||||
float *move = calloc(19*19*net.batch, sizeof(float));
|
||||
moves m = load_go_moves("/home/pjreddie/backup/go.train");
|
||||
//moves m = load_go_moves("games.txt");
|
||||
|
||||
int N = m.n;
|
||||
printf("Moves: %d\n", N);
|
||||
int epoch = (*net.seen)/N;
|
||||
while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
|
||||
clock_t time=clock();
|
||||
|
||||
random_go_moves(m, board, move, net.batch);
|
||||
float loss = train_network_datum(net, board, move) / net.batch;
|
||||
data train = random_go_moves(m, net.batch*net.subdivisions*ngpus);
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
time=clock();
|
||||
|
||||
float loss = 0;
|
||||
#ifdef GPU
|
||||
if(ngpus == 1){
|
||||
loss = train_network(net, train);
|
||||
} else {
|
||||
loss = train_networks(nets, ngpus, train, 4);
|
||||
}
|
||||
#else
|
||||
loss = train_network(net, train);
|
||||
#endif
|
||||
free_data(train);
|
||||
|
||||
if(avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss*.95 + loss*.05;
|
||||
printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
|
||||
@ -151,7 +185,7 @@ void train_go(char *cfgfile, char *weightfile)
|
||||
save_weights(net, buff);
|
||||
|
||||
}
|
||||
if(get_current_batch(net)%100 == 0){
|
||||
if(get_current_batch(net)%1000 == 0){
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s.backup",backup_directory,base);
|
||||
save_weights(net, buff);
|
||||
@ -204,12 +238,9 @@ int *calculate_liberties(float *board)
|
||||
return lib;
|
||||
}
|
||||
|
||||
void print_board(float *board, int swap, int *indexes)
|
||||
void print_board(FILE *stream, float *board, int swap, int *indexes)
|
||||
{
|
||||
//FILE *stream = stdout;
|
||||
FILE *stream = stderr;
|
||||
int i,j,n;
|
||||
fprintf(stream, "\n\n");
|
||||
fprintf(stream, " ");
|
||||
for(i = 0; i < 19; ++i){
|
||||
fprintf(stream, "%c ", 'A' + i + 1*(i > 7 && noi));
|
||||
@ -225,12 +256,12 @@ void print_board(float *board, int swap, int *indexes)
|
||||
if(index == indexes[n]){
|
||||
found = 1;
|
||||
/*
|
||||
if(n == 0) fprintf(stream, "\uff11");
|
||||
else if(n == 1) fprintf(stream, "\uff12");
|
||||
else if(n == 2) fprintf(stream, "\uff13");
|
||||
else if(n == 3) fprintf(stream, "\uff14");
|
||||
else if(n == 4) fprintf(stream, "\uff15");
|
||||
*/
|
||||
if(n == 0) fprintf(stream, "\uff11");
|
||||
else if(n == 1) fprintf(stream, "\uff12");
|
||||
else if(n == 2) fprintf(stream, "\uff13");
|
||||
else if(n == 3) fprintf(stream, "\uff14");
|
||||
else if(n == 4) fprintf(stream, "\uff15");
|
||||
*/
|
||||
if(n == 0) fprintf(stream, " 1");
|
||||
else if(n == 1) fprintf(stream, " 2");
|
||||
else if(n == 2) fprintf(stream, " 3");
|
||||
@ -261,7 +292,7 @@ void flip_board(float *board)
|
||||
void predict_move(network net, float *board, float *move, int multi)
|
||||
{
|
||||
float *output = network_predict(net, board);
|
||||
copy_cpu(19*19, output, 1, move, 1);
|
||||
copy_cpu(19*19+1, output, 1, move, 1);
|
||||
int i;
|
||||
if(multi){
|
||||
image bim = float_to_image(19, 19, 1, board);
|
||||
@ -275,12 +306,12 @@ void predict_move(network net, float *board, float *move, int multi)
|
||||
if(i >= 4) flip_image(oim);
|
||||
rotate_image_cw(oim, -i);
|
||||
|
||||
axpy_cpu(19*19, 1, output, 1, move, 1);
|
||||
axpy_cpu(19*19+1, 1, output, 1, move, 1);
|
||||
|
||||
if(i >= 4) flip_image(bim);
|
||||
rotate_image_cw(bim, -i);
|
||||
}
|
||||
scal_cpu(19*19, 1./8., move, 1);
|
||||
scal_cpu(19*19+1, 1./8., move, 1);
|
||||
}
|
||||
for(i = 0; i < 19*19; ++i){
|
||||
if(board[i]) move[i] = 0;
|
||||
@ -350,14 +381,24 @@ int legal_go(float *b, char *ko, int p, int r, int c)
|
||||
int generate_move(network net, int player, float *board, int multi, float thresh, float temp, char *ko, int print)
|
||||
{
|
||||
int i, j;
|
||||
int empty = 1;
|
||||
for(i = 0; i < 19*19; ++i){
|
||||
if (board[i]) {
|
||||
empty = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(empty) {
|
||||
return 72;
|
||||
}
|
||||
for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp;
|
||||
|
||||
float move[361];
|
||||
float move[362];
|
||||
if (player < 0) flip_board(board);
|
||||
predict_move(net, board, move, multi);
|
||||
if (player < 0) flip_board(board);
|
||||
|
||||
|
||||
|
||||
for(i = 0; i < 19; ++i){
|
||||
for(j = 0; j < 19; ++j){
|
||||
if (!legal_go(board, ko, player, i, j)) move[i*19 + j] = 0;
|
||||
@ -365,40 +406,43 @@ int generate_move(network net, int player, float *board, int multi, float thresh
|
||||
}
|
||||
|
||||
int indexes[nind];
|
||||
top_k(move, 19*19, nind, indexes);
|
||||
top_k(move, 19*19+1, nind, indexes);
|
||||
if(thresh > move[indexes[0]]) thresh = move[indexes[nind-1]];
|
||||
|
||||
for(i = 0; i < 19; ++i){
|
||||
for(j = 0; j < 19; ++j){
|
||||
if (move[i*19 + j] < thresh) move[i*19 + j] = 0;
|
||||
}
|
||||
for(i = 0; i < 19*19+1; ++i){
|
||||
if (move[i] < thresh) move[i] = 0;
|
||||
}
|
||||
|
||||
|
||||
int max = max_index(move, 19*19);
|
||||
int max = max_index(move, 19*19+1);
|
||||
int row = max / 19;
|
||||
int col = max % 19;
|
||||
int index = sample_array(move, 19*19);
|
||||
int index = sample_array(move, 19*19+1);
|
||||
|
||||
if(print){
|
||||
top_k(move, 19*19, nind, indexes);
|
||||
top_k(move, 19*19+1, nind, indexes);
|
||||
for(i = 0; i < nind; ++i){
|
||||
if (!move[indexes[i]]) indexes[i] = -1;
|
||||
}
|
||||
print_board(board, player, indexes);
|
||||
print_board(stderr, board, player, indexes);
|
||||
for(i = 0; i < nind; ++i){
|
||||
fprintf(stderr, "%d: %f\n", i+1, move[indexes[i]]);
|
||||
}
|
||||
}
|
||||
if (row == 19) return -1;
|
||||
|
||||
if(suicide_go(board, player, row, col)){
|
||||
if (suicide_go(board, player, row, col)){
|
||||
return -1;
|
||||
}
|
||||
if(suicide_go(board, player, index/19, index%19)) index = max;
|
||||
|
||||
if (suicide_go(board, player, index/19, index%19)){
|
||||
index = max;
|
||||
}
|
||||
if (index == 19*19) return -1;
|
||||
return index;
|
||||
}
|
||||
|
||||
void valid_go(char *cfgfile, char *weightfile, int multi)
|
||||
void valid_go(char *cfgfile, char *weightfile, int multi, char *filename)
|
||||
{
|
||||
srand(time(0));
|
||||
char *base = basecfg(cfgfile);
|
||||
@ -411,8 +455,9 @@ void valid_go(char *cfgfile, char *weightfile, int multi)
|
||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
||||
|
||||
float *board = calloc(19*19, sizeof(float));
|
||||
float *move = calloc(19*19, sizeof(float));
|
||||
moves m = load_go_moves("/home/pjreddie/backup/go.test");
|
||||
float *move = calloc(19*19+1, sizeof(float));
|
||||
// moves m = load_go_moves("/home/pjreddie/backup/go.test");
|
||||
moves m = load_go_moves(filename);
|
||||
|
||||
int N = m.n;
|
||||
int i;
|
||||
@ -430,6 +475,23 @@ void valid_go(char *cfgfile, char *weightfile, int multi)
|
||||
}
|
||||
}
|
||||
|
||||
int print_game(float *board, FILE *fp)
|
||||
{
|
||||
int i, j;
|
||||
int count = 3;
|
||||
fprintf(fp, "komi 6.5\n");
|
||||
fprintf(fp, "boardsize 19\n");
|
||||
fprintf(fp, "clear_board\n");
|
||||
for(j = 0; j < 19; ++j){
|
||||
for(i = 0; i < 19; ++i){
|
||||
if(board[j*19 + i] == 1) fprintf(fp, "play black %c%d\n", 'A'+i+(i>=8), 19-j);
|
||||
if(board[j*19 + i] == -1) fprintf(fp, "play white %c%d\n", 'A'+i+(i>=8), 19-j);
|
||||
if(board[j*19 + i]) ++count;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
void engine_go(char *filename, char *weightfile, int multi)
|
||||
{
|
||||
network net = parse_network_cfg(filename);
|
||||
@ -456,8 +518,12 @@ void engine_go(char *filename, char *weightfile, int multi)
|
||||
printf("=%s 2\n\n", ids);
|
||||
} else if (!strcmp(buff, "name")){
|
||||
printf("=%s DarkGo\n\n", ids);
|
||||
} else if (!strcmp(buff, "time_settings") || !strcmp(buff, "time_left")){
|
||||
char *line = fgetl(stdin);
|
||||
free(line);
|
||||
printf("=%s \n\n", ids);
|
||||
} else if (!strcmp(buff, "version")){
|
||||
printf("=%s 1.0\n\n", ids);
|
||||
printf("=%s 1.0. Want more DarkGo? You can find me on OGS, unlimited games, no waiting! https://online-go.com/user/view/434218\n\n", ids);
|
||||
} else if (!strcmp(buff, "known_command")){
|
||||
char comm[256];
|
||||
scanf("%s", comm);
|
||||
@ -472,11 +538,14 @@ void engine_go(char *filename, char *weightfile, int multi)
|
||||
!strcmp(comm, "komi") ||
|
||||
!strcmp(comm, "final_status_list") ||
|
||||
!strcmp(comm, "play") ||
|
||||
!strcmp(comm, "genmove_white") ||
|
||||
!strcmp(comm, "genmove_black") ||
|
||||
!strcmp(comm, "fixed_handicap") ||
|
||||
!strcmp(comm, "genmove"));
|
||||
if(known) printf("=%s true\n\n", ids);
|
||||
else printf("=%s false\n\n", ids);
|
||||
} else if (!strcmp(buff, "list_commands")){
|
||||
printf("=%s protocol_version\nname\nversion\nknown_command\nlist_commands\nquit\nboardsize\nclear_board\nkomi\nplay\ngenmove\nfinal_status_list\n\n", ids);
|
||||
printf("=%s protocol_version\nshowboard\nname\nversion\nknown_command\nlist_commands\nquit\nboardsize\nclear_board\nkomi\nplay\ngenmove_black\ngenmove_white\ngenmove\nfinal_status_list\nfixed_handicap\n\n", ids);
|
||||
} else if (!strcmp(buff, "quit")){
|
||||
break;
|
||||
} else if (!strcmp(buff, "boardsize")){
|
||||
@ -486,8 +555,17 @@ void engine_go(char *filename, char *weightfile, int multi)
|
||||
if(boardsize != 19){
|
||||
printf("?%s unacceptable size\n\n", ids);
|
||||
} else {
|
||||
memset(board, 0, 19*19*sizeof(float));
|
||||
printf("=%s \n\n", ids);
|
||||
}
|
||||
} else if (!strcmp(buff, "fixed_handicap")){
|
||||
int handicap = 0;
|
||||
scanf("%d", &handicap);
|
||||
int indexes[] = {72, 288, 300, 60, 180, 174, 186, 66, 294};
|
||||
int i;
|
||||
for(i = 0; i < handicap; ++i){
|
||||
board[indexes[i]] = 1;
|
||||
}
|
||||
} else if (!strcmp(buff, "clear_board")){
|
||||
passed = 0;
|
||||
memset(board, 0, 19*19*sizeof(float));
|
||||
@ -496,14 +574,24 @@ void engine_go(char *filename, char *weightfile, int multi)
|
||||
float komi = 0;
|
||||
scanf("%f", &komi);
|
||||
printf("=%s \n\n", ids);
|
||||
} else if (!strcmp(buff, "play")){
|
||||
} else if (!strcmp(buff, "showboard")){
|
||||
printf("=%s \n", ids);
|
||||
print_board(stdout, board, 1, 0);
|
||||
printf("\n");
|
||||
} else if (!strcmp(buff, "play") || !strcmp(buff, "black") || !strcmp(buff, "white")){
|
||||
char color[256];
|
||||
scanf("%s ", color);
|
||||
if(!strcmp(buff, "play"))
|
||||
{
|
||||
scanf("%s ", color);
|
||||
} else {
|
||||
scanf(" ");
|
||||
color[0] = buff[0];
|
||||
}
|
||||
char c;
|
||||
int r;
|
||||
int count = scanf("%c%d", &c, &r);
|
||||
int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
|
||||
if(c == 'p' && count < 2) {
|
||||
if((c == 'p' || c == 'P') && count < 2) {
|
||||
passed = 1;
|
||||
printf("=%s \n\n", ids);
|
||||
char *line = fgetl(stdin);
|
||||
@ -527,13 +615,20 @@ void engine_go(char *filename, char *weightfile, int multi)
|
||||
board_to_string(one, board);
|
||||
|
||||
printf("=%s \n\n", ids);
|
||||
print_board(board, 1, 0);
|
||||
} else if (!strcmp(buff, "genmove")){
|
||||
char color[256];
|
||||
scanf("%s", color);
|
||||
int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
|
||||
//print_board(stderr, board, 1, 0);
|
||||
} else if (!strcmp(buff, "genmove") || !strcmp(buff, "genmove_black") || !strcmp(buff, "genmove_white")){
|
||||
int player = 0;
|
||||
if(!strcmp(buff, "genmove")){
|
||||
char color[256];
|
||||
scanf("%s", color);
|
||||
player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1;
|
||||
} else if (!strcmp(buff, "genmove_black")){
|
||||
player = 1;
|
||||
} else {
|
||||
player = -1;
|
||||
}
|
||||
|
||||
int index = generate_move(net, player, board, multi, .1, .7, two, 1);
|
||||
int index = generate_move(net, player, board, multi, .4, 1, two, 0);
|
||||
if(passed || index < 0){
|
||||
printf("=%s pass\n\n", ids);
|
||||
passed = 0;
|
||||
@ -550,7 +645,7 @@ void engine_go(char *filename, char *weightfile, int multi)
|
||||
row = 19 - row;
|
||||
if (col >= 8) ++col;
|
||||
printf("=%s %c%d\n\n", ids, 'A' + col, row);
|
||||
print_board(board, 1, 0);
|
||||
//print_board(board, 1, 0);
|
||||
}
|
||||
|
||||
} else if (!strcmp(buff, "p")){
|
||||
@ -562,19 +657,10 @@ void engine_go(char *filename, char *weightfile, int multi)
|
||||
char *line = fgetl(stdin);
|
||||
free(line);
|
||||
if(type[0] == 'd' || type[0] == 'D'){
|
||||
int i;
|
||||
FILE *f = fopen("game.txt", "w");
|
||||
int i, j;
|
||||
int count = 2;
|
||||
fprintf(f, "boardsize 19\n");
|
||||
fprintf(f, "clear_board\n");
|
||||
for(j = 0; j < 19; ++j){
|
||||
for(i = 0; i < 19; ++i){
|
||||
if(board[j*19 + i] == 1) fprintf(f, "play black %c%d\n", 'A'+i+(i>=8), 19-j);
|
||||
if(board[j*19 + i] == -1) fprintf(f, "play white %c%d\n", 'A'+i+(i>=8), 19-j);
|
||||
if(board[j*19 + i]) ++count;
|
||||
}
|
||||
}
|
||||
fprintf(f, "final_status_list dead\n");
|
||||
int count = print_game(board, f);
|
||||
fprintf(f, "%s final_status_list dead\n", ids);
|
||||
fclose(f);
|
||||
FILE *p = popen("./gnugo --mode gtp < game.txt", "r");
|
||||
for(i = 0; i < count; ++i){
|
||||
@ -608,44 +694,25 @@ void test_go(char *cfg, char *weights, int multi)
|
||||
srand(time(0));
|
||||
set_batch_network(&net, 1);
|
||||
float *board = calloc(19*19, sizeof(float));
|
||||
float *move = calloc(19*19, sizeof(float));
|
||||
float *move = calloc(19*19+1, sizeof(float));
|
||||
int color = 1;
|
||||
while(1){
|
||||
float *output = network_predict(net, board);
|
||||
copy_cpu(19*19, output, 1, move, 1);
|
||||
int i;
|
||||
if(multi){
|
||||
image bim = float_to_image(19, 19, 1, board);
|
||||
for(i = 1; i < 8; ++i){
|
||||
rotate_image_cw(bim, i);
|
||||
if(i >= 4) flip_image(bim);
|
||||
|
||||
float *output = network_predict(net, board);
|
||||
image oim = float_to_image(19, 19, 1, output);
|
||||
|
||||
if(i >= 4) flip_image(oim);
|
||||
rotate_image_cw(oim, -i);
|
||||
|
||||
axpy_cpu(19*19, 1, output, 1, move, 1);
|
||||
|
||||
if(i >= 4) flip_image(bim);
|
||||
rotate_image_cw(bim, -i);
|
||||
}
|
||||
scal_cpu(19*19, 1./8., move, 1);
|
||||
}
|
||||
for(i = 0; i < 19*19; ++i){
|
||||
if(board[i]) move[i] = 0;
|
||||
}
|
||||
predict_move(net, board, move, multi);
|
||||
|
||||
int indexes[nind];
|
||||
int row, col;
|
||||
top_k(move, 19*19, nind, indexes);
|
||||
print_board(board, color, indexes);
|
||||
top_k(move, 19*19+1, nind, indexes);
|
||||
print_board(stderr, board, color, indexes);
|
||||
for(i = 0; i < nind; ++i){
|
||||
int index = indexes[i];
|
||||
row = index / 19;
|
||||
col = index % 19;
|
||||
printf("%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted)?19 - row : row+1, move[index]*100);
|
||||
if(row == 19){
|
||||
printf("%d: Pass, %.2f%%\n", i+1, move[index]*100);
|
||||
} else {
|
||||
printf("%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted)?19 - row : row+1, move[index]*100);
|
||||
}
|
||||
}
|
||||
//if(color == 1) printf("\u25EF Enter move: ");
|
||||
//else printf("\u25C9 Enter move: ");
|
||||
@ -663,7 +730,9 @@ void test_go(char *cfg, char *weights, int multi)
|
||||
int index = indexes[picked];
|
||||
row = index / 19;
|
||||
col = index % 19;
|
||||
board[row*19 + col] = 1;
|
||||
if(row < 19){
|
||||
move_go(board, 1, row, col);
|
||||
}
|
||||
}
|
||||
} else if (cnum){
|
||||
if (c <= 'T' && c >= 'A'){
|
||||
@ -671,7 +740,7 @@ void test_go(char *cfg, char *weights, int multi)
|
||||
row = (inverted)?19 - row : row-1;
|
||||
col = c - 'A';
|
||||
if (col > 7 && noi) col -= 1;
|
||||
if (num == 2) board[row*19 + col] = 1;
|
||||
if (num == 2) move_go(board, 1, row, col);
|
||||
} else if (c == 'p') {
|
||||
// Pass
|
||||
} else if(c=='b' || c == 'w'){
|
||||
@ -698,19 +767,9 @@ void test_go(char *cfg, char *weights, int multi)
|
||||
|
||||
float score_game(float *board)
|
||||
{
|
||||
int i;
|
||||
FILE *f = fopen("game.txt", "w");
|
||||
int i, j;
|
||||
int count = 3;
|
||||
fprintf(f, "komi 6.5\n");
|
||||
fprintf(f, "boardsize 19\n");
|
||||
fprintf(f, "clear_board\n");
|
||||
for(j = 0; j < 19; ++j){
|
||||
for(i = 0; i < 19; ++i){
|
||||
if(board[j*19 + i] == 1) fprintf(f, "play black %c%d\n", 'A'+i+(i>=8), 19-j);
|
||||
if(board[j*19 + i] == -1) fprintf(f, "play white %c%d\n", 'A'+i+(i>=8), 19-j);
|
||||
if(board[j*19 + i]) ++count;
|
||||
}
|
||||
}
|
||||
int count = print_game(board, f);
|
||||
fprintf(f, "final_score\n");
|
||||
fclose(f);
|
||||
FILE *p = popen("./gnugo --mode gtp < game.txt", "r");
|
||||
@ -747,7 +806,7 @@ void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi)
|
||||
}
|
||||
}
|
||||
srand(time(0));
|
||||
char boards[300][93];
|
||||
char boards[600][93];
|
||||
int count = 0;
|
||||
set_batch_network(&net, 1);
|
||||
set_batch_network(&net2, 1);
|
||||
@ -760,13 +819,15 @@ void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi)
|
||||
int p2 = 0;
|
||||
int total = 0;
|
||||
while(1){
|
||||
if (done || count >= 300){
|
||||
if (done){
|
||||
float score = score_game(board);
|
||||
int i = (score > 0)? 0 : 1;
|
||||
if((score > 0) == (total%2==0)) ++p1;
|
||||
else ++p2;
|
||||
++total;
|
||||
fprintf(stderr, "Total: %d, Player 1: %f, Player 2: %f\n", total, (float)p1/total, (float)p2/total);
|
||||
sleep(1);
|
||||
/*
|
||||
int i = (score > 0)? 0 : 1;
|
||||
int j;
|
||||
for(; i < count; i += 2){
|
||||
for(j = 0; j < 93; ++j){
|
||||
@ -774,6 +835,7 @@ void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi)
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
*/
|
||||
memset(board, 0, 19*19*sizeof(float));
|
||||
player = 1;
|
||||
done = 0;
|
||||
@ -781,10 +843,10 @@ void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi)
|
||||
fflush(stdout);
|
||||
fflush(stderr);
|
||||
}
|
||||
//print_board(board, 1, 0);
|
||||
print_board(stderr, board, 1, 0);
|
||||
//sleep(1);
|
||||
network use = ((total%2==0) == (player==1)) ? net : net2;
|
||||
int index = generate_move(use, player, board, multi, .1, .7, two, 0);
|
||||
int index = generate_move(use, player, board, multi, .4, 1, two, 0);
|
||||
if(index < 0){
|
||||
done = 1;
|
||||
continue;
|
||||
@ -818,13 +880,37 @@ void run_go(int argc, char **argv)
|
||||
return;
|
||||
}
|
||||
|
||||
char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
|
||||
int *gpus = 0;
|
||||
int gpu = 0;
|
||||
int ngpus = 0;
|
||||
if(gpu_list){
|
||||
printf("%s\n", gpu_list);
|
||||
int len = strlen(gpu_list);
|
||||
ngpus = 1;
|
||||
int i;
|
||||
for(i = 0; i < len; ++i){
|
||||
if (gpu_list[i] == ',') ++ngpus;
|
||||
}
|
||||
gpus = calloc(ngpus, sizeof(int));
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
gpus[i] = atoi(gpu_list);
|
||||
gpu_list = strchr(gpu_list, ',')+1;
|
||||
}
|
||||
} else {
|
||||
gpu = gpu_index;
|
||||
gpus = &gpu;
|
||||
ngpus = 1;
|
||||
}
|
||||
int clear = find_arg(argc, argv, "-clear");
|
||||
|
||||
char *cfg = argv[3];
|
||||
char *weights = (argc > 4) ? argv[4] : 0;
|
||||
char *c2 = (argc > 5) ? argv[5] : 0;
|
||||
char *w2 = (argc > 6) ? argv[6] : 0;
|
||||
int multi = find_arg(argc, argv, "-multi");
|
||||
if(0==strcmp(argv[2], "train")) train_go(cfg, weights);
|
||||
else if(0==strcmp(argv[2], "valid")) valid_go(cfg, weights, multi);
|
||||
if(0==strcmp(argv[2], "train")) train_go(cfg, weights, gpus, ngpus, clear);
|
||||
else if(0==strcmp(argv[2], "valid")) valid_go(cfg, weights, multi, c2);
|
||||
else if(0==strcmp(argv[2], "self")) self_go(cfg, weights, c2, w2, multi);
|
||||
else if(0==strcmp(argv[2], "test")) test_go(cfg, weights, multi);
|
||||
else if(0==strcmp(argv[2], "engine")) engine_go(cfg, weights, multi);
|
||||
|
108
src/image.c
108
src/image.c
@ -613,6 +613,21 @@ image float_to_image(int w, int h, int c, float *data)
|
||||
return out;
|
||||
}
|
||||
|
||||
void place_image(image im, int w, int h, int dx, int dy, image canvas)
|
||||
{
|
||||
int x, y, c;
|
||||
for(c = 0; c < im.c; ++c){
|
||||
for(y = 0; y < h; ++y){
|
||||
for(x = 0; x < w; ++x){
|
||||
int rx = ((float)x / w) * im.w;
|
||||
int ry = ((float)y / h) * im.h;
|
||||
float val = bilinear_interpolate(im, rx, ry, c);
|
||||
set_pixel(canvas, x + dx, y + dy, c, val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
image rotate_crop_image(image im, float rad, float s, int w, int h, float dx, float dy, float aspect)
|
||||
{
|
||||
@ -652,6 +667,12 @@ image rotate_image(image im, float rad)
|
||||
return rot;
|
||||
}
|
||||
|
||||
void fill_image(image m, float s)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < m.h*m.w*m.c; ++i) m.data[i] = s;
|
||||
}
|
||||
|
||||
void translate_image(image m, float s)
|
||||
{
|
||||
int i;
|
||||
@ -753,6 +774,27 @@ void composite_3d(char *f1, char *f2, char *out, int delta)
|
||||
#endif
|
||||
}
|
||||
|
||||
image letterbox_image(image im, int w, int h)
|
||||
{
|
||||
int new_w = im.w;
|
||||
int new_h = im.h;
|
||||
if (((float)w/im.w) < ((float)h/im.h)) {
|
||||
new_w = w;
|
||||
new_h = (im.h * w)/im.w;
|
||||
} else {
|
||||
new_h = h;
|
||||
new_w = (im.w * h)/im.h;
|
||||
}
|
||||
image resized = resize_image(im, new_w, new_h);
|
||||
image boxed = make_image(w, h, im.c);
|
||||
fill_image(boxed, .5);
|
||||
//int i;
|
||||
//for(i = 0; i < boxed.w*boxed.h*boxed.c; ++i) boxed.data[i] = 0;
|
||||
embed_image(resized, boxed, (w-new_w)/2, (h-new_h)/2);
|
||||
free_image(resized);
|
||||
return boxed;
|
||||
}
|
||||
|
||||
image resize_max(image im, int max)
|
||||
{
|
||||
int w = im.w;
|
||||
@ -824,6 +866,52 @@ float three_way_min(float a, float b, float c)
|
||||
return (a < b) ? ( (a < c) ? a : c) : ( (b < c) ? b : c) ;
|
||||
}
|
||||
|
||||
void yuv_to_rgb(image im)
|
||||
{
|
||||
assert(im.c == 3);
|
||||
int i, j;
|
||||
float r, g, b;
|
||||
float y, u, v;
|
||||
for(j = 0; j < im.h; ++j){
|
||||
for(i = 0; i < im.w; ++i){
|
||||
y = get_pixel(im, i , j, 0);
|
||||
u = get_pixel(im, i , j, 1);
|
||||
v = get_pixel(im, i , j, 2);
|
||||
|
||||
r = y + 1.13983*v;
|
||||
g = y + -.39465*u + -.58060*v;
|
||||
b = y + 2.03211*u;
|
||||
|
||||
set_pixel(im, i, j, 0, r);
|
||||
set_pixel(im, i, j, 1, g);
|
||||
set_pixel(im, i, j, 2, b);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void rgb_to_yuv(image im)
|
||||
{
|
||||
assert(im.c == 3);
|
||||
int i, j;
|
||||
float r, g, b;
|
||||
float y, u, v;
|
||||
for(j = 0; j < im.h; ++j){
|
||||
for(i = 0; i < im.w; ++i){
|
||||
r = get_pixel(im, i , j, 0);
|
||||
g = get_pixel(im, i , j, 1);
|
||||
b = get_pixel(im, i , j, 2);
|
||||
|
||||
y = .299*r + .587*g + .114*b;
|
||||
u = -.14713*r + -.28886*g + .436*b;
|
||||
v = .615*r + -.51499*g + -.10001*b;
|
||||
|
||||
set_pixel(im, i, j, 0, y);
|
||||
set_pixel(im, i, j, 1, u);
|
||||
set_pixel(im, i, j, 2, v);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// http://www.cs.rit.edu/~ncs/color/t_convert.html
|
||||
void rgb_to_hsv(image im)
|
||||
{
|
||||
@ -903,12 +991,30 @@ void hsv_to_rgb(image im)
|
||||
}
|
||||
}
|
||||
|
||||
void grayscale_image_3c(image im)
|
||||
{
|
||||
assert(im.c == 3);
|
||||
int i, j, k;
|
||||
float scale[] = {0.299, 0.587, 0.114};
|
||||
for(j = 0; j < im.h; ++j){
|
||||
for(i = 0; i < im.w; ++i){
|
||||
float val = 0;
|
||||
for(k = 0; k < 3; ++k){
|
||||
val += scale[k]*get_pixel(im, i, j, k);
|
||||
}
|
||||
im.data[0*im.h*im.w + im.w*j + i] = val;
|
||||
im.data[1*im.h*im.w + im.w*j + i] = val;
|
||||
im.data[2*im.h*im.w + im.w*j + i] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
image grayscale_image(image im)
|
||||
{
|
||||
assert(im.c == 3);
|
||||
int i, j, k;
|
||||
image gray = make_image(im.w, im.h, 1);
|
||||
float scale[] = {0.587, 0.299, 0.114};
|
||||
float scale[] = {0.299, 0.587, 0.114};
|
||||
for(k = 0; k < im.c; ++k){
|
||||
for(j = 0; j < im.h; ++j){
|
||||
for(i = 0; i < im.w; ++i){
|
||||
|
@ -29,25 +29,32 @@ image crop_image(image im, int dx, int dy, int w, int h);
|
||||
image random_crop_image(image im, int w, int h);
|
||||
image random_augment_image(image im, float angle, float aspect, int low, int high, int size);
|
||||
void random_distort_image(image im, float hue, float saturation, float exposure);
|
||||
image letterbox_image(image im, int w, int h);
|
||||
image resize_image(image im, int w, int h);
|
||||
image resize_min(image im, int min);
|
||||
image resize_max(image im, int max);
|
||||
void fill_image(image m, float s);
|
||||
void translate_image(image m, float s);
|
||||
void normalize_image(image p);
|
||||
image rotate_image(image m, float rad);
|
||||
void rotate_image_cw(image im, int times);
|
||||
void embed_image(image source, image dest, int dx, int dy);
|
||||
void place_image(image im, int w, int h, int dx, int dy, image canvas);
|
||||
void saturate_image(image im, float sat);
|
||||
void exposure_image(image im, float sat);
|
||||
void distort_image(image im, float hue, float sat, float val);
|
||||
void saturate_exposure_image(image im, float sat, float exposure);
|
||||
void rgb_to_hsv(image im);
|
||||
void hsv_to_rgb(image im);
|
||||
void yuv_to_rgb(image im);
|
||||
void rgb_to_yuv(image im);
|
||||
void rgbgr_image(image im);
|
||||
void constrain_image(image im);
|
||||
void composite_3d(char *f1, char *f2, char *out, int delta);
|
||||
int best_3d_shift_r(image a, image b, int min, int max);
|
||||
|
||||
image grayscale_image(image im);
|
||||
void grayscale_image_3c(image im);
|
||||
image threshold_image(image im, float thresh);
|
||||
|
||||
image collapse_image_layers(image source, int border);
|
||||
|
@ -32,7 +32,6 @@ void free_layer(layer l)
|
||||
if(l.scale_updates) free(l.scale_updates);
|
||||
if(l.weights) free(l.weights);
|
||||
if(l.weight_updates) free(l.weight_updates);
|
||||
if(l.col_image) free(l.col_image);
|
||||
if(l.delta) free(l.delta);
|
||||
if(l.output) free(l.output);
|
||||
if(l.squared) free(l.squared);
|
||||
@ -80,7 +79,6 @@ void free_layer(layer l)
|
||||
if(l.rolling_variance_gpu) cuda_free(l.rolling_variance_gpu);
|
||||
if(l.variance_delta_gpu) cuda_free(l.variance_delta_gpu);
|
||||
if(l.mean_delta_gpu) cuda_free(l.mean_delta_gpu);
|
||||
if(l.col_image_gpu) cuda_free(l.col_image_gpu);
|
||||
if(l.x_gpu) cuda_free(l.x_gpu);
|
||||
if(l.x_norm_gpu) cuda_free(l.x_norm_gpu);
|
||||
if(l.weights_gpu) cuda_free(l.weights_gpu);
|
||||
|
13
src/layer.h
13
src/layer.h
@ -38,7 +38,7 @@ typedef enum {
|
||||
} LAYER_TYPE;
|
||||
|
||||
typedef enum{
|
||||
SSE, MASKED, SMOOTH
|
||||
SSE, MASKED, L1, SMOOTH
|
||||
} COST_TYPE;
|
||||
|
||||
struct layer{
|
||||
@ -58,6 +58,7 @@ struct layer{
|
||||
int flipped;
|
||||
int inputs;
|
||||
int outputs;
|
||||
int extra;
|
||||
int truths;
|
||||
int h,w,c;
|
||||
int out_h, out_w, out_c;
|
||||
@ -68,6 +69,7 @@ struct layer{
|
||||
int side;
|
||||
int stride;
|
||||
int reverse;
|
||||
int flatten;
|
||||
int pad;
|
||||
int sqrt;
|
||||
int flip;
|
||||
@ -76,6 +78,8 @@ struct layer{
|
||||
int xnor;
|
||||
int steps;
|
||||
int hidden;
|
||||
int truth;
|
||||
float smooth;
|
||||
float dot;
|
||||
float angle;
|
||||
float jitter;
|
||||
@ -83,6 +87,7 @@ struct layer{
|
||||
float exposure;
|
||||
float shift;
|
||||
float ratio;
|
||||
float learning_rate_scale;
|
||||
int softmax;
|
||||
int classes;
|
||||
int coords;
|
||||
@ -115,6 +120,8 @@ struct layer{
|
||||
int classfix;
|
||||
int absolute;
|
||||
|
||||
int onlyforward;
|
||||
int stopbackward;
|
||||
int dontload;
|
||||
int dontloadscales;
|
||||
|
||||
@ -149,7 +156,6 @@ struct layer{
|
||||
float * weights;
|
||||
float * weight_updates;
|
||||
|
||||
float * col_image;
|
||||
float * delta;
|
||||
float * output;
|
||||
float * squared;
|
||||
@ -235,8 +241,6 @@ struct layer{
|
||||
float * variance_delta_gpu;
|
||||
float * mean_delta_gpu;
|
||||
|
||||
float * col_image_gpu;
|
||||
|
||||
float * x_gpu;
|
||||
float * x_norm_gpu;
|
||||
float * weights_gpu;
|
||||
@ -256,6 +260,7 @@ struct layer{
|
||||
#ifdef CUDNN
|
||||
cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc;
|
||||
cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc;
|
||||
cudnnTensorDescriptor_t normTensorDesc;
|
||||
cudnnFilterDescriptor_t weightDesc;
|
||||
cudnnFilterDescriptor_t dweightDesc;
|
||||
cudnnConvolutionDescriptor_t convDesc;
|
||||
|
@ -57,9 +57,10 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in
|
||||
float scale = sqrt(2./(size*size*c));
|
||||
for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1,1);
|
||||
|
||||
l.col_image = calloc(out_h*out_w*size*size*c, sizeof(float));
|
||||
l.output = calloc(l.batch*out_h * out_w * n, sizeof(float));
|
||||
l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float));
|
||||
|
||||
l.workspace_size = out_h*out_w*size*size*c;
|
||||
|
||||
l.forward = forward_local_layer;
|
||||
l.backward = backward_local_layer;
|
||||
@ -76,7 +77,6 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in
|
||||
l.biases_gpu = cuda_make_array(l.biases, l.outputs);
|
||||
l.bias_updates_gpu = cuda_make_array(l.bias_updates, l.outputs);
|
||||
|
||||
l.col_image_gpu = cuda_make_array(l.col_image, out_h*out_w*size*size*c);
|
||||
l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n);
|
||||
l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n);
|
||||
|
||||
@ -102,11 +102,11 @@ void forward_local_layer(const local_layer l, network_state state)
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
float *input = state.input + i*l.w*l.h*l.c;
|
||||
im2col_cpu(input, l.c, l.h, l.w,
|
||||
l.size, l.stride, l.pad, l.col_image);
|
||||
l.size, l.stride, l.pad, state.workspace);
|
||||
float *output = l.output + i*l.outputs;
|
||||
for(j = 0; j < locations; ++j){
|
||||
float *a = l.weights + j*l.size*l.size*l.c*l.n;
|
||||
float *b = l.col_image + j;
|
||||
float *b = state.workspace + j;
|
||||
float *c = output + j;
|
||||
|
||||
int m = l.n;
|
||||
@ -133,11 +133,11 @@ void backward_local_layer(local_layer l, network_state state)
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
float *input = state.input + i*l.w*l.h*l.c;
|
||||
im2col_cpu(input, l.c, l.h, l.w,
|
||||
l.size, l.stride, l.pad, l.col_image);
|
||||
l.size, l.stride, l.pad, state.workspace);
|
||||
|
||||
for(j = 0; j < locations; ++j){
|
||||
float *a = l.delta + i*l.outputs + j;
|
||||
float *b = l.col_image + j;
|
||||
float *b = state.workspace + j;
|
||||
float *c = l.weight_updates + j*l.size*l.size*l.c*l.n;
|
||||
int m = l.n;
|
||||
int n = l.size*l.size*l.c;
|
||||
@ -150,7 +150,7 @@ void backward_local_layer(local_layer l, network_state state)
|
||||
for(j = 0; j < locations; ++j){
|
||||
float *a = l.weights + j*l.size*l.size*l.c*l.n;
|
||||
float *b = l.delta + i*l.outputs + j;
|
||||
float *c = l.col_image + j;
|
||||
float *c = state.workspace + j;
|
||||
|
||||
int m = l.size*l.size*l.c;
|
||||
int n = 1;
|
||||
@ -159,7 +159,7 @@ void backward_local_layer(local_layer l, network_state state)
|
||||
gemm(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
|
||||
}
|
||||
|
||||
col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
|
||||
col2im_cpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -192,11 +192,11 @@ void forward_local_layer_gpu(const local_layer l, network_state state)
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
float *input = state.input + i*l.w*l.h*l.c;
|
||||
im2col_ongpu(input, l.c, l.h, l.w,
|
||||
l.size, l.stride, l.pad, l.col_image_gpu);
|
||||
l.size, l.stride, l.pad, state.workspace);
|
||||
float *output = l.output_gpu + i*l.outputs;
|
||||
for(j = 0; j < locations; ++j){
|
||||
float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
|
||||
float *b = l.col_image_gpu + j;
|
||||
float *b = state.workspace + j;
|
||||
float *c = output + j;
|
||||
|
||||
int m = l.n;
|
||||
@ -222,11 +222,11 @@ void backward_local_layer_gpu(local_layer l, network_state state)
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
float *input = state.input + i*l.w*l.h*l.c;
|
||||
im2col_ongpu(input, l.c, l.h, l.w,
|
||||
l.size, l.stride, l.pad, l.col_image_gpu);
|
||||
l.size, l.stride, l.pad, state.workspace);
|
||||
|
||||
for(j = 0; j < locations; ++j){
|
||||
float *a = l.delta_gpu + i*l.outputs + j;
|
||||
float *b = l.col_image_gpu + j;
|
||||
float *b = state.workspace + j;
|
||||
float *c = l.weight_updates_gpu + j*l.size*l.size*l.c*l.n;
|
||||
int m = l.n;
|
||||
int n = l.size*l.size*l.c;
|
||||
@ -239,7 +239,7 @@ void backward_local_layer_gpu(local_layer l, network_state state)
|
||||
for(j = 0; j < locations; ++j){
|
||||
float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n;
|
||||
float *b = l.delta_gpu + i*l.outputs + j;
|
||||
float *c = l.col_image_gpu + j;
|
||||
float *c = state.workspace + j;
|
||||
|
||||
int m = l.size*l.size*l.c;
|
||||
int n = 1;
|
||||
@ -248,7 +248,7 @@ void backward_local_layer_gpu(local_layer l, network_state state)
|
||||
gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations);
|
||||
}
|
||||
|
||||
col2im_ongpu(l.col_image_gpu, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
|
||||
col2im_ongpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
924
src/lsd.c
Normal file
924
src/lsd.c
Normal file
@ -0,0 +1,924 @@
|
||||
#include "network.h"
|
||||
#include "cost_layer.h"
|
||||
#include "utils.h"
|
||||
#include "parser.h"
|
||||
#include "blas.h"
|
||||
|
||||
#ifdef OPENCV
|
||||
#include "opencv2/highgui/highgui_c.h"
|
||||
#endif
|
||||
|
||||
void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg, char *aweight, int clear)
|
||||
{
|
||||
#ifdef GPU
|
||||
//char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
|
||||
char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list";
|
||||
//char *style_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
|
||||
char *style_images = "/home/pjreddie/zelda.txt";
|
||||
char *backup_directory = "/home/pjreddie/backup/";
|
||||
srand(time(0));
|
||||
network fnet = load_network(fcfg, fweight, clear);
|
||||
network gnet = load_network(gcfg, gweight, clear);
|
||||
network anet = load_network(acfg, aweight, clear);
|
||||
char *gbase = basecfg(gcfg);
|
||||
char *abase = basecfg(acfg);
|
||||
|
||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet.learning_rate, gnet.momentum, gnet.decay);
|
||||
int imgs = gnet.batch*gnet.subdivisions;
|
||||
int i = *gnet.seen/imgs;
|
||||
data train, tbuffer;
|
||||
data style, sbuffer;
|
||||
|
||||
|
||||
list *slist = get_paths(style_images);
|
||||
char **spaths = (char **)list_to_array(slist);
|
||||
|
||||
list *tlist = get_paths(train_images);
|
||||
char **tpaths = (char **)list_to_array(tlist);
|
||||
|
||||
load_args targs= get_base_args(gnet);
|
||||
targs.paths = tpaths;
|
||||
targs.n = imgs;
|
||||
targs.m = tlist->size;
|
||||
targs.d = &tbuffer;
|
||||
targs.type = CLASSIFICATION_DATA;
|
||||
targs.classes = 1;
|
||||
char *ls[1] = {"zelda"};
|
||||
targs.labels = ls;
|
||||
|
||||
load_args sargs = get_base_args(gnet);
|
||||
sargs.paths = spaths;
|
||||
sargs.n = imgs;
|
||||
sargs.m = slist->size;
|
||||
sargs.d = &sbuffer;
|
||||
sargs.type = CLASSIFICATION_DATA;
|
||||
sargs.classes = 1;
|
||||
sargs.labels = ls;
|
||||
|
||||
pthread_t tload_thread = load_data_in_thread(targs);
|
||||
pthread_t sload_thread = load_data_in_thread(sargs);
|
||||
clock_t time;
|
||||
|
||||
float aloss_avg = -1;
|
||||
float floss_avg = -1;
|
||||
|
||||
network_state fstate = {0};
|
||||
fstate.index = 0;
|
||||
fstate.net = fnet;
|
||||
int x_size = get_network_input_size(fnet)*fnet.batch;
|
||||
int y_size = get_network_output_size(fnet)*fnet.batch;
|
||||
fstate.input = cuda_make_array(0, x_size);
|
||||
fstate.truth = cuda_make_array(0, y_size);
|
||||
fstate.delta = cuda_make_array(0, x_size);
|
||||
fstate.train = 1;
|
||||
float *X = calloc(x_size, sizeof(float));
|
||||
float *y = calloc(y_size, sizeof(float));
|
||||
|
||||
float *ones = cuda_make_array(0, anet.batch);
|
||||
float *zeros = cuda_make_array(0, anet.batch);
|
||||
fill_ongpu(anet.batch, .99, ones, 1);
|
||||
fill_ongpu(anet.batch, .01, zeros, 1);
|
||||
|
||||
network_state astate = {0};
|
||||
astate.index = 0;
|
||||
astate.net = anet;
|
||||
int ax_size = get_network_input_size(anet)*anet.batch;
|
||||
int ay_size = get_network_output_size(anet)*anet.batch;
|
||||
astate.input = 0;
|
||||
astate.truth = ones;
|
||||
astate.delta = cuda_make_array(0, ax_size);
|
||||
astate.train = 1;
|
||||
|
||||
network_state gstate = {0};
|
||||
gstate.index = 0;
|
||||
gstate.net = gnet;
|
||||
int gx_size = get_network_input_size(gnet)*gnet.batch;
|
||||
int gy_size = get_network_output_size(gnet)*gnet.batch;
|
||||
gstate.input = cuda_make_array(0, gx_size);
|
||||
gstate.truth = 0;
|
||||
gstate.delta = 0;
|
||||
gstate.train = 1;
|
||||
|
||||
while (get_current_batch(gnet) < gnet.max_batches) {
|
||||
i += 1;
|
||||
time=clock();
|
||||
pthread_join(tload_thread, 0);
|
||||
pthread_join(sload_thread, 0);
|
||||
train = tbuffer;
|
||||
style = sbuffer;
|
||||
tload_thread = load_data_in_thread(targs);
|
||||
sload_thread = load_data_in_thread(sargs);
|
||||
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
|
||||
data generated = copy_data(train);
|
||||
time=clock();
|
||||
|
||||
int j, k;
|
||||
float floss = 0;
|
||||
for(j = 0; j < fnet.subdivisions; ++j){
|
||||
layer imlayer = gnet.layers[gnet.n - 1];
|
||||
get_next_batch(train, fnet.batch, j*fnet.batch, X, y);
|
||||
|
||||
cuda_push_array(fstate.input, X, x_size);
|
||||
cuda_push_array(gstate.input, X, gx_size);
|
||||
*gnet.seen += gnet.batch;
|
||||
|
||||
forward_network_gpu(fnet, fstate);
|
||||
float *feats = fnet.layers[fnet.n - 2].output_gpu;
|
||||
copy_ongpu(y_size, feats, 1, fstate.truth, 1);
|
||||
|
||||
forward_network_gpu(gnet, gstate);
|
||||
float *gen = gnet.layers[gnet.n-1].output_gpu;
|
||||
copy_ongpu(x_size, gen, 1, fstate.input, 1);
|
||||
|
||||
fill_ongpu(x_size, 0, fstate.delta, 1);
|
||||
forward_network_gpu(fnet, fstate);
|
||||
backward_network_gpu(fnet, fstate);
|
||||
//HERE
|
||||
|
||||
astate.input = gen;
|
||||
fill_ongpu(ax_size, 0, astate.delta, 1);
|
||||
forward_network_gpu(anet, astate);
|
||||
backward_network_gpu(anet, astate);
|
||||
|
||||
float *delta = imlayer.delta_gpu;
|
||||
fill_ongpu(x_size, 0, delta, 1);
|
||||
scal_ongpu(x_size, 100, astate.delta, 1);
|
||||
scal_ongpu(x_size, .00001, fstate.delta, 1);
|
||||
axpy_ongpu(x_size, 1, fstate.delta, 1, delta, 1);
|
||||
axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1);
|
||||
|
||||
//fill_ongpu(x_size, 0, delta, 1);
|
||||
//cuda_push_array(delta, X, x_size);
|
||||
//axpy_ongpu(x_size, -1, imlayer.output_gpu, 1, delta, 1);
|
||||
//printf("pix error: %f\n", cuda_mag_array(delta, x_size));
|
||||
printf("fea error: %f\n", cuda_mag_array(fstate.delta, x_size));
|
||||
printf("adv error: %f\n", cuda_mag_array(astate.delta, x_size));
|
||||
//axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1);
|
||||
|
||||
backward_network_gpu(gnet, gstate);
|
||||
|
||||
floss += get_network_cost(fnet) /(fnet.subdivisions*fnet.batch);
|
||||
|
||||
cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size);
|
||||
for(k = 0; k < gnet.batch; ++k){
|
||||
int index = j*gnet.batch + k;
|
||||
copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1);
|
||||
generated.y.vals[index][0] = .01;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
image sim = float_to_image(anet.w, anet.h, anet.c, style.X.vals[j]);
|
||||
show_image(sim, "style");
|
||||
cvWaitKey(0);
|
||||
*/
|
||||
|
||||
harmless_update_network_gpu(anet);
|
||||
|
||||
data merge = concat_data(style, generated);
|
||||
randomize_data(merge);
|
||||
float aloss = train_network(anet, merge);
|
||||
|
||||
update_network_gpu(gnet);
|
||||
|
||||
free_data(merge);
|
||||
free_data(train);
|
||||
free_data(generated);
|
||||
free_data(style);
|
||||
if (aloss_avg < 0) aloss_avg = aloss;
|
||||
if (floss_avg < 0) floss_avg = floss;
|
||||
aloss_avg = aloss_avg*.9 + aloss*.1;
|
||||
floss_avg = floss_avg*.9 + floss*.1;
|
||||
|
||||
printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, floss, aloss, floss_avg, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
|
||||
if(i%1000==0){
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, gbase, i);
|
||||
save_weights(gnet, buff);
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
|
||||
save_weights(anet, buff);
|
||||
}
|
||||
if(i%100==0){
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s.backup", backup_directory, gbase);
|
||||
save_weights(gnet, buff);
|
||||
sprintf(buff, "%s/%s.backup", backup_directory, abase);
|
||||
save_weights(anet, buff);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear)
|
||||
{
|
||||
#ifdef GPU
|
||||
//char *train_images = "/home/pjreddie/data/coco/train1.txt";
|
||||
//char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
|
||||
char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list";
|
||||
char *backup_directory = "/home/pjreddie/backup/";
|
||||
srand(time(0));
|
||||
char *base = basecfg(cfg);
|
||||
char *abase = basecfg(acfg);
|
||||
printf("%s\n", base);
|
||||
network net = load_network(cfg, weight, clear);
|
||||
network anet = load_network(acfg, aweight, clear);
|
||||
|
||||
int i, j, k;
|
||||
layer imlayer = {0};
|
||||
for (i = 0; i < net.n; ++i) {
|
||||
if (net.layers[i].out_c == 3) {
|
||||
imlayer = net.layers[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
||||
int imgs = net.batch*net.subdivisions;
|
||||
i = *net.seen/imgs;
|
||||
data train, buffer;
|
||||
|
||||
|
||||
list *plist = get_paths(train_images);
|
||||
//int N = plist->size;
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net.w;
|
||||
args.h = net.h;
|
||||
args.paths = paths;
|
||||
args.n = imgs;
|
||||
args.m = plist->size;
|
||||
args.d = &buffer;
|
||||
|
||||
args.min = net.min_crop;
|
||||
args.max = net.max_crop;
|
||||
args.angle = net.angle;
|
||||
args.aspect = net.aspect;
|
||||
args.exposure = net.exposure;
|
||||
args.saturation = net.saturation;
|
||||
args.hue = net.hue;
|
||||
args.size = net.w;
|
||||
args.type = CLASSIFICATION_DATA;
|
||||
args.classes = 1;
|
||||
char *ls[1] = {"coco"};
|
||||
args.labels = ls;
|
||||
|
||||
pthread_t load_thread = load_data_in_thread(args);
|
||||
clock_t time;
|
||||
|
||||
network_state gstate = {0};
|
||||
gstate.index = 0;
|
||||
gstate.net = net;
|
||||
int x_size = get_network_input_size(net)*net.batch;
|
||||
int y_size = x_size;
|
||||
gstate.input = cuda_make_array(0, x_size);
|
||||
gstate.truth = cuda_make_array(0, y_size);
|
||||
gstate.delta = 0;
|
||||
gstate.train = 1;
|
||||
float *pixs = calloc(x_size, sizeof(float));
|
||||
float *graypixs = calloc(x_size, sizeof(float));
|
||||
float *y = calloc(y_size, sizeof(float));
|
||||
|
||||
network_state astate = {0};
|
||||
astate.index = 0;
|
||||
astate.net = anet;
|
||||
int ay_size = get_network_output_size(anet)*anet.batch;
|
||||
astate.input = 0;
|
||||
astate.truth = 0;
|
||||
astate.delta = 0;
|
||||
astate.train = 1;
|
||||
|
||||
float *imerror = cuda_make_array(0, imlayer.outputs);
|
||||
float *ones_gpu = cuda_make_array(0, ay_size);
|
||||
fill_ongpu(ay_size, .9, ones_gpu, 1);
|
||||
|
||||
float aloss_avg = -1;
|
||||
float gloss_avg = -1;
|
||||
|
||||
//data generated = copy_data(train);
|
||||
|
||||
while (get_current_batch(net) < net.max_batches) {
|
||||
i += 1;
|
||||
time=clock();
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
load_thread = load_data_in_thread(args);
|
||||
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
|
||||
data gray = copy_data(train);
|
||||
for(j = 0; j < imgs; ++j){
|
||||
image gim = float_to_image(net.w, net.h, net.c, gray.X.vals[j]);
|
||||
grayscale_image_3c(gim);
|
||||
train.y.vals[j][0] = .9;
|
||||
|
||||
image yim = float_to_image(net.w, net.h, net.c, train.X.vals[j]);
|
||||
//rgb_to_yuv(yim);
|
||||
}
|
||||
time=clock();
|
||||
float gloss = 0;
|
||||
|
||||
for(j = 0; j < net.subdivisions; ++j){
|
||||
get_next_batch(train, net.batch, j*net.batch, pixs, y);
|
||||
get_next_batch(gray, net.batch, j*net.batch, graypixs, y);
|
||||
cuda_push_array(gstate.input, graypixs, x_size);
|
||||
cuda_push_array(gstate.truth, pixs, x_size);
|
||||
/*
|
||||
image origi = float_to_image(net.w, net.h, 3, pixs);
|
||||
image grayi = float_to_image(net.w, net.h, 3, graypixs);
|
||||
show_image(grayi, "gray");
|
||||
show_image(origi, "orig");
|
||||
cvWaitKey(0);
|
||||
*/
|
||||
*net.seen += net.batch;
|
||||
forward_network_gpu(net, gstate);
|
||||
|
||||
fill_ongpu(imlayer.outputs, 0, imerror, 1);
|
||||
astate.input = imlayer.output_gpu;
|
||||
astate.delta = imerror;
|
||||
astate.truth = ones_gpu;
|
||||
forward_network_gpu(anet, astate);
|
||||
backward_network_gpu(anet, astate);
|
||||
|
||||
scal_ongpu(imlayer.outputs, .1, net.layers[net.n-1].delta_gpu, 1);
|
||||
|
||||
backward_network_gpu(net, gstate);
|
||||
|
||||
scal_ongpu(imlayer.outputs, 100, imerror, 1);
|
||||
|
||||
printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));
|
||||
printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs));
|
||||
|
||||
axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
|
||||
|
||||
gloss += get_network_cost(net) /(net.subdivisions*net.batch);
|
||||
|
||||
cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size);
|
||||
for(k = 0; k < net.batch; ++k){
|
||||
int index = j*net.batch + k;
|
||||
copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1);
|
||||
gray.y.vals[index][0] = .1;
|
||||
}
|
||||
}
|
||||
harmless_update_network_gpu(anet);
|
||||
|
||||
data merge = concat_data(train, gray);
|
||||
randomize_data(merge);
|
||||
float aloss = train_network(anet, merge);
|
||||
|
||||
update_network_gpu(net);
|
||||
update_network_gpu(anet);
|
||||
free_data(merge);
|
||||
free_data(train);
|
||||
free_data(gray);
|
||||
if (aloss_avg < 0) aloss_avg = aloss;
|
||||
aloss_avg = aloss_avg*.9 + aloss*.1;
|
||||
gloss_avg = gloss_avg*.9 + gloss*.1;
|
||||
|
||||
printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
|
||||
if(i%1000==0){
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
|
||||
save_weights(net, buff);
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
|
||||
save_weights(anet, buff);
|
||||
}
|
||||
if(i%100==0){
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s.backup", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
sprintf(buff, "%s/%s.backup", backup_directory, abase);
|
||||
save_weights(anet, buff);
|
||||
}
|
||||
}
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_final.weights", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
#endif
|
||||
}
|
||||
|
||||
void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int clear)
|
||||
{
|
||||
#ifdef GPU
|
||||
//char *train_images = "/home/pjreddie/data/coco/train1.txt";
|
||||
//char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
|
||||
char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list";
|
||||
char *backup_directory = "/home/pjreddie/backup/";
|
||||
srand(time(0));
|
||||
char *base = basecfg(cfg);
|
||||
char *abase = basecfg(acfg);
|
||||
printf("%s\n", base);
|
||||
network net = load_network(cfg, weight, clear);
|
||||
network anet = load_network(acfg, aweight, clear);
|
||||
|
||||
int i, j, k;
|
||||
layer imlayer = {0};
|
||||
for (i = 0; i < net.n; ++i) {
|
||||
if (net.layers[i].out_c == 3) {
|
||||
imlayer = net.layers[i];
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
||||
int imgs = net.batch*net.subdivisions;
|
||||
i = *net.seen/imgs;
|
||||
data train, buffer;
|
||||
|
||||
|
||||
list *plist = get_paths(train_images);
|
||||
//int N = plist->size;
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net.w;
|
||||
args.h = net.h;
|
||||
args.paths = paths;
|
||||
args.n = imgs;
|
||||
args.m = plist->size;
|
||||
args.d = &buffer;
|
||||
|
||||
args.min = net.min_crop;
|
||||
args.max = net.max_crop;
|
||||
args.angle = net.angle;
|
||||
args.aspect = net.aspect;
|
||||
args.exposure = net.exposure;
|
||||
args.saturation = net.saturation;
|
||||
args.hue = net.hue;
|
||||
args.size = net.w;
|
||||
args.type = CLASSIFICATION_DATA;
|
||||
args.classes = 1;
|
||||
char *ls[1] = {"imagenet"};
|
||||
args.labels = ls;
|
||||
|
||||
pthread_t load_thread = load_data_in_thread(args);
|
||||
clock_t time;
|
||||
|
||||
network_state gstate = {0};
|
||||
gstate.index = 0;
|
||||
gstate.net = net;
|
||||
int x_size = get_network_input_size(net)*net.batch;
|
||||
int y_size = x_size;
|
||||
gstate.input = cuda_make_array(0, x_size);
|
||||
gstate.truth = cuda_make_array(0, y_size);
|
||||
gstate.delta = 0;
|
||||
gstate.train = 1;
|
||||
float *pixs = calloc(x_size, sizeof(float));
|
||||
float *graypixs = calloc(x_size, sizeof(float));
|
||||
float *y = calloc(y_size, sizeof(float));
|
||||
|
||||
network_state astate = {0};
|
||||
astate.index = 0;
|
||||
astate.net = anet;
|
||||
int ay_size = get_network_output_size(anet)*anet.batch;
|
||||
astate.input = 0;
|
||||
astate.truth = 0;
|
||||
astate.delta = 0;
|
||||
astate.train = 1;
|
||||
|
||||
float *imerror = cuda_make_array(0, imlayer.outputs);
|
||||
float *ones_gpu = cuda_make_array(0, ay_size);
|
||||
fill_ongpu(ay_size, .99, ones_gpu, 1);
|
||||
|
||||
float aloss_avg = -1;
|
||||
float gloss_avg = -1;
|
||||
|
||||
//data generated = copy_data(train);
|
||||
|
||||
while (get_current_batch(net) < net.max_batches) {
|
||||
i += 1;
|
||||
time=clock();
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
load_thread = load_data_in_thread(args);
|
||||
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
|
||||
data gray = copy_data(train);
|
||||
for(j = 0; j < imgs; ++j){
|
||||
image gim = float_to_image(net.w, net.h, net.c, gray.X.vals[j]);
|
||||
grayscale_image_3c(gim);
|
||||
train.y.vals[j][0] = .99;
|
||||
|
||||
image yim = float_to_image(net.w, net.h, net.c, train.X.vals[j]);
|
||||
//rgb_to_yuv(yim);
|
||||
}
|
||||
time=clock();
|
||||
float gloss = 0;
|
||||
|
||||
for(j = 0; j < net.subdivisions; ++j){
|
||||
get_next_batch(train, net.batch, j*net.batch, pixs, y);
|
||||
get_next_batch(gray, net.batch, j*net.batch, graypixs, y);
|
||||
cuda_push_array(gstate.input, graypixs, x_size);
|
||||
cuda_push_array(gstate.truth, pixs, x_size);
|
||||
/*
|
||||
image origi = float_to_image(net.w, net.h, 3, pixs);
|
||||
image grayi = float_to_image(net.w, net.h, 3, graypixs);
|
||||
show_image(grayi, "gray");
|
||||
show_image(origi, "orig");
|
||||
cvWaitKey(0);
|
||||
*/
|
||||
*net.seen += net.batch;
|
||||
forward_network_gpu(net, gstate);
|
||||
|
||||
fill_ongpu(imlayer.outputs, 0, imerror, 1);
|
||||
astate.input = imlayer.output_gpu;
|
||||
astate.delta = imerror;
|
||||
astate.truth = ones_gpu;
|
||||
forward_network_gpu(anet, astate);
|
||||
backward_network_gpu(anet, astate);
|
||||
|
||||
scal_ongpu(imlayer.outputs, .1, net.layers[net.n-1].delta_gpu, 1);
|
||||
|
||||
backward_network_gpu(net, gstate);
|
||||
|
||||
scal_ongpu(imlayer.outputs, 100, imerror, 1);
|
||||
|
||||
printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));
|
||||
printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs));
|
||||
|
||||
axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
|
||||
|
||||
gloss += get_network_cost(net) /(net.subdivisions*net.batch);
|
||||
|
||||
cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size);
|
||||
for(k = 0; k < net.batch; ++k){
|
||||
int index = j*net.batch + k;
|
||||
copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1);
|
||||
gray.y.vals[index][0] = .01;
|
||||
}
|
||||
}
|
||||
harmless_update_network_gpu(anet);
|
||||
|
||||
data merge = concat_data(train, gray);
|
||||
randomize_data(merge);
|
||||
float aloss = train_network(anet, merge);
|
||||
|
||||
update_network_gpu(net);
|
||||
update_network_gpu(anet);
|
||||
free_data(merge);
|
||||
free_data(train);
|
||||
free_data(gray);
|
||||
if (aloss_avg < 0) aloss_avg = aloss;
|
||||
aloss_avg = aloss_avg*.9 + aloss*.1;
|
||||
gloss_avg = gloss_avg*.9 + gloss*.1;
|
||||
|
||||
printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
|
||||
if(i%1000==0){
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
|
||||
save_weights(net, buff);
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
|
||||
save_weights(anet, buff);
|
||||
}
|
||||
if(i%100==0){
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s.backup", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
sprintf(buff, "%s/%s.backup", backup_directory, abase);
|
||||
save_weights(anet, buff);
|
||||
}
|
||||
}
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_final.weights", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
** Adversarial (GAN-style) training driver.  The generator `net` (from
** cfgfile/weightfile) is trained to produce images that fool the
** discriminator `anet` (from acfgfile/aweightfile), which is in turn trained
** on a mix of real images and the generator's outputs.  `clear` resets the
** seen-image counters of both networks.  Compiled only with GPU support.
*/
void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfile, int clear)
{
#ifdef GPU
    char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
    char *backup_directory = "/home/pjreddie/backup/";
    srand(time(0));

    /* Generator network. */
    char *base = basecfg(cfgfile);
    printf("%s\n", base);
    network net = parse_network_cfg(cfgfile);
    if(weightfile){
        load_weights(&net, weightfile);
    }
    if(clear) *net.seen = 0;

    /* Adversary (discriminator) network. */
    char *abase = basecfg(acfgfile);
    network anet = parse_network_cfg(acfgfile);
    if(aweightfile){
        load_weights(&anet, aweightfile);
    }
    if(clear) *anet.seen = 0;

    /* The first layer with a 3-channel output is taken to be the generator's
       image-producing layer.  NOTE(review): if no such layer exists, imlayer
       stays zeroed and the GPU calls below will misbehave — confirm the cfg
       always contains one. */
    int i, j, k;
    layer imlayer = {0};
    for (i = 0; i < net.n; ++i) {
        if (net.layers[i].out_c == 3) {
            imlayer = net.layers[i];
            break;
        }
    }

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    int imgs = net.batch*net.subdivisions;
    i = *net.seen/imgs;
    data train, buffer;

    list *plist = get_paths(train_images);
    char **paths = (char **)list_to_array(plist);

    /* Data-loader configuration: single dummy class, classification-style
       loading of the training images. */
    load_args args = {0};
    args.w = net.w;
    args.h = net.h;
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;

    args.min = net.min_crop;
    args.max = net.max_crop;
    args.angle = net.angle;
    args.aspect = net.aspect;
    args.exposure = net.exposure;
    args.saturation = net.saturation;
    args.hue = net.hue;
    args.size = net.w;
    args.type = CLASSIFICATION_DATA;
    args.classes = 1;
    char *ls[1] = {"coco"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    /* Generator state: input is pushed manually each sub-batch. */
    network_state gstate = {0};
    gstate.index = 0;
    gstate.net = net;
    int x_size = get_network_input_size(net)*net.batch;
    int y_size = 1*net.batch;
    gstate.input = cuda_make_array(0, x_size);
    gstate.truth = 0;
    gstate.delta = 0;
    gstate.train = 1;
    float *X = calloc(x_size, sizeof(float));
    float *y = calloc(y_size, sizeof(float));

    /* Adversary state: its input/delta/truth are wired per sub-batch to the
       generator's image layer, the adversarial error buffer, and all-ones
       "real" labels respectively. */
    network_state astate = {0};
    astate.index = 0;
    astate.net = anet;
    int ay_size = get_network_output_size(anet)*anet.batch;
    astate.input = 0;
    astate.truth = 0;
    astate.delta = 0;
    astate.train = 1;

    /* NOTE(review): imerror is sized for a single image (imlayer.outputs),
       not imlayer.outputs*batch — this matches the original code; confirm
       the adversary runs with batch 1 relative to this buffer. */
    float *imerror = cuda_make_array(0, imlayer.outputs);
    float *ones_gpu = cuda_make_array(0, ay_size);
    fill_ongpu(ay_size, 1, ones_gpu, 1);

    float aloss_avg = -1;
    float gloss_avg = -1;

    while (get_current_batch(net) < net.max_batches) {
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);

        printf("Loaded: %lf seconds\n", sec(clock()-time));

        /* `generated` will be overwritten with the generator's outputs and
           labeled "fake" (0) for discriminator training. */
        data generated = copy_data(train);
        time=clock();
        float gloss = 0;

        for(j = 0; j < net.subdivisions; ++j){
            get_next_batch(train, net.batch, j*net.batch, X, y);
            cuda_push_array(gstate.input, X, x_size);
            *net.seen += net.batch;
            forward_network_gpu(net, gstate);

            /* Adversarial pass: ask the discriminator how "real" the
               generated image is, backprop against all-ones truth, and
               accumulate its input gradient into the generator's image
               layer delta. */
            fill_ongpu(imlayer.outputs, 0, imerror, 1);
            astate.input = imlayer.output_gpu;
            astate.delta = imerror;
            astate.truth = ones_gpu;
            forward_network_gpu(anet, astate);
            backward_network_gpu(anet, astate);

            scal_ongpu(imlayer.outputs, 1, imerror, 1); /* weighting knob; 1 = no-op */
            axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);

            backward_network_gpu(net, gstate);

            printf("features %f\n", cuda_mag_array(imlayer.delta_gpu, imlayer.outputs));
            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));

            gloss += get_network_cost(net) /(net.subdivisions*net.batch);

            /* NOTE(review): pulls x_size floats — assumes the image layer's
               output is the same size as the network input (autoencoder);
               confirm against the cfg. */
            cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size);
            for(k = 0; k < net.batch; ++k){
                int index = j*net.batch + k;
                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1);
                generated.y.vals[index][0] = 0;
            }
        }
        /* Discard the weight updates the discriminator accumulated while
           serving as the generator's critic. */
        harmless_update_network_gpu(anet);

        /* Train the discriminator on real + generated images. */
        data merge = concat_data(train, generated);
        randomize_data(merge);
        float aloss = train_network(anet, merge);

        update_network_gpu(net);
        update_network_gpu(anet);
        free_data(merge);
        free_data(train);
        free_data(generated);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;
        gloss_avg = gloss_avg*.9 + gloss*.1;

        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(net, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%100==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(net, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    /* FIX: reap the loader thread launched on the last iteration; it was
       previously abandoned, racing on `buffer` and leaking its data. */
    pthread_join(load_thread, 0);
    free_data(buffer);

    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);

    /* FIX: release host and device buffers that were leaked. */
    free(X);
    free(y);
    cuda_free(gstate.input);
    cuda_free(imerror);
    cuda_free(ones_gpu);
    free_ptrs((void**)paths, plist->size);
    free_list(plist);
    free(base);
    free(abase);
#endif
}
|
||||
|
||||
void train_lsd(char *cfgfile, char *weightfile, int clear)
|
||||
{
|
||||
char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
|
||||
char *backup_directory = "/home/pjreddie/backup/";
|
||||
srand(time(0));
|
||||
char *base = basecfg(cfgfile);
|
||||
printf("%s\n", base);
|
||||
float avg_loss = -1;
|
||||
network net = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
load_weights(&net, weightfile);
|
||||
}
|
||||
if(clear) *net.seen = 0;
|
||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
||||
int imgs = net.batch*net.subdivisions;
|
||||
int i = *net.seen/imgs;
|
||||
data train, buffer;
|
||||
|
||||
|
||||
list *plist = get_paths(train_images);
|
||||
//int N = plist->size;
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net.w;
|
||||
args.h = net.h;
|
||||
args.paths = paths;
|
||||
args.n = imgs;
|
||||
args.m = plist->size;
|
||||
args.d = &buffer;
|
||||
|
||||
args.min = net.min_crop;
|
||||
args.max = net.max_crop;
|
||||
args.angle = net.angle;
|
||||
args.aspect = net.aspect;
|
||||
args.exposure = net.exposure;
|
||||
args.saturation = net.saturation;
|
||||
args.hue = net.hue;
|
||||
args.size = net.w;
|
||||
args.type = CLASSIFICATION_DATA;
|
||||
args.classes = 1;
|
||||
char *ls[1] = {"coco"};
|
||||
args.labels = ls;
|
||||
|
||||
pthread_t load_thread = load_data_in_thread(args);
|
||||
clock_t time;
|
||||
//while(i*imgs < N*120){
|
||||
while(get_current_batch(net) < net.max_batches){
|
||||
i += 1;
|
||||
time=clock();
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
load_thread = load_data_in_thread(args);
|
||||
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
|
||||
time=clock();
|
||||
float loss = train_network(net, train);
|
||||
if (avg_loss < 0) avg_loss = loss;
|
||||
avg_loss = avg_loss*.9 + loss*.1;
|
||||
|
||||
printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
|
||||
if(i%1000==0){
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
if(i%100==0){
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s.backup", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
free_data(train);
|
||||
}
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_final.weights", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
|
||||
void test_lsd(char *cfgfile, char *weightfile, char *filename)
|
||||
{
|
||||
network net = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
load_weights(&net, weightfile);
|
||||
}
|
||||
set_batch_network(&net, 1);
|
||||
srand(2222222);
|
||||
|
||||
clock_t time;
|
||||
char buff[256];
|
||||
char *input = buff;
|
||||
int i, imlayer = 0;
|
||||
|
||||
for (i = 0; i < net.n; ++i) {
|
||||
if (net.layers[i].out_c == 3) {
|
||||
imlayer = i;
|
||||
printf("%d\n", i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
while(1){
|
||||
if(filename){
|
||||
strncpy(input, filename, 256);
|
||||
}else{
|
||||
printf("Enter Image Path: ");
|
||||
fflush(stdout);
|
||||
input = fgets(input, 256, stdin);
|
||||
if(!input) return;
|
||||
strtok(input, "\n");
|
||||
}
|
||||
image im = load_image_color(input, 0, 0);
|
||||
image resized = resize_min(im, net.w);
|
||||
image crop = crop_image(resized, (resized.w - net.w)/2, (resized.h - net.h)/2, net.w, net.h);
|
||||
//grayscale_image_3c(crop);
|
||||
|
||||
float *X = crop.data;
|
||||
time=clock();
|
||||
network_predict(net, X);
|
||||
image out = get_network_image_layer(net, imlayer);
|
||||
//yuv_to_rgb(out);
|
||||
constrain_image(out);
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
|
||||
show_image(out, "out");
|
||||
show_image(crop, "crop");
|
||||
save_image(out, "out");
|
||||
#ifdef OPENCV
|
||||
cvWaitKey(0);
|
||||
#endif
|
||||
|
||||
free_image(im);
|
||||
free_image(resized);
|
||||
free_image(crop);
|
||||
if (filename) break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
** Command-line dispatcher for the `lsd` subcommand.
**   argv[2] = mode (train/train2/traincolor/train3/test)
**   argv[3] = cfg, argv[4] = weights (optional)
**   argv[5] = image filename (test) or adversary cfg (train2/traincolor)
**   argv[6] = adversary weights (optional)
*/
void run_lsd(int argc, char **argv)
{
    if(argc < 4){
        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    int clear = find_arg(argc, argv, "-clear");

    char *cfg = argv[3];
    char *weights = (argc > 4) ? argv[4] : 0;
    char *filename = (argc > 5) ? argv[5] : 0;
    /* FIX: argv[5] was read unconditionally; when argc == 4 that indexes
       past the argv[argc] NULL terminator — undefined behavior. */
    char *acfg = (argc > 5) ? argv[5] : 0;
    char *aweights = (argc > 6) ? argv[6] : 0;
    if(0==strcmp(argv[2], "train")) train_lsd(cfg, weights, clear);
    else if(0==strcmp(argv[2], "train2")) train_lsd2(cfg, weights, acfg, aweights, clear);
    else if(0==strcmp(argv[2], "traincolor")) train_colorizer(cfg, weights, acfg, aweights, clear);
    else if(0==strcmp(argv[2], "train3")){
        /* FIX: train3 dereferences argv[3..8]; verify they exist first. */
        if(argc > 8) train_lsd3(argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], clear);
        else fprintf(stderr, "usage: %s %s train3 [cfg1] [weights1] [cfg2] [weights2] [cfg3] [weights3]\n", argv[0], argv[1]);
    }
    else if(0==strcmp(argv[2], "test")) test_lsd(cfg, weights, filename);
    /*
    else if(0==strcmp(argv[2], "valid")) validate_lsd(cfg, weights);
    */
}
|
15
src/matrix.c
15
src/matrix.c
@ -1,5 +1,6 @@
|
||||
#include "matrix.h"
|
||||
#include "utils.h"
|
||||
#include "blas.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@ -73,6 +74,20 @@ void matrix_add_matrix(matrix from, matrix to)
|
||||
}
|
||||
}
|
||||
|
||||
/*
** Deep-copy a matrix: the returned matrix owns fresh row pointers and fresh
** row buffers, so freeing it does not touch `m`.
*/
matrix copy_matrix(matrix m)
{
    int r;
    matrix out = {0};
    out.rows = m.rows;
    out.cols = m.cols;
    out.vals = calloc(out.rows, sizeof(float *));
    for(r = 0; r < out.rows; ++r){
        out.vals[r] = calloc(out.cols, sizeof(float));
        copy_cpu(out.cols, m.vals[r], 1, out.vals[r], 1);
    }
    return out;
}
|
||||
|
||||
matrix make_matrix(int rows, int cols)
|
||||
{
|
||||
int i;
|
||||
|
@ -6,6 +6,7 @@ typedef struct matrix{
|
||||
} matrix;
|
||||
|
||||
matrix make_matrix(int rows, int cols);
|
||||
matrix copy_matrix(matrix m);
|
||||
void free_matrix(matrix m);
|
||||
void print_matrix(matrix m);
|
||||
|
||||
|
@ -27,6 +27,35 @@
|
||||
#include "dropout_layer.h"
|
||||
#include "route_layer.h"
|
||||
#include "shortcut_layer.h"
|
||||
#include "parser.h"
|
||||
#include "data.h"
|
||||
|
||||
/*
** Build a load_args pre-populated from the network: input geometry plus the
** data-augmentation ranges declared in the cfg.  Callers fill in paths,
** labels, counts, and type themselves.
*/
load_args get_base_args(network net)
{
    load_args a = {0};

    /* Input geometry. */
    a.w = net.w;
    a.h = net.h;
    a.size = net.w;

    /* Augmentation parameters from the cfg. */
    a.angle = net.angle;
    a.aspect = net.aspect;
    a.min = net.min_crop;
    a.max = net.max_crop;
    a.hue = net.hue;
    a.saturation = net.saturation;
    a.exposure = net.exposure;

    return a;
}
|
||||
|
||||
network load_network(char *cfg, char *weights, int clear)
|
||||
{
|
||||
network net = parse_network_cfg(cfg);
|
||||
if(weights && weights[0] != 0){
|
||||
load_weights(&net, weights);
|
||||
}
|
||||
if(clear) *net.seen = 0;
|
||||
return net;
|
||||
}
|
||||
|
||||
int get_current_batch(network net)
|
||||
{
|
||||
@ -50,6 +79,7 @@ float get_current_rate(network net)
|
||||
int batch_num = get_current_batch(net);
|
||||
int i;
|
||||
float rate;
|
||||
if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
|
||||
switch (net.policy) {
|
||||
case CONSTANT:
|
||||
return net.learning_rate;
|
||||
@ -66,7 +96,6 @@ float get_current_rate(network net)
|
||||
case EXP:
|
||||
return net.learning_rate * pow(net.gamma, batch_num);
|
||||
case POLY:
|
||||
if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power);
|
||||
return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power);
|
||||
case RANDOM:
|
||||
return net.learning_rate * pow(rand_uniform(0,1), net.power);
|
||||
@ -150,7 +179,7 @@ void forward_network(network net, network_state state)
|
||||
state.index = i;
|
||||
layer l = net.layers[i];
|
||||
if(l.delta){
|
||||
scal_cpu(l.outputs * l.batch, 0, l.delta, 1);
|
||||
fill_cpu(l.outputs * l.batch, 0, l.delta, 1);
|
||||
}
|
||||
l.forward(l, state);
|
||||
state.input = l.output;
|
||||
@ -165,7 +194,7 @@ void update_network(network net)
|
||||
for(i = 0; i < net.n; ++i){
|
||||
layer l = net.layers[i];
|
||||
if(l.update){
|
||||
l.update(l, update_batch, rate, net.momentum, net.decay);
|
||||
l.update(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -218,6 +247,7 @@ void backward_network(network net, network_state state)
|
||||
state.delta = prev.delta;
|
||||
}
|
||||
layer l = net.layers[i];
|
||||
if(l.stopbackward) break;
|
||||
l.backward(l, state);
|
||||
}
|
||||
}
|
||||
@ -414,6 +444,9 @@ detection_layer get_network_detection_layer(network net)
|
||||
image get_network_image_layer(network net, int i)
|
||||
{
|
||||
layer l = net.layers[i];
|
||||
#ifdef GPU
|
||||
cuda_pull_array(l.output_gpu, l.output, l.outputs);
|
||||
#endif
|
||||
if (l.out_w && l.out_h && l.out_c){
|
||||
return float_to_image(l.out_w, l.out_h, l.out_c, l.output);
|
||||
}
|
||||
|
@ -43,6 +43,7 @@ typedef struct network{
|
||||
float eps;
|
||||
|
||||
int inputs;
|
||||
int notruth;
|
||||
int h, w, c;
|
||||
int max_crop;
|
||||
int min_crop;
|
||||
@ -82,6 +83,7 @@ float *get_network_output_gpu(network net);
|
||||
void forward_network_gpu(network net, network_state state);
|
||||
void backward_network_gpu(network net, network_state state);
|
||||
void update_network_gpu(network net);
|
||||
void harmless_update_network_gpu(network net);
|
||||
#endif
|
||||
|
||||
float get_current_rate(network net);
|
||||
@ -121,6 +123,8 @@ int resize_network(network *net, int w, int h);
|
||||
void set_batch_network(network *net, int b);
|
||||
int get_network_input_size(network net);
|
||||
float get_network_cost(network net);
|
||||
network load_network(char *cfg, char *weights, int clear);
|
||||
load_args get_base_args(network net);
|
||||
|
||||
int get_network_nuisance(network net);
|
||||
int get_network_background(network net);
|
||||
|
@ -50,8 +50,10 @@ void forward_network_gpu(network net, network_state state)
|
||||
if(l.delta_gpu){
|
||||
fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
|
||||
}
|
||||
//if(l.c ==3 && i > 5) state.input = *net.input_gpu;
|
||||
l.forward_gpu(l, state);
|
||||
state.input = l.output_gpu;
|
||||
if(l.truth) state.truth = l.output_gpu;
|
||||
}
|
||||
}
|
||||
|
||||
@ -64,6 +66,7 @@ void backward_network_gpu(network net, network_state state)
|
||||
for(i = net.n-1; i >= 0; --i){
|
||||
state.index = i;
|
||||
layer l = net.layers[i];
|
||||
if(l.stopbackward) break;
|
||||
if(i == 0){
|
||||
state.input = original_input;
|
||||
state.delta = original_delta;
|
||||
@ -86,11 +89,18 @@ void update_network_gpu(network net)
|
||||
layer l = net.layers[i];
|
||||
l.t = get_current_batch(net);
|
||||
if(l.update_gpu){
|
||||
l.update_gpu(l, update_batch, rate, net.momentum, net.decay);
|
||||
l.update_gpu(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
** Flush a network's accumulated GPU updates without changing any weights:
** a learning rate of 0 zeroes the gradient step while momentum 1 carries the
** velocity through unchanged.  `net` is passed by value, so the caller's
** real hyper-parameters are untouched.
*/
void harmless_update_network_gpu(network net)
{
    net.momentum = 1;
    net.learning_rate = 0;
    update_network_gpu(net);
}
|
||||
|
||||
void forward_backward_network_gpu(network net, float *x, float *y)
|
||||
{
|
||||
network_state state;
|
||||
@ -101,10 +111,10 @@ void forward_backward_network_gpu(network net, float *x, float *y)
|
||||
if(net.layers[net.n-1].truths) y_size = net.layers[net.n-1].truths*net.batch;
|
||||
if(!*net.input_gpu){
|
||||
*net.input_gpu = cuda_make_array(x, x_size);
|
||||
*net.truth_gpu = cuda_make_array(y, y_size);
|
||||
if(!net.notruth) *net.truth_gpu = cuda_make_array(y, y_size);
|
||||
}else{
|
||||
cuda_push_array(*net.input_gpu, x, x_size);
|
||||
cuda_push_array(*net.truth_gpu, y, y_size);
|
||||
if(!net.notruth) cuda_push_array(*net.truth_gpu, y, y_size);
|
||||
}
|
||||
state.input = *net.input_gpu;
|
||||
state.delta = 0;
|
||||
@ -180,7 +190,7 @@ void update_layer(layer l, network net)
|
||||
float rate = get_current_rate(net);
|
||||
l.t = get_current_batch(net);
|
||||
if(l.update_gpu){
|
||||
l.update_gpu(l, update_batch, rate, net.momentum, net.decay);
|
||||
l.update_gpu(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay);
|
||||
}
|
||||
}
|
||||
|
||||
|
114
src/nightmare.c
114
src/nightmare.c
@ -52,6 +52,7 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa
|
||||
image delta = make_image(im.w, im.h, im.c);
|
||||
|
||||
network_state state = {0};
|
||||
state.net = *net;
|
||||
|
||||
#ifdef GPU
|
||||
state.input = cuda_make_array(im.data, im.w*im.h*im.c);
|
||||
@ -142,6 +143,7 @@ void reconstruct_picture(network net, float *features, image recon, image update
|
||||
image delta = make_image(recon.w, recon.h, recon.c);
|
||||
|
||||
network_state state = {0};
|
||||
state.net = net;
|
||||
#ifdef GPU
|
||||
state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
|
||||
state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
|
||||
@ -178,6 +180,113 @@ void reconstruct_picture(network net, float *features, image recon, image update
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
void run_lsd(int argc, char **argv)
|
||||
{
|
||||
srand(0);
|
||||
if(argc < 3){
|
||||
fprintf(stderr, "usage: %s %s [cfg] [weights] [image] [options! (optional)]\n", argv[0], argv[1]);
|
||||
return;
|
||||
}
|
||||
|
||||
char *cfg = argv[2];
|
||||
char *weights = argv[3];
|
||||
char *input = argv[4];
|
||||
|
||||
int norm = find_int_arg(argc, argv, "-norm", 1);
|
||||
int rounds = find_int_arg(argc, argv, "-rounds", 1);
|
||||
int iters = find_int_arg(argc, argv, "-iters", 10);
|
||||
float rate = find_float_arg(argc, argv, "-rate", .04);
|
||||
float momentum = find_float_arg(argc, argv, "-momentum", .9);
|
||||
float lambda = find_float_arg(argc, argv, "-lambda", .01);
|
||||
char *prefix = find_char_arg(argc, argv, "-prefix", 0);
|
||||
int reconstruct = find_arg(argc, argv, "-reconstruct");
|
||||
int smooth_size = find_int_arg(argc, argv, "-smooth", 1);
|
||||
|
||||
network net = parse_network_cfg(cfg);
|
||||
load_weights(&net, weights);
|
||||
char *cfgbase = basecfg(cfg);
|
||||
char *imbase = basecfg(input);
|
||||
|
||||
set_batch_network(&net, 1);
|
||||
image im = load_image_color(input, 0, 0);
|
||||
|
||||
float *features = 0;
|
||||
image update;
|
||||
if (reconstruct){
|
||||
im = letterbox_image(im, net.w, net.h);
|
||||
|
||||
int zz = 0;
|
||||
network_predict(net, im.data);
|
||||
image out_im = get_network_image(net);
|
||||
image crop = crop_image(out_im, zz, zz, out_im.w-2*zz, out_im.h-2*zz);
|
||||
//flip_image(crop);
|
||||
image f_im = resize_image(crop, out_im.w, out_im.h);
|
||||
free_image(crop);
|
||||
printf("%d features\n", out_im.w*out_im.h*out_im.c);
|
||||
|
||||
|
||||
im = resize_image(im, im.w, im.h);
|
||||
f_im = resize_image(f_im, f_im.w, f_im.h);
|
||||
features = f_im.data;
|
||||
|
||||
int i;
|
||||
for(i = 0; i < 14*14*512; ++i){
|
||||
features[i] += rand_uniform(-.19, .19);
|
||||
}
|
||||
|
||||
free_image(im);
|
||||
im = make_random_image(im.w, im.h, im.c);
|
||||
update = make_image(im.w, im.h, im.c);
|
||||
|
||||
}
|
||||
|
||||
int e;
|
||||
int n;
|
||||
for(e = 0; e < rounds; ++e){
|
||||
fprintf(stderr, "Iteration: ");
|
||||
fflush(stderr);
|
||||
for(n = 0; n < iters; ++n){
|
||||
fprintf(stderr, "%d, ", n);
|
||||
fflush(stderr);
|
||||
if(reconstruct){
|
||||
reconstruct_picture(net, features, im, update, rate, momentum, lambda, smooth_size, 1);
|
||||
//if ((n+1)%30 == 0) rate *= .5;
|
||||
show_image(im, "reconstruction");
|
||||
#ifdef OPENCV
|
||||
cvWaitKey(10);
|
||||
#endif
|
||||
}else{
|
||||
int layer = max_layer + rand()%range - range/2;
|
||||
int octave = rand()%octaves;
|
||||
optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "done\n");
|
||||
char buff[256];
|
||||
if (prefix){
|
||||
sprintf(buff, "%s/%s_%s_%d_%06d",prefix, imbase, cfgbase, max_layer, e);
|
||||
}else{
|
||||
sprintf(buff, "%s_%s_%d_%06d",imbase, cfgbase, max_layer, e);
|
||||
}
|
||||
printf("%d %s\n", e, buff);
|
||||
save_image(im, buff);
|
||||
//show_image(im, buff);
|
||||
//cvWaitKey(0);
|
||||
|
||||
if(rotate){
|
||||
image rot = rotate_image(im, rotate);
|
||||
free_image(im);
|
||||
im = rot;
|
||||
}
|
||||
image crop = crop_image(im, im.w * (1. - zoom)/2., im.h * (1.-zoom)/2., im.w*zoom, im.h*zoom);
|
||||
image resized = resize_image(crop, im.w, im.h);
|
||||
free_image(im);
|
||||
free_image(crop);
|
||||
im = resized;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
void run_nightmare(int argc, char **argv)
|
||||
{
|
||||
@ -224,6 +333,7 @@ void run_nightmare(int argc, char **argv)
|
||||
free_image(im);
|
||||
im = resized;
|
||||
}
|
||||
im = letterbox_image(im, net.w, net.h);
|
||||
|
||||
float *features = 0;
|
||||
image update;
|
||||
@ -246,13 +356,11 @@ void run_nightmare(int argc, char **argv)
|
||||
|
||||
int i;
|
||||
for(i = 0; i < 14*14*512; ++i){
|
||||
features[i] += rand_uniform(-.19, .19);
|
||||
//features[i] += rand_uniform(-.19, .19);
|
||||
}
|
||||
|
||||
free_image(im);
|
||||
im = make_random_image(im.w, im.h, im.c);
|
||||
update = make_image(im.w, im.h, im.c);
|
||||
|
||||
}
|
||||
|
||||
int e;
|
||||
|
51
src/parser.c
51
src/parser.c
@ -9,6 +9,7 @@
|
||||
#include "batchnorm_layer.h"
|
||||
#include "blas.h"
|
||||
#include "connected_layer.h"
|
||||
#include "deconvolutional_layer.h"
|
||||
#include "convolutional_layer.h"
|
||||
#include "cost_layer.h"
|
||||
#include "crnn_layer.h"
|
||||
@ -48,6 +49,8 @@ LAYER_TYPE string_to_layer_type(char * type)
|
||||
if (strcmp(type, "[local]")==0) return LOCAL;
|
||||
if (strcmp(type, "[conv]")==0
|
||||
|| strcmp(type, "[convolutional]")==0) return CONVOLUTIONAL;
|
||||
if (strcmp(type, "[deconv]")==0
|
||||
|| strcmp(type, "[deconvolutional]")==0) return DECONVOLUTIONAL;
|
||||
if (strcmp(type, "[activation]")==0) return ACTIVE;
|
||||
if (strcmp(type, "[net]")==0
|
||||
|| strcmp(type, "[network]")==0) return NETWORK;
|
||||
@ -135,6 +138,29 @@ local_layer parse_local(list *options, size_params params)
|
||||
return layer;
|
||||
}
|
||||
|
||||
/*
** Parse a [deconvolutional] cfg section into a layer.  Requires the
** preceding layer to have produced a spatial (h x w x c) output.
*/
layer parse_deconvolutional(list *options, size_params params)
{
    int filters = option_find_int(options, "filters",1);
    int ksize   = option_find_int(options, "size",1);
    int stride  = option_find_int(options, "stride",1);

    ACTIVATION activation = get_activation(option_find_str(options, "activation", "logistic"));

    int h = params.h;
    int w = params.w;
    int c = params.c;
    int batch = params.batch;
    if(!(h && w && c)) error("Layer before deconvolutional layer must output image.");

    int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);

    return make_deconvolutional_layer(batch, h, w, c, filters, ksize, stride, activation, batch_normalize);
}
|
||||
|
||||
|
||||
convolutional_layer parse_convolutional(list *options, size_params params)
|
||||
{
|
||||
int n = option_find_int(options, "filters",1);
|
||||
@ -312,6 +338,7 @@ cost_layer parse_cost(list *options, size_params params)
|
||||
float scale = option_find_float_quiet(options, "scale",1);
|
||||
cost_layer layer = make_cost_layer(params.batch, params.inputs, type, scale);
|
||||
layer.ratio = option_find_float_quiet(options, "ratio",0);
|
||||
layer.thresh = option_find_float_quiet(options, "thresh",0);
|
||||
return layer;
|
||||
}
|
||||
|
||||
@ -343,6 +370,8 @@ layer parse_reorg(list *options, size_params params)
|
||||
{
|
||||
int stride = option_find_int(options, "stride",1);
|
||||
int reverse = option_find_int_quiet(options, "reverse",0);
|
||||
int flatten = option_find_int_quiet(options, "flatten",0);
|
||||
int extra = option_find_int_quiet(options, "extra",0);
|
||||
|
||||
int batch,h,w,c;
|
||||
h = params.h;
|
||||
@ -351,7 +380,7 @@ layer parse_reorg(list *options, size_params params)
|
||||
batch=params.batch;
|
||||
if(!(h && w && c)) error("Layer before reorg layer must output image.");
|
||||
|
||||
layer layer = make_reorg_layer(batch,w,h,c,stride,reverse);
|
||||
layer layer = make_reorg_layer(batch,w,h,c,stride,reverse, flatten, extra);
|
||||
return layer;
|
||||
}
|
||||
|
||||
@ -508,6 +537,7 @@ void parse_net_options(list *options, network *net)
|
||||
net->decay = option_find_float(options, "decay", .0001);
|
||||
int subdivs = option_find_int(options, "subdivisions",1);
|
||||
net->time_steps = option_find_int_quiet(options, "time_steps",1);
|
||||
net->notruth = option_find_int_quiet(options, "notruth",0);
|
||||
net->batch /= subdivs;
|
||||
net->batch *= net->time_steps;
|
||||
net->subdivisions = subdivs;
|
||||
@ -537,6 +567,7 @@ void parse_net_options(list *options, network *net)
|
||||
char *policy_s = option_find_str(options, "policy", "constant");
|
||||
net->policy = get_policy(policy_s);
|
||||
net->burn_in = option_find_int_quiet(options, "burn_in", 0);
|
||||
net->power = option_find_float_quiet(options, "power", 4);
|
||||
if(net->policy == STEP){
|
||||
net->step = option_find_int(options, "step", 1);
|
||||
net->scale = option_find_float(options, "scale", 1);
|
||||
@ -570,7 +601,6 @@ void parse_net_options(list *options, network *net)
|
||||
net->gamma = option_find_float(options, "gamma", 1);
|
||||
net->step = option_find_int(options, "step", 1);
|
||||
} else if (net->policy == POLY || net->policy == RANDOM){
|
||||
net->power = option_find_float(options, "power", 1);
|
||||
}
|
||||
net->max_batches = option_find_int(options, "max_batches", 0);
|
||||
}
|
||||
@ -617,6 +647,8 @@ network parse_network_cfg(char *filename)
|
||||
LAYER_TYPE lt = string_to_layer_type(s->type);
|
||||
if(lt == CONVOLUTIONAL){
|
||||
l = parse_convolutional(options, params);
|
||||
}else if(lt == DECONVOLUTIONAL){
|
||||
l = parse_deconvolutional(options, params);
|
||||
}else if(lt == LOCAL){
|
||||
l = parse_local(options, params);
|
||||
}else if(lt == ACTIVE){
|
||||
@ -665,8 +697,13 @@ network parse_network_cfg(char *filename)
|
||||
}else{
|
||||
fprintf(stderr, "Type not recognized: %s\n", s->type);
|
||||
}
|
||||
l.truth = option_find_int_quiet(options, "truth", 0);
|
||||
l.onlyforward = option_find_int_quiet(options, "onlyforward", 0);
|
||||
l.stopbackward = option_find_int_quiet(options, "stopbackward", 0);
|
||||
l.dontload = option_find_int_quiet(options, "dontload", 0);
|
||||
l.dontloadscales = option_find_int_quiet(options, "dontloadscales", 0);
|
||||
l.learning_rate_scale = option_find_float_quiet(options, "learning_rate", 1);
|
||||
l.smooth = option_find_float_quiet(options, "smooth", 0);
|
||||
option_unused(options);
|
||||
net.layers[count] = l;
|
||||
if (l.workspace_size > workspace_size) workspace_size = l.workspace_size;
|
||||
@ -840,7 +877,7 @@ void save_weights_upto(network net, char *filename, int cutoff)
|
||||
int i;
|
||||
for(i = 0; i < net.n && i < cutoff; ++i){
|
||||
layer l = net.layers[i];
|
||||
if(l.type == CONVOLUTIONAL){
|
||||
if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
|
||||
save_convolutional_weights(l, fp);
|
||||
} if(l.type == CONNECTED){
|
||||
save_connected_weights(l, fp);
|
||||
@ -1005,7 +1042,7 @@ void load_convolutional_weights(layer l, FILE *fp)
|
||||
}
|
||||
|
||||
|
||||
void load_weights_upto(network *net, char *filename, int cutoff)
|
||||
void load_weights_upto(network *net, char *filename, int start, int cutoff)
|
||||
{
|
||||
#ifdef GPU
|
||||
if(net->gpu_index >= 0){
|
||||
@ -1027,10 +1064,10 @@ void load_weights_upto(network *net, char *filename, int cutoff)
|
||||
int transpose = (major > 1000) || (minor > 1000);
|
||||
|
||||
int i;
|
||||
for(i = 0; i < net->n && i < cutoff; ++i){
|
||||
for(i = start; i < net->n && i < cutoff; ++i){
|
||||
layer l = net->layers[i];
|
||||
if (l.dontload) continue;
|
||||
if(l.type == CONVOLUTIONAL){
|
||||
if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){
|
||||
load_convolutional_weights(l, fp);
|
||||
}
|
||||
if(l.type == CONNECTED){
|
||||
@ -1075,6 +1112,6 @@ void load_weights_upto(network *net, char *filename, int cutoff)
|
||||
|
||||
void load_weights(network *net, char *filename)
|
||||
{
|
||||
load_weights_upto(net, filename, net->n);
|
||||
load_weights_upto(net, filename, 0, net->n);
|
||||
}
|
||||
|
||||
|
@ -8,6 +8,6 @@ void save_weights(network net, char *filename);
|
||||
void save_weights_upto(network net, char *filename, int cutoff);
|
||||
void save_weights_double(network net, char *filename);
|
||||
void load_weights(network *net, char *filename);
|
||||
void load_weights_upto(network *net, char *filename, int cutoff);
|
||||
void load_weights_upto(network *net, char *filename, int start, int cutoff);
|
||||
|
||||
#endif
|
||||
|
@ -18,6 +18,10 @@ layer make_region_layer(int batch, int w, int h, int n, int classes, int coords)
|
||||
l.batch = batch;
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = n*(classes + coords + 1);
|
||||
l.out_w = l.w;
|
||||
l.out_h = l.h;
|
||||
l.out_c = l.c;
|
||||
l.classes = classes;
|
||||
l.coords = coords;
|
||||
l.cost = calloc(1, sizeof(float));
|
||||
@ -68,19 +72,19 @@ void resize_region_layer(layer *l, int w, int h)
|
||||
#endif
|
||||
}
|
||||
|
||||
box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h)
|
||||
box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h, int stride)
|
||||
{
|
||||
box b;
|
||||
b.x = (i + logistic_activate(x[index + 0])) / w;
|
||||
b.y = (j + logistic_activate(x[index + 1])) / h;
|
||||
b.w = exp(x[index + 2]) * biases[2*n] / w;
|
||||
b.h = exp(x[index + 3]) * biases[2*n+1] / h;
|
||||
b.x = (i + x[index + 0*stride]) / w;
|
||||
b.y = (j + x[index + 1*stride]) / h;
|
||||
b.w = exp(x[index + 2*stride]) * biases[2*n] / w;
|
||||
b.h = exp(x[index + 3*stride]) * biases[2*n+1] / h;
|
||||
return b;
|
||||
}
|
||||
|
||||
float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale)
|
||||
float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale, int stride)
|
||||
{
|
||||
box pred = get_region_box(x, biases, n, index, i, j, w, h);
|
||||
box pred = get_region_box(x, biases, n, index, i, j, w, h, stride);
|
||||
float iou = box_iou(pred, truth);
|
||||
|
||||
float tx = (truth.x*w - i);
|
||||
@ -88,34 +92,34 @@ float delta_region_box(box truth, float *x, float *biases, int n, int index, int
|
||||
float tw = log(truth.w*w / biases[2*n]);
|
||||
float th = log(truth.h*h / biases[2*n + 1]);
|
||||
|
||||
delta[index + 0] = scale * (tx - logistic_activate(x[index + 0])) * logistic_gradient(logistic_activate(x[index + 0]));
|
||||
delta[index + 1] = scale * (ty - logistic_activate(x[index + 1])) * logistic_gradient(logistic_activate(x[index + 1]));
|
||||
delta[index + 2] = scale * (tw - x[index + 2]);
|
||||
delta[index + 3] = scale * (th - x[index + 3]);
|
||||
delta[index + 0*stride] = scale * (tx - x[index + 0*stride]);
|
||||
delta[index + 1*stride] = scale * (ty - x[index + 1*stride]);
|
||||
delta[index + 2*stride] = scale * (tw - x[index + 2*stride]);
|
||||
delta[index + 3*stride] = scale * (th - x[index + 3*stride]);
|
||||
return iou;
|
||||
}
|
||||
|
||||
void delta_region_class(float *output, float *delta, int index, int class, int classes, tree *hier, float scale, float *avg_cat)
|
||||
void delta_region_class(float *output, float *delta, int index, int class, int classes, tree *hier, float scale, int stride, float *avg_cat)
|
||||
{
|
||||
int i, n;
|
||||
if(hier){
|
||||
float pred = 1;
|
||||
while(class >= 0){
|
||||
pred *= output[index + class];
|
||||
pred *= output[index + stride*class];
|
||||
int g = hier->group[class];
|
||||
int offset = hier->group_offset[g];
|
||||
for(i = 0; i < hier->group_size[g]; ++i){
|
||||
delta[index + offset + i] = scale * (0 - output[index + offset + i]);
|
||||
delta[index + stride*(offset + i)] = scale * (0 - output[index + stride*(offset + i)]);
|
||||
}
|
||||
delta[index + class] = scale * (1 - output[index + class]);
|
||||
delta[index + stride*class] = scale * (1 - output[index + stride*class]);
|
||||
|
||||
class = hier->parent[class];
|
||||
}
|
||||
*avg_cat += pred;
|
||||
} else {
|
||||
for(n = 0; n < classes; ++n){
|
||||
delta[index + n] = scale * (((n == class)?1 : 0) - output[index + n]);
|
||||
if(n == class) *avg_cat += output[index + n];
|
||||
delta[index + stride*n] = scale * (((n == class)?1 : 0) - output[index + stride*n]);
|
||||
if(n == class) *avg_cat += output[index + stride*n];
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -130,42 +134,35 @@ float tisnan(float x)
|
||||
return (x != x);
|
||||
}
|
||||
|
||||
int entry_index(layer l, int batch, int location, int entry)
|
||||
{
|
||||
int n = location / (l.w*l.h);
|
||||
int loc = location % (l.w*l.h);
|
||||
return batch*l.outputs + n*l.w*l.h*(l.coords+l.classes+1) + entry*l.w*l.h + loc;
|
||||
}
|
||||
|
||||
void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output);
|
||||
void forward_region_layer(const layer l, network_state state)
|
||||
{
|
||||
int i,j,b,t,n;
|
||||
int size = l.coords + l.classes + 1;
|
||||
memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
|
||||
#ifndef GPU
|
||||
flatten(l.output, l.w*l.h, size*l.n, l.batch, 1);
|
||||
#endif
|
||||
for (b = 0; b < l.batch; ++b){
|
||||
for(i = 0; i < l.h*l.w*l.n; ++i){
|
||||
int index = size*i + b*l.outputs;
|
||||
l.output[index + 4] = logistic_activate(l.output[index + 4]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifndef GPU
|
||||
if (l.softmax_tree){
|
||||
for (b = 0; b < l.batch; ++b){
|
||||
for(i = 0; i < l.h*l.w*l.n; ++i){
|
||||
int index = size*i + b*l.outputs;
|
||||
softmax_tree(l.output + index + 5, 1, 0, 1, l.softmax_tree, l.output + index + 5);
|
||||
}
|
||||
int i;
|
||||
int count = 5;
|
||||
for (i = 0; i < l.softmax_tree->groups; ++i) {
|
||||
int group_size = l.softmax_tree->group_size[i];
|
||||
softmax_cpu(state.input + count, group_size, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + count);
|
||||
count += group_size;
|
||||
}
|
||||
} else if (l.softmax){
|
||||
for (b = 0; b < l.batch; ++b){
|
||||
for(i = 0; i < l.h*l.w*l.n; ++i){
|
||||
int index = size*i + b*l.outputs;
|
||||
softmax(l.output + index + 5, l.classes, 1, l.output + index + 5);
|
||||
}
|
||||
}
|
||||
softmax_cpu(state.input + 5, l.classes, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + 5);
|
||||
}
|
||||
#endif
|
||||
if(!state.train) return;
|
||||
|
||||
memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
|
||||
if(!state.train) return;
|
||||
float avg_iou = 0;
|
||||
float recall = 0;
|
||||
float avg_cat = 0;
|
||||
@ -178,26 +175,28 @@ void forward_region_layer(const layer l, network_state state)
|
||||
if(l.softmax_tree){
|
||||
int onlyclass = 0;
|
||||
for(t = 0; t < 30; ++t){
|
||||
box truth = float_to_box(state.truth + t*5 + b*l.truths);
|
||||
box truth = float_to_box(state.truth + t*5 + b*l.truths, 1);
|
||||
if(!truth.x) break;
|
||||
int class = state.truth[t*5 + b*l.truths + 4];
|
||||
float maxp = 0;
|
||||
int maxi = 0;
|
||||
if(truth.x > 100000 && truth.y > 100000){
|
||||
for(n = 0; n < l.n*l.w*l.h; ++n){
|
||||
int index = size*n + b*l.outputs + 5;
|
||||
float scale = l.output[index-1];
|
||||
l.delta[index - 1] = l.noobject_scale * ((0 - l.output[index - 1]) * logistic_gradient(l.output[index - 1]));
|
||||
float p = scale*get_hierarchy_probability(l.output + index, l.softmax_tree, class);
|
||||
int class_index = entry_index(l, b, n, 5);
|
||||
int obj_index = entry_index(l, b, n, 4);
|
||||
float scale = l.output[obj_index];
|
||||
l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]);
|
||||
float p = scale*get_hierarchy_probability(l.output + class_index, l.softmax_tree, class, l.w*l.h);
|
||||
if(p > maxp){
|
||||
maxp = p;
|
||||
maxi = n;
|
||||
}
|
||||
}
|
||||
int index = size*maxi + b*l.outputs + 5;
|
||||
delta_region_class(l.output, l.delta, index, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat);
|
||||
if(l.output[index - 1] < .3) l.delta[index - 1] = l.object_scale * ((.3 - l.output[index - 1]) * logistic_gradient(l.output[index - 1]));
|
||||
else l.delta[index - 1] = 0;
|
||||
int class_index = entry_index(l, b, maxi, 5);
|
||||
int obj_index = entry_index(l, b, maxi, 4);
|
||||
delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat);
|
||||
if(l.output[obj_index] < .3) l.delta[obj_index] = l.object_scale * (.3 - l.output[obj_index]);
|
||||
else l.delta[obj_index] = 0;
|
||||
++class_count;
|
||||
onlyclass = 1;
|
||||
break;
|
||||
@ -208,21 +207,22 @@ void forward_region_layer(const layer l, network_state state)
|
||||
for (j = 0; j < l.h; ++j) {
|
||||
for (i = 0; i < l.w; ++i) {
|
||||
for (n = 0; n < l.n; ++n) {
|
||||
int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
|
||||
box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
|
||||
int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
|
||||
box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h);
|
||||
float best_iou = 0;
|
||||
for(t = 0; t < 30; ++t){
|
||||
box truth = float_to_box(state.truth + t*5 + b*l.truths);
|
||||
box truth = float_to_box(state.truth + t*5 + b*l.truths, 1);
|
||||
if(!truth.x) break;
|
||||
float iou = box_iou(pred, truth);
|
||||
if (iou > best_iou) {
|
||||
best_iou = iou;
|
||||
}
|
||||
}
|
||||
avg_anyobj += l.output[index + 4];
|
||||
l.delta[index + 4] = l.noobject_scale * ((0 - l.output[index + 4]) * logistic_gradient(l.output[index + 4]));
|
||||
int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
|
||||
avg_anyobj += l.output[obj_index];
|
||||
l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]);
|
||||
if (best_iou > l.thresh) {
|
||||
l.delta[index + 4] = 0;
|
||||
l.delta[obj_index] = 0;
|
||||
}
|
||||
|
||||
if(*(state.net.seen) < 12800){
|
||||
@ -231,17 +231,16 @@ void forward_region_layer(const layer l, network_state state)
|
||||
truth.y = (j + .5)/l.h;
|
||||
truth.w = l.biases[2*n]/l.w;
|
||||
truth.h = l.biases[2*n+1]/l.h;
|
||||
delta_region_box(truth, l.output, l.biases, n, index, i, j, l.w, l.h, l.delta, .01);
|
||||
delta_region_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, l.delta, .01, l.w*l.h);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for(t = 0; t < 30; ++t){
|
||||
box truth = float_to_box(state.truth + t*5 + b*l.truths);
|
||||
box truth = float_to_box(state.truth + t*5 + b*l.truths, 1);
|
||||
|
||||
if(!truth.x) break;
|
||||
float best_iou = 0;
|
||||
int best_index = 0;
|
||||
int best_n = 0;
|
||||
i = (truth.x * l.w);
|
||||
j = (truth.y * l.h);
|
||||
@ -251,8 +250,8 @@ void forward_region_layer(const layer l, network_state state)
|
||||
truth_shift.y = 0;
|
||||
//printf("index %d %d\n",i, j);
|
||||
for(n = 0; n < l.n; ++n){
|
||||
int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
|
||||
box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
|
||||
int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
|
||||
box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h);
|
||||
if(l.bias_match){
|
||||
pred.w = l.biases[2*n]/l.w;
|
||||
pred.h = l.biases[2*n+1]/l.h;
|
||||
@ -262,80 +261,118 @@ void forward_region_layer(const layer l, network_state state)
|
||||
pred.y = 0;
|
||||
float iou = box_iou(pred, truth_shift);
|
||||
if (iou > best_iou){
|
||||
best_index = index;
|
||||
best_iou = iou;
|
||||
best_n = n;
|
||||
}
|
||||
}
|
||||
//printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h);
|
||||
|
||||
float iou = delta_region_box(truth, l.output, l.biases, best_n, best_index, i, j, l.w, l.h, l.delta, l.coord_scale);
|
||||
int box_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 0);
|
||||
float iou = delta_region_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, l.delta, l.coord_scale * (2 - truth.w*truth.h), l.w*l.h);
|
||||
if(iou > .5) recall += 1;
|
||||
avg_iou += iou;
|
||||
|
||||
//l.delta[best_index + 4] = iou - l.output[best_index + 4];
|
||||
avg_obj += l.output[best_index + 4];
|
||||
l.delta[best_index + 4] = l.object_scale * (1 - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]);
|
||||
int obj_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 4);
|
||||
avg_obj += l.output[obj_index];
|
||||
l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]);
|
||||
if (l.rescore) {
|
||||
l.delta[best_index + 4] = l.object_scale * (iou - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]);
|
||||
l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]);
|
||||
}
|
||||
|
||||
|
||||
int class = state.truth[t*5 + b*l.truths + 4];
|
||||
if (l.map) class = l.map[class];
|
||||
delta_region_class(l.output, l.delta, best_index + 5, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat);
|
||||
int class_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 5);
|
||||
delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat);
|
||||
++count;
|
||||
++class_count;
|
||||
}
|
||||
}
|
||||
//printf("\n");
|
||||
#ifndef GPU
|
||||
flatten(l.delta, l.w*l.h, size*l.n, l.batch, 0);
|
||||
#endif
|
||||
*(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
|
||||
printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f, count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count);
|
||||
}
|
||||
|
||||
void backward_region_layer(const layer l, network_state state)
|
||||
{
|
||||
axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
|
||||
/*
|
||||
int b;
|
||||
int size = l.coords + l.classes + 1;
|
||||
for (b = 0; b < l.batch*l.n; ++b){
|
||||
int index = (b*size + 4)*l.w*l.h;
|
||||
gradient_array(l.output + index, l.w*l.h, LOGISTIC, l.delta + index);
|
||||
}
|
||||
axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
|
||||
*/
|
||||
}
|
||||
|
||||
void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh)
|
||||
{
|
||||
int i,j,n;
|
||||
int i,j,n,z;
|
||||
float *predictions = l.output;
|
||||
if (l.batch == 2) {
|
||||
float *flip = l.output + l.outputs;
|
||||
for (j = 0; j < l.h; ++j) {
|
||||
for (i = 0; i < l.w/2; ++i) {
|
||||
for (n = 0; n < l.n; ++n) {
|
||||
for(z = 0; z < l.classes + 5; ++z){
|
||||
int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i;
|
||||
int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1);
|
||||
float swap = flip[i1];
|
||||
flip[i1] = flip[i2];
|
||||
flip[i2] = swap;
|
||||
if(z == 0){
|
||||
flip[i1] = -flip[i1];
|
||||
flip[i2] = -flip[i2];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for(i = 0; i < l.outputs; ++i){
|
||||
l.output[i] = (l.output[i] + flip[i])/2.;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < l.w*l.h; ++i){
|
||||
int row = i / l.w;
|
||||
int col = i % l.w;
|
||||
for(n = 0; n < l.n; ++n){
|
||||
int index = i*l.n + n;
|
||||
int p_index = index * (l.classes + 5) + 4;
|
||||
float scale = predictions[p_index];
|
||||
int box_index = index * (l.classes + 5);
|
||||
boxes[index] = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h);
|
||||
int index = n*l.w*l.h + i;
|
||||
int obj_index = entry_index(l, 0, n*l.w*l.h + i, 4);
|
||||
int box_index = entry_index(l, 0, n*l.w*l.h + i, 0);
|
||||
float scale = predictions[obj_index];
|
||||
boxes[index] = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h, l.w*l.h);
|
||||
if(1){
|
||||
int max = w > h ? w : h;
|
||||
boxes[index].x = (boxes[index].x - (max - w)/2./max) / ((float)w/max);
|
||||
boxes[index].y = (boxes[index].y - (max - h)/2./max) / ((float)h/max);
|
||||
boxes[index].w *= (float)max/w;
|
||||
boxes[index].h *= (float)max/h;
|
||||
}
|
||||
boxes[index].x *= w;
|
||||
boxes[index].y *= h;
|
||||
boxes[index].w *= w;
|
||||
boxes[index].h *= h;
|
||||
|
||||
int class_index = index * (l.classes + 5) + 5;
|
||||
int class_index = entry_index(l, 0, n*l.w*l.h + i, 5);
|
||||
if(l.softmax_tree){
|
||||
|
||||
hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0);
|
||||
hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0, l.w*l.h);
|
||||
if(map){
|
||||
for(j = 0; j < 200; ++j){
|
||||
float prob = scale*predictions[class_index+map[j]];
|
||||
int class_index = entry_index(l, 0, n*l.w*l.h + i, 5 + map[j]);
|
||||
float prob = scale*predictions[class_index];
|
||||
probs[index][j] = (prob > thresh) ? prob : 0;
|
||||
}
|
||||
} else {
|
||||
int j = hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh);
|
||||
int j = hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh, l.w*l.h);
|
||||
probs[index][j] = (scale > thresh) ? scale : 0;
|
||||
probs[index][l.classes] = scale;
|
||||
}
|
||||
} else {
|
||||
for(j = 0; j < l.classes; ++j){
|
||||
float prob = scale*predictions[class_index+j];
|
||||
int class_index = entry_index(l, 0, n*l.w*l.h + i, 5 + j);
|
||||
float prob = scale*predictions[class_index];
|
||||
probs[index][j] = (prob > thresh) ? prob : 0;
|
||||
}
|
||||
}
|
||||
@ -350,23 +387,33 @@ void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *b
|
||||
|
||||
void forward_region_layer_gpu(const layer l, network_state state)
|
||||
{
|
||||
/*
|
||||
if(!state.train){
|
||||
copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
|
||||
return;
|
||||
}
|
||||
*/
|
||||
flatten_ongpu(state.input, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 1, l.output_gpu);
|
||||
if(l.softmax_tree){
|
||||
copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1);
|
||||
int b, n;
|
||||
for (b = 0; b < l.batch; ++b){
|
||||
for(n = 0; n < l.n; ++n){
|
||||
int index = entry_index(l, b, n*l.w*l.h, 0);
|
||||
activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC);
|
||||
index = entry_index(l, b, n*l.w*l.h, 4);
|
||||
activate_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC);
|
||||
}
|
||||
}
|
||||
if (l.softmax_tree){
|
||||
int i;
|
||||
int count = 5;
|
||||
for (i = 0; i < l.softmax_tree->groups; ++i) {
|
||||
int group_size = l.softmax_tree->group_size[i];
|
||||
softmax_gpu(l.output_gpu+count, group_size, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + count);
|
||||
int index = entry_index(l, 0, 0, count);
|
||||
softmax_gpu(state.input + index, group_size, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu + index);
|
||||
count += group_size;
|
||||
}
|
||||
}else if (l.softmax){
|
||||
softmax_gpu(l.output_gpu+5, l.classes, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + 5);
|
||||
} else if (l.softmax) {
|
||||
int index = entry_index(l, 0, 0, 5);
|
||||
//printf("%d\n", index);
|
||||
softmax_gpu(state.input + index, l.classes, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu + index);
|
||||
}
|
||||
if(!state.train || l.onlyforward){
|
||||
cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
|
||||
return;
|
||||
}
|
||||
|
||||
float *in_cpu = calloc(l.batch*l.inputs, sizeof(float));
|
||||
@ -382,16 +429,25 @@ void forward_region_layer_gpu(const layer l, network_state state)
|
||||
cpu_state.truth = truth_cpu;
|
||||
cpu_state.input = in_cpu;
|
||||
forward_region_layer(l, cpu_state);
|
||||
//cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
|
||||
cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs);
|
||||
free(cpu_state.input);
|
||||
if(!state.train) return;
|
||||
cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
|
||||
if(cpu_state.truth) free(cpu_state.truth);
|
||||
}
|
||||
|
||||
void backward_region_layer_gpu(layer l, network_state state)
|
||||
void backward_region_layer_gpu(const layer l, network_state state)
|
||||
{
|
||||
flatten_ongpu(l.delta_gpu, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 0, state.delta);
|
||||
int b, n;
|
||||
for (b = 0; b < l.batch; ++b){
|
||||
for(n = 0; n < l.n; ++n){
|
||||
int index = entry_index(l, b, n*l.w*l.h, 0);
|
||||
gradient_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index);
|
||||
index = entry_index(l, b, n*l.w*l.h, 4);
|
||||
gradient_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index);
|
||||
}
|
||||
}
|
||||
axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
261
src/regressor.c
Normal file
261
src/regressor.c
Normal file
@ -0,0 +1,261 @@
|
||||
#include "network.h"
|
||||
#include "utils.h"
|
||||
#include "parser.h"
|
||||
#include "option_list.h"
|
||||
#include "blas.h"
|
||||
#include "assert.h"
|
||||
#include "cuda.h"
|
||||
#include <sys/time.h>
|
||||
|
||||
#ifdef OPENCV
|
||||
#include "opencv2/highgui/highgui_c.h"
|
||||
image get_image_from_stream(CvCapture *cap);
|
||||
#endif
|
||||
|
||||
void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear)
|
||||
{
|
||||
int i;
|
||||
|
||||
float avg_loss = -1;
|
||||
char *base = basecfg(cfgfile);
|
||||
printf("%s\n", base);
|
||||
printf("%d\n", ngpus);
|
||||
network *nets = calloc(ngpus, sizeof(network));
|
||||
|
||||
srand(time(0));
|
||||
int seed = rand();
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
srand(seed);
|
||||
#ifdef GPU
|
||||
cuda_set_device(gpus[i]);
|
||||
#endif
|
||||
nets[i] = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
load_weights(&nets[i], weightfile);
|
||||
}
|
||||
if(clear) *nets[i].seen = 0;
|
||||
nets[i].learning_rate *= ngpus;
|
||||
}
|
||||
srand(time(0));
|
||||
network net = nets[0];
|
||||
|
||||
int imgs = net.batch * net.subdivisions * ngpus;
|
||||
|
||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
||||
list *options = read_data_cfg(datacfg);
|
||||
|
||||
char *backup_directory = option_find_str(options, "backup", "/backup/");
|
||||
char *train_list = option_find_str(options, "train", "data/train.list");
|
||||
|
||||
list *plist = get_paths(train_list);
|
||||
char **paths = (char **)list_to_array(plist);
|
||||
printf("%d\n", plist->size);
|
||||
int N = plist->size;
|
||||
clock_t time;
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net.w;
|
||||
args.h = net.h;
|
||||
args.threads = 32;
|
||||
|
||||
args.min = net.min_crop;
|
||||
args.max = net.max_crop;
|
||||
args.angle = net.angle;
|
||||
args.aspect = net.aspect;
|
||||
args.exposure = net.exposure;
|
||||
args.saturation = net.saturation;
|
||||
args.hue = net.hue;
|
||||
args.size = net.w;
|
||||
|
||||
args.paths = paths;
|
||||
args.n = imgs;
|
||||
args.m = N;
|
||||
args.type = REGRESSION_DATA;
|
||||
|
||||
data train;
|
||||
data buffer;
|
||||
pthread_t load_thread;
|
||||
args.d = &buffer;
|
||||
load_thread = load_data(args);
|
||||
|
||||
int epoch = (*net.seen)/N;
|
||||
while(get_current_batch(net) < net.max_batches || net.max_batches == 0){
|
||||
time=clock();
|
||||
|
||||
pthread_join(load_thread, 0);
|
||||
train = buffer;
|
||||
load_thread = load_data(args);
|
||||
|
||||
printf("Loaded: %lf seconds\n", sec(clock()-time));
|
||||
time=clock();
|
||||
|
||||
float loss = 0;
|
||||
#ifdef GPU
|
||||
if(ngpus == 1){
|
||||
loss = train_network(net, train);
|
||||
} else {
|
||||
loss = train_networks(nets, ngpus, train, 4);
|
||||
}
|
||||
#else
|
||||
loss = train_network(net, train);
|
||||
#endif
|
||||
if(avg_loss == -1) avg_loss = loss;
|
||||
avg_loss = avg_loss*.9 + loss*.1;
|
||||
printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen);
|
||||
free_data(train);
|
||||
if(*net.seen/N > epoch){
|
||||
epoch = *net.seen/N;
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
if(get_current_batch(net)%100 == 0){
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s.backup",backup_directory,base);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
}
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s.weights", backup_directory, base);
|
||||
save_weights(net, buff);
|
||||
|
||||
free_network(net);
|
||||
free_ptrs((void**)paths, plist->size);
|
||||
free_list(plist);
|
||||
free(base);
|
||||
}
|
||||
|
||||
void predict_regressor(char *cfgfile, char *weightfile, char *filename)
|
||||
{
|
||||
network net = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
load_weights(&net, weightfile);
|
||||
}
|
||||
set_batch_network(&net, 1);
|
||||
srand(2222222);
|
||||
|
||||
clock_t time;
|
||||
char buff[256];
|
||||
char *input = buff;
|
||||
while(1){
|
||||
if(filename){
|
||||
strncpy(input, filename, 256);
|
||||
}else{
|
||||
printf("Enter Image Path: ");
|
||||
fflush(stdout);
|
||||
input = fgets(input, 256, stdin);
|
||||
if(!input) return;
|
||||
strtok(input, "\n");
|
||||
}
|
||||
image im = load_image_color(input, 0, 0);
|
||||
image sized = letterbox_image(im, net.w, net.h);
|
||||
|
||||
float *X = sized.data;
|
||||
time=clock();
|
||||
float *predictions = network_predict(net, X);
|
||||
printf("Predicted: %f\n", predictions[0]);
|
||||
printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
|
||||
free_image(im);
|
||||
free_image(sized);
|
||||
if (filename) break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename)
|
||||
{
|
||||
#ifdef OPENCV
|
||||
printf("Regressor Demo\n");
|
||||
network net = parse_network_cfg(cfgfile);
|
||||
if(weightfile){
|
||||
load_weights(&net, weightfile);
|
||||
}
|
||||
set_batch_network(&net, 1);
|
||||
|
||||
srand(2222222);
|
||||
CvCapture * cap;
|
||||
|
||||
if(filename){
|
||||
cap = cvCaptureFromFile(filename);
|
||||
}else{
|
||||
cap = cvCaptureFromCAM(cam_index);
|
||||
}
|
||||
|
||||
if(!cap) error("Couldn't connect to webcam.\n");
|
||||
cvNamedWindow("Regressor", CV_WINDOW_NORMAL);
|
||||
cvResizeWindow("Regressor", 512, 512);
|
||||
float fps = 0;
|
||||
int i;
|
||||
|
||||
while(1){
|
||||
struct timeval tval_before, tval_after, tval_result;
|
||||
gettimeofday(&tval_before, NULL);
|
||||
|
||||
image in = get_image_from_stream(cap);
|
||||
image in_s = letterbox_image(in, net.w, net.h);
|
||||
show_image(in, "Regressor");
|
||||
|
||||
float *predictions = network_predict(net, in_s.data);
|
||||
|
||||
printf("\033[2J");
|
||||
printf("\033[1;1H");
|
||||
printf("\nFPS:%.0f\n",fps);
|
||||
|
||||
printf("People: %f\n", predictions[0]);
|
||||
|
||||
free_image(in_s);
|
||||
free_image(in);
|
||||
|
||||
cvWaitKey(10);
|
||||
|
||||
gettimeofday(&tval_after, NULL);
|
||||
timersub(&tval_after, &tval_before, &tval_result);
|
||||
float curr = 1000000.f/((long int)tval_result.tv_usec);
|
||||
fps = .9*fps + .1*curr;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
void run_regressor(int argc, char **argv)
|
||||
{
|
||||
if(argc < 4){
|
||||
fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
|
||||
return;
|
||||
}
|
||||
|
||||
char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
|
||||
int *gpus = 0;
|
||||
int gpu = 0;
|
||||
int ngpus = 0;
|
||||
if(gpu_list){
|
||||
printf("%s\n", gpu_list);
|
||||
int len = strlen(gpu_list);
|
||||
ngpus = 1;
|
||||
int i;
|
||||
for(i = 0; i < len; ++i){
|
||||
if (gpu_list[i] == ',') ++ngpus;
|
||||
}
|
||||
gpus = calloc(ngpus, sizeof(int));
|
||||
for(i = 0; i < ngpus; ++i){
|
||||
gpus[i] = atoi(gpu_list);
|
||||
gpu_list = strchr(gpu_list, ',')+1;
|
||||
}
|
||||
} else {
|
||||
gpu = gpu_index;
|
||||
gpus = &gpu;
|
||||
ngpus = 1;
|
||||
}
|
||||
|
||||
int cam_index = find_int_arg(argc, argv, "-c", 0);
|
||||
int clear = find_arg(argc, argv, "-clear");
|
||||
char *data = argv[3];
|
||||
char *cfg = argv[4];
|
||||
char *weights = (argc > 5) ? argv[5] : 0;
|
||||
char *filename = (argc > 6) ? argv[6]: 0;
|
||||
if(0==strcmp(argv[2], "test")) predict_regressor(data, cfg, weights);
|
||||
else if(0==strcmp(argv[2], "train")) train_regressor(data, cfg, weights, gpus, ngpus, clear);
|
||||
else if(0==strcmp(argv[2], "demo")) demo_regressor(data, cfg, weights, cam_index, filename);
|
||||
}
|
||||
|
||||
|
@ -4,15 +4,17 @@
|
||||
#include <stdio.h>
|
||||
|
||||
|
||||
layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse)
|
||||
layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse, int flatten, int extra)
|
||||
{
|
||||
layer l = {0};
|
||||
l.type = REORG;
|
||||
l.batch = batch;
|
||||
l.stride = stride;
|
||||
l.extra = extra;
|
||||
l.h = h;
|
||||
l.w = w;
|
||||
l.c = c;
|
||||
l.flatten = flatten;
|
||||
if(reverse){
|
||||
l.out_w = w*stride;
|
||||
l.out_h = h*stride;
|
||||
@ -23,10 +25,20 @@ layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse)
|
||||
l.out_c = c*(stride*stride);
|
||||
}
|
||||
l.reverse = reverse;
|
||||
fprintf(stderr, "reorg /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
|
||||
|
||||
l.outputs = l.out_h * l.out_w * l.out_c;
|
||||
l.inputs = h*w*c;
|
||||
int output_size = l.out_h * l.out_w * l.out_c * batch;
|
||||
if(l.extra){
|
||||
l.out_w = l.out_h = l.out_c = 0;
|
||||
l.outputs = l.inputs + l.extra;
|
||||
}
|
||||
|
||||
if(extra){
|
||||
fprintf(stderr, "reorg %4d -> %4d\n", l.inputs, l.outputs);
|
||||
} else {
|
||||
fprintf(stderr, "reorg /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
|
||||
}
|
||||
int output_size = l.outputs * batch;
|
||||
l.output = calloc(output_size, sizeof(float));
|
||||
l.delta = calloc(output_size, sizeof(float));
|
||||
|
||||
@ -77,17 +89,41 @@ void resize_reorg_layer(layer *l, int w, int h)
|
||||
|
||||
void forward_reorg_layer(const layer l, network_state state)
|
||||
{
|
||||
if(l.reverse){
|
||||
int i;
|
||||
if(l.flatten){
|
||||
memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
|
||||
if(l.reverse){
|
||||
flatten(l.output, l.w*l.h, l.c, l.batch, 0);
|
||||
}else{
|
||||
flatten(l.output, l.w*l.h, l.c, l.batch, 1);
|
||||
}
|
||||
} else if (l.extra) {
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
copy_cpu(l.inputs, state.input + i*l.inputs, 1, l.output + i*l.outputs, 1);
|
||||
}
|
||||
} else if (l.reverse){
|
||||
reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output);
|
||||
}else {
|
||||
} else {
|
||||
reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output);
|
||||
}
|
||||
}
|
||||
|
||||
void backward_reorg_layer(const layer l, network_state state)
|
||||
{
|
||||
if(l.reverse){
|
||||
int i;
|
||||
if(l.flatten){
|
||||
memcpy(state.delta, l.delta, l.outputs*l.batch*sizeof(float));
|
||||
if(l.reverse){
|
||||
flatten(state.delta, l.w*l.h, l.c, l.batch, 1);
|
||||
}else{
|
||||
flatten(state.delta, l.w*l.h, l.c, l.batch, 0);
|
||||
}
|
||||
} else if(l.reverse){
|
||||
reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 0, state.delta);
|
||||
} else if (l.extra) {
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
copy_cpu(l.inputs, l.delta + i*l.outputs, 1, state.delta + i*l.inputs, 1);
|
||||
}
|
||||
}else{
|
||||
reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 1, state.delta);
|
||||
}
|
||||
@ -96,7 +132,18 @@ void backward_reorg_layer(const layer l, network_state state)
|
||||
#ifdef GPU
|
||||
void forward_reorg_layer_gpu(layer l, network_state state)
|
||||
{
|
||||
if(l.reverse){
|
||||
int i;
|
||||
if(l.flatten){
|
||||
if(l.reverse){
|
||||
flatten_ongpu(state.input, l.w*l.h, l.c, l.batch, 0, l.output_gpu);
|
||||
}else{
|
||||
flatten_ongpu(state.input, l.w*l.h, l.c, l.batch, 1, l.output_gpu);
|
||||
}
|
||||
} else if (l.extra) {
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
copy_ongpu(l.inputs, state.input + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1);
|
||||
}
|
||||
} else if (l.reverse) {
|
||||
reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu);
|
||||
}else {
|
||||
reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu);
|
||||
@ -105,9 +152,20 @@ void forward_reorg_layer_gpu(layer l, network_state state)
|
||||
|
||||
void backward_reorg_layer_gpu(layer l, network_state state)
|
||||
{
|
||||
if(l.reverse){
|
||||
if(l.flatten){
|
||||
if(l.reverse){
|
||||
flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, state.delta);
|
||||
}else{
|
||||
flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, state.delta);
|
||||
}
|
||||
} else if (l.extra) {
|
||||
int i;
|
||||
for(i = 0; i < l.batch; ++i){
|
||||
copy_ongpu(l.inputs, l.delta_gpu + i*l.outputs, 1, state.delta + i*l.inputs, 1);
|
||||
}
|
||||
} else if(l.reverse){
|
||||
reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, state.delta);
|
||||
}else{
|
||||
} else {
|
||||
reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, state.delta);
|
||||
}
|
||||
}
|
||||
|
@ -6,7 +6,7 @@
|
||||
#include "layer.h"
|
||||
#include "network.h"
|
||||
|
||||
layer make_reorg_layer(int batch, int h, int w, int c, int stride, int reverse);
|
||||
layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse, int flatten, int extra);
|
||||
void resize_reorg_layer(layer *l, int w, int h);
|
||||
void forward_reorg_layer(const layer l, network_state state);
|
||||
void backward_reorg_layer(const layer l, network_state state);
|
||||
|
@ -32,40 +32,24 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups)
|
||||
return l;
|
||||
}
|
||||
|
||||
void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output)
|
||||
{
|
||||
int b;
|
||||
for(b = 0; b < batch; ++b){
|
||||
int i;
|
||||
int count = 0;
|
||||
for(i = 0; i < hierarchy->groups; ++i){
|
||||
int group_size = hierarchy->group_size[i];
|
||||
softmax(input+b*inputs + count, group_size, temp, output+b*inputs + count);
|
||||
count += group_size;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void forward_softmax_layer(const softmax_layer l, network_state state)
|
||||
{
|
||||
int b;
|
||||
int inputs = l.inputs / l.groups;
|
||||
int batch = l.batch * l.groups;
|
||||
if(l.softmax_tree){
|
||||
softmax_tree(state.input, batch, inputs, l.temperature, l.softmax_tree, l.output);
|
||||
} else {
|
||||
for(b = 0; b < batch; ++b){
|
||||
softmax(state.input+b*inputs, inputs, l.temperature, l.output+b*inputs);
|
||||
int i;
|
||||
int count = 0;
|
||||
for (i = 0; i < l.softmax_tree->groups; ++i) {
|
||||
int group_size = l.softmax_tree->group_size[i];
|
||||
softmax_cpu(state.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output + count);
|
||||
count += group_size;
|
||||
}
|
||||
} else {
|
||||
softmax_cpu(state.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output);
|
||||
}
|
||||
}
|
||||
|
||||
void backward_softmax_layer(const softmax_layer l, network_state state)
|
||||
{
|
||||
int i;
|
||||
for(i = 0; i < l.inputs*l.batch; ++i){
|
||||
state.delta[i] += l.delta[i];
|
||||
}
|
||||
axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, state.delta, 1);
|
||||
}
|
||||
|
||||
#ifdef GPU
|
||||
@ -77,18 +61,16 @@ void pull_softmax_layer_output(const softmax_layer layer)
|
||||
|
||||
void forward_softmax_layer_gpu(const softmax_layer l, network_state state)
|
||||
{
|
||||
int inputs = l.inputs / l.groups;
|
||||
int batch = l.batch * l.groups;
|
||||
if(l.softmax_tree){
|
||||
int i;
|
||||
int count = 0;
|
||||
for (i = 0; i < l.softmax_tree->groups; ++i) {
|
||||
int group_size = l.softmax_tree->group_size[i];
|
||||
softmax_gpu(state.input+count, group_size, inputs, batch, l.temperature, l.output_gpu + count);
|
||||
softmax_gpu(state.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count);
|
||||
count += group_size;
|
||||
}
|
||||
} else {
|
||||
softmax_gpu(state.input, inputs, inputs, batch, l.temperature, l.output_gpu);
|
||||
softmax_gpu(state.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include "opencv2/highgui/highgui_c.h"
|
||||
#endif
|
||||
|
||||
void train_super(char *cfgfile, char *weightfile)
|
||||
void train_super(char *cfgfile, char *weightfile, int clear)
|
||||
{
|
||||
char *train_images = "/data/imagenet/imagenet1k.train.list";
|
||||
char *backup_directory = "/home/pjreddie/backup/";
|
||||
@ -19,6 +19,7 @@ void train_super(char *cfgfile, char *weightfile)
|
||||
if(weightfile){
|
||||
load_weights(&net, weightfile);
|
||||
}
|
||||
if(clear) *net.seen = 0;
|
||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
||||
int imgs = net.batch*net.subdivisions;
|
||||
int i = *net.seen/imgs;
|
||||
@ -123,7 +124,8 @@ void run_super(int argc, char **argv)
|
||||
char *cfg = argv[3];
|
||||
char *weights = (argc > 4) ? argv[4] : 0;
|
||||
char *filename = (argc > 5) ? argv[5] : 0;
|
||||
if(0==strcmp(argv[2], "train")) train_super(cfg, weights);
|
||||
int clear = find_arg(argc, argv, "-clear");
|
||||
if(0==strcmp(argv[2], "train")) train_super(cfg, weights, clear);
|
||||
else if(0==strcmp(argv[2], "test")) test_super(cfg, weights, filename);
|
||||
/*
|
||||
else if(0==strcmp(argv[2], "valid")) validate_super(cfg, weights);
|
||||
|
14
src/tree.c
14
src/tree.c
@ -24,33 +24,33 @@ void change_leaves(tree *t, char *leaf_list)
|
||||
fprintf(stderr, "Found %d leaves.\n", found);
|
||||
}
|
||||
|
||||
float get_hierarchy_probability(float *x, tree *hier, int c)
|
||||
float get_hierarchy_probability(float *x, tree *hier, int c, int stride)
|
||||
{
|
||||
float p = 1;
|
||||
while(c >= 0){
|
||||
p = p * x[c];
|
||||
p = p * x[c*stride];
|
||||
c = hier->parent[c];
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves)
|
||||
void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves, int stride)
|
||||
{
|
||||
int j;
|
||||
for(j = 0; j < n; ++j){
|
||||
int parent = hier->parent[j];
|
||||
if(parent >= 0){
|
||||
predictions[j] *= predictions[parent];
|
||||
predictions[j*stride] *= predictions[parent*stride];
|
||||
}
|
||||
}
|
||||
if(only_leaves){
|
||||
for(j = 0; j < n; ++j){
|
||||
if(!hier->leaf[j]) predictions[j] = 0;
|
||||
if(!hier->leaf[j]) predictions[j*stride] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int hierarchy_top_prediction(float *predictions, tree *hier, float thresh)
|
||||
int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride)
|
||||
{
|
||||
float p = 1;
|
||||
int group = 0;
|
||||
@ -61,7 +61,7 @@ int hierarchy_top_prediction(float *predictions, tree *hier, float thresh)
|
||||
|
||||
for(i = 0; i < hier->group_size[group]; ++i){
|
||||
int index = i + hier->group_offset[group];
|
||||
float val = predictions[i + hier->group_offset[group]];
|
||||
float val = predictions[(i + hier->group_offset[group])*stride];
|
||||
if(val > max){
|
||||
max_i = index;
|
||||
max = val;
|
||||
|
@ -15,9 +15,9 @@ typedef struct{
|
||||
} tree;
|
||||
|
||||
tree *read_tree(char *filename);
|
||||
void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves);
|
||||
void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves, int stride);
|
||||
void change_leaves(tree *t, char *leaf_list);
|
||||
int hierarchy_top_prediction(float *predictions, tree *hier, float thresh);
|
||||
float get_hierarchy_probability(float *x, tree *hier, int c);
|
||||
int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride);
|
||||
float get_hierarchy_probability(float *x, tree *hier, int c, int stride);
|
||||
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user