diff --git a/Makefile b/Makefile
index 22b40a85..48560f9c 100644
--- a/Makefile
+++ b/Makefile
@@ -57,8 +57,8 @@ CFLAGS+= -DCUDNN
 LDFLAGS+= -lcudnn
 endif
-OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o detection_layer.o route_layer.o box.o normalization_layer.o avgpool_layer.o layer.o local_layer.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o crnn_layer.o demo.o batchnorm_layer.o region_layer.o reorg_layer.o tree.o lstm_layer.o
-EXECOBJA=captcha.o lsd.o super.o art.o tag.o cifar.o go.o rnn.o segmenter.o regressor.o classifier.o coco.o yolo.o detector.o nightmare.o attention.o darknet.o
+OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o detection_layer.o route_layer.o upsample_layer.o box.o normalization_layer.o avgpool_layer.o layer.o local_layer.o shortcut_layer.o logistic_layer.o activation_layer.o rnn_layer.o gru_layer.o crnn_layer.o demo.o batchnorm_layer.o region_layer.o reorg_layer.o tree.o lstm_layer.o l2norm_layer.o
+EXECOBJA=captcha.o lsd.o super.o art.o tag.o cifar.o go.o rnn.o segmenter.o regressor.o classifier.o coco.o yolo.o detector.o nightmare.o darknet.o
 ifeq ($(GPU), 1)
 LDFLAGS+= -lstdc++
 OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o avgpool_layer_kernels.o
@@ -97,5 +97,5 @@ results:
 .PHONY: clean
 clean:
- rm -rf $(OBJS) $(SLIB) $(ALIB) $(EXEC) $(EXECOBJ)
+ rm -rf $(OBJS) $(SLIB) $(ALIB) $(EXEC) $(EXECOBJ) $(OBJDIR)/*
diff --git a/cfg/darknet.cfg b/cfg/darknet.cfg
index ccb4629b..ec902830 100644
--- a/cfg/darknet.cfg
+++ b/cfg/darknet.cfg
@@ -1,12 +1,12 @@
 [net]
 # Train
-batch=1
-subdivisions=1
+ batch=128
+ subdivisions=1
 # Test
-# batch=1
-# subdivisions=1
-height=224
-width=224
+#batch=1
+#subdivisions=1
+height=256
+width=256
 channels=3
 momentum=0.9
 decay=0.0005
@@ -88,7 +88,6 @@ activation=leaky
 [maxpool]
 size=2
 stride=2
-padding=1
 [convolutional]
 batch_normalize=1
@@ -110,6 +109,3 @@ activation=leaky
 [softmax]
 groups=1
-[cost]
-type=sse
-
diff --git a/cfg/darknet19.cfg b/cfg/darknet19.cfg
index bf73fb7b..f56a46e2 100644
--- a/cfg/darknet19.cfg
+++ b/cfg/darknet19.cfg
@@ -1,17 +1,31 @@
 [net]
-batch=128
-subdivisions=1
-height=224
-width=224
+# Training
+#batch=128
+#subdivisions=2
+
+# Testing
+ batch=1
+ subdivisions=1
+
+height=256
+width=256
+min_crop=128
+max_crop=448
 channels=3
 momentum=0.9
 decay=0.0005
-max_crop=448
+burn_in=1000
 learning_rate=0.1
 policy=poly
 power=4
-max_batches=1600000
+max_batches=800000
+
+angle=7
+hue=.1
+saturation=.75
+exposure=.75
+aspect=.75
 [convolutional]
 batch_normalize=1
diff --git a/cfg/imagenet22k.dataset b/cfg/imagenet22k.dataset
index 920785d6..e25ef007 100644
--- a/cfg/imagenet22k.dataset
+++ b/cfg/imagenet22k.dataset
@@ -1,6 +1,7 @@
 classes=21842
 train = /data/imagenet/imagenet22k.train.list
 valid = /data/imagenet/imagenet22k.valid.list
+#valid = /data/imagenet/imagenet1k.valid.list
 backup = /home/pjreddie/backup/
 labels = data/imagenet.labels.list
 names = data/imagenet.shortnames.list
diff --git a/cfg/tiny-yolo.cfg b/cfg/tiny-yolo.cfg
index 9a4a184f..37e71356 100644
--- a/cfg/tiny-yolo.cfg
+++ b/cfg/tiny-yolo.cfg
@@ -120,7 +120,7 @@ filters=425
 activation=linear
 [region]
-anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
+anchors = 18.3274,21.6763, 59.9827,66.001, 106.83,175.179, 252.25,112.889, 312.657,293.385
 bias_match=1
 classes=80
 coords=4
diff --git a/cfg/yolo.cfg b/cfg/yolo.cfg
index 088edf81..b8a9f683 100644
--- a/cfg/yolo.cfg
+++ b/cfg/yolo.cfg
@@ -1,10 +1,10 @@
 [net]
 # Testing
-batch=1
-subdivisions=1
+# batch=1
+# subdivisions=1
 # Training
-# batch=64
-# subdivisions=8
+batch=64
+subdivisions=8
 width=608
 height=608
 channels=3
@@ -239,7 +239,7 @@ activation=linear
 [region]
-anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
+anchors = 18.3274,21.6763, 59.9827,66.001, 106.83,175.179, 252.25,112.889, 312.657,293.385
 bias_match=1
 classes=80
 coords=4
diff --git a/examples/classifier.c b/examples/classifier.c
index 8843e548..e707ebc3 100644
--- a/examples/classifier.c
+++ b/examples/classifier.c
@@ -44,11 +44,17 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
 list *options = read_data_cfg(datacfg);
 char *backup_directory = option_find_str(options, "backup", "/backup/");
+ int tag = option_find_int_quiet(options, "tag", 0);
 char *label_list = option_find_str(options, "labels", "data/labels.list");
 char *train_list = option_find_str(options, "train", "data/train.list");
+ char *tree = option_find_str(options, "tree", 0);
+ if (tree) net->hierarchy = read_tree(tree);
 int classes = option_find_int(options, "classes", 2);
- char **labels = get_labels(label_list);
+ char **labels;
+ if(!tag){
+ labels = get_labels(label_list);
+ }
 list *plist = get_paths(train_list);
 char **paths = (char **)list_to_array(plist);
 printf("%d\n", plist->size);
@@ -76,7 +82,11 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus,
 args.n = imgs;
 args.m = N;
 args.labels = labels;
- args.type = CLASSIFICATION_DATA;
+ if (tag){
+ args.type = TAG_DATA;
+ } else {
+ args.type = CLASSIFICATION_DATA;
+ }
 data train;
 data buffer;
@@ -385,15 +395,13 @@ void validate_classifier_single(char *datacfg, char *filename, char *weightfile)
 }
 }
 image im = load_image_color(paths[i], 0, 0);
- image resized = resize_min(im, net->w);
- image crop = crop_image(resized, (resized.w - net->w)/2, (resized.h - net->h)/2, net->w, net->h);
+ image crop = center_crop_image(im, net->w, net->h);
 //show_image(im, "orig");
 //show_image(crop, "cropped");
 //cvWaitKey(0);
 float *pred = network_predict(net, crop.data);
 if(net->hierarchy) hierarchy_predictions(pred, net->outputs, net->hierarchy, 1, 1);
- if(resized.data != im.data) free_image(resized);
 free_image(im);
 free_image(crop);
 top_k(pred, classes, topk, indexes);
@@ -403,6 +411,7 @@ void validate_classifier_single(char *datacfg, char *filename, char *weightfile)
 if(indexes[j] == class) avg_topk += 1;
 }
+ printf("%s, %d, %f, %f, \n", paths[i], class, pred[0], pred[1]);
 printf("%d: top 1: %f, top %d: %f\n", i, avg_acc/(i+1), topk, avg_topk/(i+1));
 }
 }
@@ -577,6 +586,8 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
 }
 image im = load_image_color(input, 0, 0);
 image r = letterbox_image(im, net->w, net->h);
+ //image r = resize_min(im, 320);
+ //printf("%d %d\n", r.w, r.h);
 //resize_network(net, r.w, r.h);
 //printf("%d %d\n", r.w, r.h);
@@ -704,6 +715,44 @@ void test_classifier(char *datacfg, char *cfgfile, char *weightfile, int
target_ } } +void file_output_classifier(char *datacfg, char *filename, char *weightfile, char *listfile) +{ + int i,j; + network *net = load_network(filename, weightfile, 0); + set_batch_network(net, 1); + srand(time(0)); + + list *options = read_data_cfg(datacfg); + + //char *label_list = option_find_str(options, "names", "data/labels.list"); + int classes = option_find_int(options, "classes", 2); + + list *plist = get_paths(listfile); + + char **paths = (char **)list_to_array(plist); + int m = plist->size; + free_list(plist); + + for(i = 0; i < m; ++i){ + image im = load_image_color(paths[i], 0, 0); + image resized = resize_min(im, net->w); + image crop = crop_image(resized, (resized.w - net->w)/2, (resized.h - net->h)/2, net->w, net->h); + + float *pred = network_predict(net, crop.data); + if(net->hierarchy) hierarchy_predictions(pred, net->outputs, net->hierarchy, 0, 1); + + if(resized.data != im.data) free_image(resized); + free_image(im); + free_image(crop); + + printf("%s", paths[i]); + for(j = 0; j < classes; ++j){ + printf("\t%g", pred[j]); + } + printf("\n"); + } +} + void threat_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename) { @@ -914,6 +963,8 @@ void gun_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_inde void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename) { #ifdef OPENCV + char *base = basecfg(cfgfile); + image **alphabet = load_alphabet(); printf("Classifier Demo\n"); network *net = load_network(cfgfile, weightfile, 0); set_batch_network(net, 1); @@ -922,22 +973,33 @@ void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_ind srand(2222222); CvCapture * cap; + int w = 1280; + int h = 720; + if(filename){ cap = cvCaptureFromFile(filename); }else{ cap = cvCaptureFromCAM(cam_index); } + if(w){ + cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_WIDTH, w); + } + if(h){ + cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_HEIGHT, h); + } + int top = option_find_int(options, "top", 1); - char *name_list = option_find_str(options, "names", 0); + char *label_list = option_find_str(options, "labels", 0); + char *name_list = option_find_str(options, "names", label_list); char **names = get_labels(name_list); int *indexes = calloc(top, sizeof(int)); if(!cap) error("Couldn't connect to webcam.\n"); - cvNamedWindow("Classifier", CV_WINDOW_NORMAL); - cvResizeWindow("Classifier", 512, 512); + cvNamedWindow(base, CV_WINDOW_NORMAL); + cvResizeWindow(base, 512, 512); float fps = 0; int i; @@ -946,8 +1008,8 @@ void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_ind gettimeofday(&tval_before, NULL); image in = get_image_from_stream(cap); - image in_s = resize_image(in, net->w, net->h); - show_image(in, "Classifier"); + //image in_s = resize_image(in, net->w, net->h); + image in_s = letterbox_image(in, net->w, net->h); float *predictions = network_predict(net, in_s.data); if(net->hierarchy) hierarchy_predictions(predictions, net->outputs, net->hierarchy, 1, 1); @@ -957,11 +1019,24 @@ void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_ind printf("\033[1;1H"); printf("\nFPS:%.0f\n",fps); + int lh = in.h*.03; + int toph = 3*lh; + + float rgb[3] = {1,1,1}; for(i = 0; i < top; ++i){ + printf("%d\n", toph); int index = indexes[i]; printf("%.1f%%: %s\n", predictions[index]*100, names[index]); + + char buff[1024]; + sprintf(buff, "%3.1f%%: %s\n", predictions[index]*100, names[index]); + image label = get_label(alphabet, 
buff, lh); + draw_label(in, toph, lh, label, rgb); + toph += 2*lh; + free_image(label); } + show_image(in, base); free_image(in_s); free_image(in); @@ -998,6 +1073,7 @@ void run_classifier(int argc, char **argv) char *layer_s = (argc > 7) ? argv[7]: 0; int layer = layer_s ? atoi(layer_s) : -1; if(0==strcmp(argv[2], "predict")) predict_classifier(data, cfg, weights, filename, top); + else if(0==strcmp(argv[2], "fout")) file_output_classifier(data, cfg, weights, filename); else if(0==strcmp(argv[2], "try")) try_classifier(data, cfg, weights, filename, atoi(layer_s)); else if(0==strcmp(argv[2], "train")) train_classifier(data, cfg, weights, gpus, ngpus, clear); else if(0==strcmp(argv[2], "demo")) demo_classifier(data, cfg, weights, cam_index, filename); diff --git a/examples/coco.c b/examples/coco.c index 77e04bbf..6d9638c4 100644 --- a/examples/coco.c +++ b/examples/coco.c @@ -94,14 +94,14 @@ void train_coco(char *cfgfile, char *weightfile) save_weights(net, buff); } -void print_cocos(FILE *fp, int image_id, box *boxes, float **probs, int num_boxes, int classes, int w, int h) +static void print_cocos(FILE *fp, int image_id, detection *dets, int num_boxes, int classes, int w, int h) { int i, j; for(i = 0; i < num_boxes; ++i){ - float xmin = boxes[i].x - boxes[i].w/2.; - float xmax = boxes[i].x + boxes[i].w/2.; - float ymin = boxes[i].y - boxes[i].h/2.; - float ymax = boxes[i].y + boxes[i].h/2.; + float xmin = dets[i].bbox.x - dets[i].bbox.w/2.; + float xmax = dets[i].bbox.x + dets[i].bbox.w/2.; + float ymin = dets[i].bbox.y - dets[i].bbox.h/2.; + float ymax = dets[i].bbox.y + dets[i].bbox.h/2.; if (xmin < 0) xmin = 0; if (ymin < 0) ymin = 0; @@ -114,7 +114,7 @@ void print_cocos(FILE *fp, int image_id, box *boxes, float **probs, int num_boxe float bh = ymax - ymin; for(j = 0; j < classes; ++j){ - if (probs[i][j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, probs[i][j]); + if (dets[i].prob[j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, dets[i].prob[j]); } } } @@ -140,17 +140,13 @@ void validate_coco(char *cfg, char *weights) layer l = net->layers[net->n-1]; int classes = l.classes; - int side = l.side; - int j; char buff[1024]; snprintf(buff, 1024, "%s/coco_results.json", base); FILE *fp = fopen(buff, "w"); fprintf(fp, "[\n"); - box *boxes = calloc(side*side*l.n, sizeof(box)); - float **probs = calloc(side*side*l.n, sizeof(float *)); - for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *)); + detection *dets = make_network_boxes(net); int m = plist->size; int i=0; @@ -199,9 +195,9 @@ void validate_coco(char *cfg, char *weights) network_predict(net, X); int w = val[t].w; int h = val[t].h; - get_detection_boxes(l, w, h, thresh, probs, boxes, 0); - if (nms) do_nms_sort(boxes, probs, side*side*l.n, classes, iou_thresh); - print_cocos(fp, image_id, boxes, probs, side*side*l.n, classes, w, h); + fill_network_boxes(net, w, h, thresh, 0, 0, 0, dets); + if (nms) do_nms_sort(dets, l.side*l.side*l.n, classes, iou_thresh); + print_cocos(fp, image_id, dets, l.side*l.side*l.n, classes, w, h); free_image(val[t]); free_image(val_resized[t]); } @@ -235,9 +231,7 @@ void validate_coco_recall(char *cfgfile, char *weightfile) snprintf(buff, 1024, "%s%s.txt", base, coco_classes[j]); fps[j] = fopen(buff, "w"); } - box *boxes = calloc(side*side*l.n, sizeof(box)); - float **probs = calloc(side*side*l.n, 
sizeof(float *)); - for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *)); + detection *dets = make_network_boxes(net); int m = plist->size; int i=0; @@ -245,7 +239,6 @@ void validate_coco_recall(char *cfgfile, char *weightfile) float thresh = .001; int nms = 0; float iou_thresh = .5; - float nms_thresh = .5; int total = 0; int correct = 0; @@ -258,8 +251,9 @@ void validate_coco_recall(char *cfgfile, char *weightfile) image sized = resize_image(orig, net->w, net->h); char *id = basecfg(path); network_predict(net, sized.data); - get_detection_boxes(l, 1, 1, thresh, probs, boxes, 1); - if (nms) do_nms(boxes, probs, side*side*l.n, 1, nms_thresh); + + fill_network_boxes(net, orig.w, orig.h, thresh, 0, 0, 1, dets); + if (nms) do_nms_obj(dets, side*side*l.n, 1, nms); char labelpath[4096]; find_replace(path, "images", "labels", labelpath); @@ -270,7 +264,7 @@ void validate_coco_recall(char *cfgfile, char *weightfile) int num_labels = 0; box_label *truth = read_boxes(labelpath, &num_labels); for(k = 0; k < side*side*l.n; ++k){ - if(probs[k][0] > thresh){ + if(dets[k].objectness > thresh){ ++proposals; } } @@ -279,8 +273,8 @@ void validate_coco_recall(char *cfgfile, char *weightfile) box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h}; float best_iou = 0; for(k = 0; k < side*side*l.n; ++k){ - float iou = box_iou(boxes[k], t); - if(probs[k][0] > thresh && iou > best_iou){ + float iou = box_iou(dets[k].bbox, t); + if(dets[k].objectness > thresh && iou > best_iou){ best_iou = iou; } } @@ -308,10 +302,7 @@ void test_coco(char *cfgfile, char *weightfile, char *filename, float thresh) clock_t time; char buff[256]; char *input = buff; - int j; - box *boxes = calloc(l.side*l.side*l.n, sizeof(box)); - float **probs = calloc(l.side*l.side*l.n, sizeof(float *)); - for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); + detection *dets = make_network_boxes(net); while(1){ if(filename){ strncpy(input, filename, 256); @@ -328,9 +319,11 @@ void test_coco(char *cfgfile, char *weightfile, char *filename, float thresh) time=clock(); network_predict(net, X); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); - get_detection_boxes(l, 1, 1, thresh, probs, boxes, 0); - if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms); - draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, 0, coco_classes, alphabet, 80); + + fill_network_boxes(net, 1, 1, thresh, 0, 0, 0, dets); + if (nms) do_nms_sort(dets, l.side*l.side*l.n, l.classes, nms); + + draw_detections(im, dets, l.side*l.side*l.n, thresh, coco_classes, alphabet, 80); save_image(im, "prediction"); show_image(im, "predictions"); free_image(im); diff --git a/examples/darknet.c b/examples/darknet.c index b89f69ab..9757eb18 100644 --- a/examples/darknet.c +++ b/examples/darknet.c @@ -12,7 +12,6 @@ extern void run_coco(int argc, char **argv); extern void run_captcha(int argc, char **argv); extern void run_nightmare(int argc, char **argv); extern void run_classifier(int argc, char **argv); -extern void run_attention(int argc, char **argv); extern void run_regressor(int argc, char **argv); extern void run_segmenter(int argc, char **argv); extern void run_char_rnn(int argc, char **argv); @@ -189,6 +188,25 @@ void partial(char *cfgfile, char *weightfile, char *outfile, int max) save_weights_upto(net, outfile, max); } +void print_weights(char *cfgfile, char *weightfile, int n) +{ + gpu_index = -1; + network *net = load_network(cfgfile, weightfile, 1); + layer l = net->layers[n]; + 
int i, j; + //printf("["); + for(i = 0; i < l.n; ++i){ + //printf("["); + for(j = 0; j < l.size*l.size*l.c; ++j){ + //if(j > 0) printf(","); + printf("%g ", l.weights[i*l.size*l.size*l.c + j]); + } + printf("\n"); + //printf("]%s\n", (i == l.n-1)?"":","); + } + //printf("]"); +} + void rescale_net(char *cfgfile, char *weightfile, char *outfile) { gpu_index = -1; @@ -432,8 +450,6 @@ int main(int argc, char **argv) predict_classifier("cfg/imagenet1k.data", argv[2], argv[3], argv[4], 5); } else if (0 == strcmp(argv[1], "classifier")){ run_classifier(argc, argv); - } else if (0 == strcmp(argv[1], "attention")){ - run_attention(argc, argv); } else if (0 == strcmp(argv[1], "regressor")){ run_regressor(argc, argv); } else if (0 == strcmp(argv[1], "segmenter")){ @@ -470,6 +486,8 @@ int main(int argc, char **argv) oneoff(argv[2], argv[3], argv[4]); } else if (0 == strcmp(argv[1], "oneoff2")){ oneoff2(argv[2], argv[3], argv[4], atoi(argv[5])); + } else if (0 == strcmp(argv[1], "print")){ + print_weights(argv[2], argv[3], atoi(argv[4])); } else if (0 == strcmp(argv[1], "partial")){ partial(argv[2], argv[3], argv[4], atoi(argv[5])); } else if (0 == strcmp(argv[1], "average")){ diff --git a/examples/detector.c b/examples/detector.c index 15575331..5bf8ac40 100644 --- a/examples/detector.c +++ b/examples/detector.c @@ -2,6 +2,7 @@ static int coco_ids[] = {1,2,3,4,5,6,7,8,9,10,11,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,31,32,33,34,35,36,37,38,39,40,41,42,43,44,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,67,70,72,73,74,75,76,77,78,79,80,81,82,84,85,86,87,88,89,90}; + void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear) { list *options = read_data_cfg(datacfg); @@ -73,6 +74,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i free_data(train); load_thread = load_data(args); + #pragma omp parallel for for(i = 0; i < ngpus; ++i){ resize_network(nets[i], dim, dim); } @@ -84,28 +86,28 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i load_thread = load_data(args); /* - int k; - for(k = 0; k < l.max_boxes; ++k){ - box b = float_to_box(train.y.vals[10] + 1 + k*5); - if(!b.x) break; - printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h); - } - */ + int k; + for(k = 0; k < l.max_boxes; ++k){ + box b = float_to_box(train.y.vals[10] + 1 + k*5); + if(!b.x) break; + printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h); + } + */ /* - int zz; - for(zz = 0; zz < train.X.cols; ++zz){ - image im = float_to_image(net->w, net->h, 3, train.X.vals[zz]); - int k; - for(k = 0; k < l.max_boxes; ++k){ - box b = float_to_box(train.y.vals[zz] + k*5, 1); - printf("%f %f %f %f\n", b.x, b.y, b.w, b.h); - draw_bbox(im, b, 1, 1,0,0); - } - show_image(im, "truth11"); - cvWaitKey(0); - save_image(im, "truth11"); - } - */ + int zz; + for(zz = 0; zz < train.X.cols; ++zz){ + image im = float_to_image(net->w, net->h, 3, train.X.vals[zz]); + int k; + for(k = 0; k < l.max_boxes; ++k){ + box b = float_to_box(train.y.vals[zz] + k*5, 1); + printf("%f %f %f %f\n", b.x, b.y, b.w, b.h); + draw_bbox(im, b, 1, 1,0,0); + } + show_image(im, "truth11"); + cvWaitKey(0); + save_image(im, "truth11"); + } + */ printf("Loaded: %lf seconds\n", what_time_is_it_now()-time); @@ -158,15 +160,15 @@ static int get_coco_image_id(char *filename) return atoi(p+1); } -static void print_cocos(FILE *fp, char *image_path, box *boxes, float **probs, int num_boxes, int classes, int w, int h) +static void print_cocos(FILE *fp, char *image_path, 
detection *dets, int num_boxes, int classes, int w, int h) { int i, j; int image_id = get_coco_image_id(image_path); for(i = 0; i < num_boxes; ++i){ - float xmin = boxes[i].x - boxes[i].w/2.; - float xmax = boxes[i].x + boxes[i].w/2.; - float ymin = boxes[i].y - boxes[i].h/2.; - float ymax = boxes[i].y + boxes[i].h/2.; + float xmin = dets[i].bbox.x - dets[i].bbox.w/2.; + float xmax = dets[i].bbox.x + dets[i].bbox.w/2.; + float ymin = dets[i].bbox.y - dets[i].bbox.h/2.; + float ymax = dets[i].bbox.y + dets[i].bbox.h/2.; if (xmin < 0) xmin = 0; if (ymin < 0) ymin = 0; @@ -179,19 +181,19 @@ static void print_cocos(FILE *fp, char *image_path, box *boxes, float **probs, i float bh = ymax - ymin; for(j = 0; j < classes; ++j){ - if (probs[i][j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, probs[i][j]); + if (dets[i].prob[j]) fprintf(fp, "{\"image_id\":%d, \"category_id\":%d, \"bbox\":[%f, %f, %f, %f], \"score\":%f},\n", image_id, coco_ids[j], bx, by, bw, bh, dets[i].prob[j]); } } } -void print_detector_detections(FILE **fps, char *id, box *boxes, float **probs, int total, int classes, int w, int h) +void print_detector_detections(FILE **fps, char *id, detection *dets, int total, int classes, int w, int h) { int i, j; for(i = 0; i < total; ++i){ - float xmin = boxes[i].x - boxes[i].w/2. + 1; - float xmax = boxes[i].x + boxes[i].w/2. + 1; - float ymin = boxes[i].y - boxes[i].h/2. + 1; - float ymax = boxes[i].y + boxes[i].h/2. + 1; + float xmin = dets[i].bbox.x - dets[i].bbox.w/2. + 1; + float xmax = dets[i].bbox.x + dets[i].bbox.w/2. + 1; + float ymin = dets[i].bbox.y - dets[i].bbox.h/2. + 1; + float ymax = dets[i].bbox.y + dets[i].bbox.h/2. + 1; if (xmin < 1) xmin = 1; if (ymin < 1) ymin = 1; @@ -199,20 +201,20 @@ void print_detector_detections(FILE **fps, char *id, box *boxes, float **probs, if (ymax > h) ymax = h; for(j = 0; j < classes; ++j){ - if (probs[i][j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, probs[i][j], + if (dets[i].prob[j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, dets[i].prob[j], xmin, ymin, xmax, ymax); } } } -void print_imagenet_detections(FILE *fp, int id, box *boxes, float **probs, int total, int classes, int w, int h) +void print_imagenet_detections(FILE *fp, int id, detection *dets, int total, int classes, int w, int h) { int i, j; for(i = 0; i < total; ++i){ - float xmin = boxes[i].x - boxes[i].w/2.; - float xmax = boxes[i].x + boxes[i].w/2.; - float ymin = boxes[i].y - boxes[i].h/2.; - float ymax = boxes[i].y + boxes[i].h/2.; + float xmin = dets[i].bbox.x - dets[i].bbox.w/2.; + float xmax = dets[i].bbox.x + dets[i].bbox.w/2.; + float ymin = dets[i].bbox.y - dets[i].bbox.h/2.; + float ymax = dets[i].bbox.y + dets[i].bbox.h/2.; if (xmin < 0) xmin = 0; if (ymin < 0) ymin = 0; @@ -221,7 +223,7 @@ void print_imagenet_detections(FILE *fp, int id, box *boxes, float **probs, int for(j = 0; j < classes; ++j){ int class = j; - if (probs[i][class]) fprintf(fp, "%d %d %f %f %f %f %f\n", id, j+1, probs[i][class], + if (dets[i].prob[class]) fprintf(fp, "%d %d %f %f %f %f %f\n", id, j+1, dets[i].prob[class], xmin, ymin, xmax, ymax); } } @@ -277,10 +279,7 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char } } - - box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); - float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); - for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(classes+1, sizeof(float *)); + detection *dets = make_network_boxes(net); int m = plist->size; 
int i=0; @@ -334,14 +333,14 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char network_predict(net, input.data); int w = val[t].w; int h = val[t].h; - get_region_boxes(l, w, h, net->w, net->h, thresh, probs, boxes, 0, 0, map, .5, 0); - if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, classes, nms); + fill_network_boxes(net, w, h, thresh, .5, map, 0, dets); + if (nms) do_nms_sort(dets, l.w*l.h*l.n, classes, nms); if (coco){ - print_cocos(fp, path, boxes, probs, l.w*l.h*l.n, classes, w, h); + print_cocos(fp, path, dets, l.w*l.h*l.n, classes, w, h); } else if (imagenet){ - print_imagenet_detections(fp, i+t-nthreads+1, boxes, probs, l.w*l.h*l.n, classes, w, h); + print_imagenet_detections(fp, i+t-nthreads+1, dets, l.w*l.h*l.n, classes, w, h); } else { - print_detector_detections(fps, id, boxes, probs, l.w*l.h*l.n, classes, w, h); + print_detector_detections(fps, id, dets, l.w*l.h*l.n, classes, w, h); } free(id); free_image(val[t]); @@ -410,10 +409,8 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out } } - - box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); - float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); - for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(classes+1, sizeof(float *)); + detection *dets = make_network_boxes(net); + int nboxes = num_boxes(net); int m = plist->size; int i=0; @@ -462,14 +459,14 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out network_predict(net, X); int w = val[t].w; int h = val[t].h; - get_region_boxes(l, w, h, net->w, net->h, thresh, probs, boxes, 0, 0, map, .5, 0); - if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, classes, nms); + fill_network_boxes(net, w, h, thresh, .5, map, 0, dets); + if (nms) do_nms_sort(dets, nboxes, classes, nms); if (coco){ - print_cocos(fp, path, boxes, probs, l.w*l.h*l.n, classes, w, h); + print_cocos(fp, path, dets, nboxes, classes, w, h); } else if (imagenet){ - print_imagenet_detections(fp, i+t-nthreads+1, boxes, probs, l.w*l.h*l.n, classes, w, h); + print_imagenet_detections(fp, i+t-nthreads+1, dets, nboxes, classes, w, h); } else { - print_detector_detections(fps, id, boxes, probs, l.w*l.h*l.n, classes, w, h); + print_detector_detections(fps, id, dets, nboxes, classes, w, h); } free(id); free_image(val[t]); @@ -498,12 +495,9 @@ void validate_detector_recall(char *cfgfile, char *weightfile) char **paths = (char **)list_to_array(plist); layer l = net->layers[net->n-1]; - int classes = l.classes; int j, k; - box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); - float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); - for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(classes+1, sizeof(float *)); + detection *dets = make_network_boxes(net); int m = plist->size; int i=0; @@ -516,6 +510,7 @@ void validate_detector_recall(char *cfgfile, char *weightfile) int correct = 0; int proposals = 0; float avg_iou = 0; + int nboxes = num_boxes(net); for(i = 0; i < m; ++i){ char *path = paths[i]; @@ -523,8 +518,8 @@ void validate_detector_recall(char *cfgfile, char *weightfile) image sized = resize_image(orig, net->w, net->h); char *id = basecfg(path); network_predict(net, sized.data); - get_region_boxes(l, sized.w, sized.h, net->w, net->h, thresh, probs, boxes, 0, 1, 0, .5, 1); - if (nms) do_nms(boxes, probs, l.w*l.h*l.n, 1, nms); + fill_network_boxes(net, sized.w, sized.h, thresh, .5, 0, 1, dets); + if (nms) do_nms_obj(dets, nboxes, 1, nms); char labelpath[4096]; find_replace(path, "images", "labels", labelpath); @@ -534,8 +529,8 @@ void 
validate_detector_recall(char *cfgfile, char *weightfile) int num_labels = 0; box_label *truth = read_boxes(labelpath, &num_labels); - for(k = 0; k < l.w*l.h*l.n; ++k){ - if(probs[k][0] > thresh){ + for(k = 0; k < nboxes; ++k){ + if(dets[k].objectness > thresh){ ++proposals; } } @@ -544,8 +539,8 @@ void validate_detector_recall(char *cfgfile, char *weightfile) box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h}; float best_iou = 0; for(k = 0; k < l.w*l.h*l.n; ++k){ - float iou = box_iou(boxes[k], t); - if(probs[k][0] > thresh && iou > best_iou){ + float iou = box_iou(dets[k].bbox, t); + if(dets[k].objectness > thresh && iou > best_iou){ best_iou = iou; } } @@ -562,6 +557,7 @@ void validate_detector_recall(char *cfgfile, char *weightfile) } } + void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh, char *outfile, int fullscreen) { list *options = read_data_cfg(datacfg); @@ -575,7 +571,6 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam double time; char buff[256]; char *input = buff; - int j; float nms=.3; while(1){ if(filename){ @@ -595,23 +590,18 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam //resize_network(net, sized.w, sized.h); layer l = net->layers[net->n-1]; - box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); - float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); - for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes + 1, sizeof(float *)); - float **masks = 0; - if (l.coords > 4){ - masks = calloc(l.w*l.h*l.n, sizeof(float*)); - for(j = 0; j < l.w*l.h*l.n; ++j) masks[j] = calloc(l.coords-4, sizeof(float *)); - } + int nboxes = num_boxes(net); + printf("%d\n", nboxes); float *X = sized.data; time=what_time_is_it_now(); network_predict(net, X); printf("%s: Predicted in %f seconds.\n", input, what_time_is_it_now()-time); - get_region_boxes(l, im.w, im.h, net->w, net->h, thresh, probs, boxes, masks, 0, 0, hier_thresh, 1); + detection *dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 1); //if (nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms); - if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms); - draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, masks, names, alphabet, l.classes); + if (nms) do_nms_sort(dets, nboxes, l.classes, nms); + draw_detections(im, dets, nboxes, thresh, names, alphabet, l.classes); + free_detections(dets, num_boxes(net)); if(outfile){ save_image(im, outfile); } @@ -630,12 +620,190 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam free_image(im); free_image(sized); - free(boxes); - free_ptrs((void **)probs, l.w*l.h*l.n); if (filename) break; } } +void censor_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename, int class, float thresh, int skip) +{ + image **alphabet = load_alphabet(); + char *base = basecfg(cfgfile); + network *net = load_network(cfgfile, weightfile, 0); + set_batch_network(net, 1); + list *options = read_data_cfg(datacfg); + + srand(2222222); + CvCapture * cap; + + int w = 1280; + int h = 720; + + if(filename){ + cap = cvCaptureFromFile(filename); + }else{ + cap = cvCaptureFromCAM(cam_index); + } + + if(w){ + cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_WIDTH, w); + } + if(h){ + cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_HEIGHT, h); + } + + int top = option_find_int(options, "top", 1); + + char *label_list = option_find_str(options, "labels", 0); + char *name_list = 
option_find_str(options, "names", label_list); + char **names = get_labels(name_list); + + int *indexes = calloc(top, sizeof(int)); + + if(!cap) error("Couldn't connect to webcam.\n"); + cvNamedWindow(base, CV_WINDOW_NORMAL); + cvResizeWindow(base, 512, 512); + float fps = 0; + int i; + int count = 0; + float nms = .45; + + while(1){ + image in = get_image_from_stream(cap); + //image in_s = resize_image(in, net->w, net->h); + image in_s = letterbox_image(in, net->w, net->h); + layer l = net->layers[net->n-1]; + + int nboxes = num_boxes(net); + + float *X = in_s.data; + network_predict(net, X); + detection *dets = get_network_boxes(net, in.w, in.h, thresh, 0, 0, 0); + //if (nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms); + if (nms) do_nms_sort(dets, nboxes, l.classes, nms); + + for(i = 0; i < nboxes; ++i){ + if(dets[i].prob[class] > thresh){ + box b = dets[i].bbox; + int left = b.x-b.w/2.; + int top = b.y-b.h/2.; + censor_image(in, left, top, b.w, b.h); + } + } + show_image(in, base); + cvWaitKey(10); + free_detections(dets, num_boxes(net)); + + + free_image(in_s); + free_image(in); + + + float curr = 0; + fps = .9*fps + .1*curr; + for(i = 0; i < skip; ++i){ + image in = get_image_from_stream(cap); + free_image(in); + } + } +} + +void extract_detector(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename, int class, float thresh, int skip) +{ + image **alphabet = load_alphabet(); + char *base = basecfg(cfgfile); + network *net = load_network(cfgfile, weightfile, 0); + set_batch_network(net, 1); + list *options = read_data_cfg(datacfg); + + srand(2222222); + CvCapture * cap; + + int w = 1280; + int h = 720; + + if(filename){ + cap = cvCaptureFromFile(filename); + }else{ + cap = cvCaptureFromCAM(cam_index); + } + + if(w){ + cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_WIDTH, w); + } + if(h){ + cvSetCaptureProperty(cap, CV_CAP_PROP_FRAME_HEIGHT, h); + } + + int top = option_find_int(options, "top", 1); + + char *label_list = option_find_str(options, "labels", 0); + char *name_list = option_find_str(options, "names", label_list); + char **names = get_labels(name_list); + + int *indexes = calloc(top, sizeof(int)); + + if(!cap) error("Couldn't connect to webcam.\n"); + cvNamedWindow(base, CV_WINDOW_NORMAL); + cvResizeWindow(base, 512, 512); + float fps = 0; + int i; + int count = 0; + float nms = .45; + + while(1){ + image in = get_image_from_stream(cap); + //image in_s = resize_image(in, net->w, net->h); + image in_s = letterbox_image(in, net->w, net->h); + layer l = net->layers[net->n-1]; + + int nboxes = num_boxes(net); + show_image(in, base); + + float *X = in_s.data; + network_predict(net, X); + detection *dets = get_network_boxes(net, in.w, in.h, thresh, 0, 0, 1); + //if (nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms); + if (nms) do_nms_sort(dets, nboxes, l.classes, nms); + + for(i = 0; i < nboxes; ++i){ + if(dets[i].prob[class] > thresh){ + box b = dets[i].bbox; + int size = b.w*in.w > b.h*in.h ? 
b.w*in.w : b.h*in.h; + int dx = b.x*in.w-size/2.; + int dy = b.y*in.h-size/2.; + image bim = crop_image(in, dx, dy, size, size); + char buff[2048]; + sprintf(buff, "results/extract/%07d", count); + ++count; + save_image(bim, buff); + free_image(bim); + } + } + free_detections(dets, num_boxes(net)); + + + free_image(in_s); + free_image(in); + + + float curr = 0; + fps = .9*fps + .1*curr; + for(i = 0; i < skip; ++i){ + image in = get_image_from_stream(cap); + free_image(in); + } + } +} + +void network_detect(network *net, image im, float thresh, float hier_thresh, float nms, detection *dets) +{ + network_predict_image(net, im); + layer l = net->layers[net->n-1]; + int nboxes = num_boxes(net); + fill_network_boxes(net, im.w, im.h, thresh, hier_thresh, 0, 0, dets); + if (nms) do_nms_sort(dets, nboxes, l.classes, nms); +} + void run_detector(int argc, char **argv) { char *prefix = find_char_arg(argc, argv, "-prefix", 0); @@ -677,12 +845,15 @@ void run_detector(int argc, char **argv) int width = find_int_arg(argc, argv, "-w", 0); int height = find_int_arg(argc, argv, "-h", 0); int fps = find_int_arg(argc, argv, "-fps", 0); + int class = find_int_arg(argc, argv, "-class", 0); char *datacfg = argv[3]; char *cfg = argv[4]; char *weights = (argc > 5) ? argv[5] : 0; char *filename = (argc > 6) ? argv[6]: 0; if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh, outfile, fullscreen); + else if(0==strcmp(argv[2], "extract")) extract_detector(datacfg, cfg, weights, cam_index, filename, class, thresh, frame_skip); + else if(0==strcmp(argv[2], "censor")) censor_detector(datacfg, cfg, weights, cam_index, filename, class, thresh, frame_skip); else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear); else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile); else if(0==strcmp(argv[2], "valid2")) validate_detector_flip(datacfg, cfg, weights, outfile); diff --git a/examples/lsd.c b/examples/lsd.c index c5977483..ad337644 100644 --- a/examples/lsd.c +++ b/examples/lsd.c @@ -383,7 +383,31 @@ void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear } */ -void test_dcgan(char *cfgfile, char *weightfile) +void slerp(float *start, float *end, float s, int n, float *out) +{ + float omega = acos(dot_cpu(n, start, 1, end, 1)); + float so = sin(omega); + fill_cpu(n, 0, out, 1); + axpy_cpu(n, sin((1-s)*omega)/so, start, 1, out, 1); + axpy_cpu(n, sin(s*omega)/so, end, 1, out, 1); + + float mag = mag_array(out, n); + scale_array(out, n, 1./mag); +} + +image random_unit_vector_image(int w, int h, int c) +{ + image im = make_image(w, h, c); + int i; + for(i = 0; i < im.w*im.h*im.c; ++i){ + im.data[i] = rand_normal(); + } + float mag = mag_array(im.data, im.w*im.h*im.c); + scale_array(im.data, im.w*im.h*im.c, 1./mag); + return im; +} + +void inter_dcgan(char *cfgfile, char *weightfile) { network *net = load_network(cfgfile, weightfile, 0); set_batch_network(net, 1); @@ -401,6 +425,62 @@ void test_dcgan(char *cfgfile, char *weightfile) break; } } + image start = random_unit_vector_image(net->w, net->h, net->c); + image end = random_unit_vector_image(net->w, net->h, net->c); + image im = make_image(net->w, net->h, net->c); + image orig = copy_image(start); + + int c = 0; + int count = 0; + int max_count = 15; + while(1){ + ++c; + + if(count == max_count){ + count = 0; + free_image(start); + start = end; + end = random_unit_vector_image(net->w, net->h, net->c); + if(c > 300){ + end = orig; + } + 
if(c>300 + max_count) return; + } + ++count; + + slerp(start.data, end.data, (float)count / max_count, im.w*im.h*im.c, im.data); + + float *X = im.data; + time=clock(); + network_predict(net, X); + image out = get_network_image_layer(net, imlayer); + //yuv_to_rgb(out); + normalize_image(out); + printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); + //char buff[256]; + sprintf(buff, "out%05d", c); + show_image(out, "out"); + save_image(out, "out"); + save_image(out, buff); +#ifdef OPENCV + //cvWaitKey(0); +#endif + + } +} + +void test_dcgan(char *cfgfile, char *weightfile) +{ + network *net = load_network(cfgfile, weightfile, 0); + set_batch_network(net, 1); + srand(2222222); + + clock_t time; + char buff[256]; + char *input = buff; + int i, imlayer = 0; + + imlayer = net->n-1; while(1){ image im = make_image(net->w, net->h, net->c); @@ -408,6 +488,8 @@ void test_dcgan(char *cfgfile, char *weightfile) for(i = 0; i < im.w*im.h*im.c; ++i){ im.data[i] = rand_normal(); } + //float mag = mag_array(im.data, im.w*im.h*im.c); + //scale_array(im.data, im.w*im.h*im.c, 1./mag); float *X = im.data; time=clock(); @@ -426,21 +508,177 @@ void test_dcgan(char *cfgfile, char *weightfile) } } -void dcgan_batch(network gnet, network anet) +void set_network_alpha_beta(network *net, float alpha, float beta) { - //float *input = calloc(x_size, sizeof(float)); + int i; + for(i = 0; i < net->n; ++i){ + if(net->layers[i].type == SHORTCUT){ + net->layers[i].alpha = alpha; + net->layers[i].beta = beta; + } + } } - -void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images) +void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch) +{ +#ifdef GPU + char *backup_directory = "/home/pjreddie/backup/"; + srand(time(0)); + char *base = basecfg(cfg); + char *abase = basecfg(acfg); + printf("%s\n", base); + network *gnet = load_network(cfg, weight, clear); + network *anet = load_network(acfg, aweight, clear); + + int i, j, k; + layer imlayer = gnet->layers[gnet->n-1]; + + printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay); + int imgs = gnet->batch*gnet->subdivisions; + i = *gnet->seen/imgs; + data train, buffer; + + + list *plist = get_paths(train_images); + char **paths = (char **)list_to_array(plist); + + load_args args= get_base_args(anet); + args.paths = paths; + args.n = imgs; + args.m = plist->size; + args.d = &buffer; + args.type = CLASSIFICATION_DATA; + args.threads=16; + args.classes = 1; + char *ls[2] = {"imagenet", "zzzzzzzz"}; + args.labels = ls; + + pthread_t load_thread = load_data_in_thread(args); + clock_t time; + + gnet->train = 1; + anet->train = 1; + + int x_size = gnet->inputs*gnet->batch; + int y_size = gnet->truths*gnet->batch; + float *imerror = cuda_make_array(0, y_size); + + float aloss_avg = -1; + + if (maxbatch == 0) maxbatch = gnet->max_batches; + while (get_current_batch(gnet) < maxbatch) { + { + int cb = get_current_batch(gnet); + float alpha = (float) cb / (maxbatch/2); + if(alpha > 1) alpha = 1; + float beta = 1 - alpha; + printf("%f %f\n", alpha, beta); + set_network_alpha_beta(gnet, alpha, beta); + set_network_alpha_beta(anet, beta, alpha); + } + + i += 1; + time=clock(); + pthread_join(load_thread, 0); + train = buffer; + + load_thread = load_data_in_thread(args); + + printf("Loaded: %lf seconds\n", sec(clock()-time)); + + data gen = copy_data(train); + for (j = 0; j < imgs; ++j) { + train.y.vals[j][0] 
= 1; + gen.y.vals[j][0] = 0; + } + time=clock(); + + for (j = 0; j < gnet->subdivisions; ++j) { + get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0); + int z; + for(z = 0; z < x_size; ++z){ + gnet->input[z] = rand_normal(); + } + /* + for(z = 0; z < gnet->batch; ++z){ + float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs); + scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag); + } + */ + *gnet->seen += gnet->batch; + forward_network(gnet); + + fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1); + fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1); + copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1); + anet->delta_gpu = imerror; + forward_network(anet); + backward_network(anet); + + float genaloss = *anet->cost / anet->batch; + + scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1); + scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1); + + axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1); + + backward_network(gnet); + + for(k = 0; k < gnet->batch; ++k){ + int index = j*gnet->batch + k; + copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1); + } + } + harmless_update_network_gpu(anet); + + data merge = concat_data(train, gen); + float aloss = train_network(anet, merge); + +#ifdef OPENCV + if(display){ + image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]); + image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]); + show_image(im, "gen"); + show_image(im2, "train"); + save_image(im, "gen"); + save_image(im2, "train"); + cvWaitKey(1); + } +#endif + + update_network_gpu(gnet); + + free_data(merge); + free_data(train); + free_data(gen); + if (aloss_avg < 0) aloss_avg = aloss; + aloss_avg = aloss_avg*.9 + aloss*.1; + + printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs); + if(i%10000==0){ + char buff[256]; + sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); + save_weights(gnet, buff); + sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i); + save_weights(anet, buff); + } + if(i%1000==0){ + char buff[256]; + sprintf(buff, "%s/%s.backup", backup_directory, base); + save_weights(gnet, buff); + sprintf(buff, "%s/%s.backup", backup_directory, abase); + save_weights(anet, buff); + } + } + char buff[256]; + sprintf(buff, "%s/%s_final.weights", backup_directory, base); + save_weights(gnet, buff); +#endif +} + +void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch) { #ifdef GPU - //char *train_images = "/home/pjreddie/data/coco/train1.txt"; - //char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; - //char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list"; - //char *train_images = "data/64.txt"; - //char *train_images = "data/alp.txt"; - //char *train_images = "data/cifar.txt"; char *backup_directory = "/home/pjreddie/backup/"; srand(time(0)); char *base = basecfg(cfg); @@ -450,7 +688,6 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, network *anet = load_network(acfg, aweight, clear); //float orig_rate = anet->learning_rate; - int start = 0; int i, j, k; layer imlayer = {0}; for (i = 0; i < gnet->n; ++i) { @@ -497,8 +734,8 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, //data generated = copy_data(train); - while 
(get_current_batch(gnet) < gnet->max_batches) { - start += 1; + if (maxbatch == 0) maxbatch = gnet->max_batches; + while (get_current_batch(gnet) < maxbatch) { i += 1; time=clock(); pthread_join(load_thread, 0); @@ -513,8 +750,8 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, data gen = copy_data(train); for (j = 0; j < imgs; ++j) { - train.y.vals[j][0] = .95; - gen.y.vals[j][0] = .05; + train.y.vals[j][0] = 1; + gen.y.vals[j][0] = 0; } time=clock(); @@ -524,31 +761,50 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, for(z = 0; z < x_size; ++z){ gnet->input[z] = rand_normal(); } + for(z = 0; z < gnet->batch; ++z){ + float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs); + scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag); + } + /* + for(z = 0; z < 100; ++z){ + printf("%f, ", gnet->input[z]); + } + printf("\n"); + printf("input: %f %f\n", mean_array(gnet->input, x_size), variance_array(gnet->input, x_size)); + */ - cuda_push_array(gnet->input_gpu, gnet->input, x_size); - cuda_push_array(gnet->truth_gpu, gnet->truth, y_size); + //cuda_push_array(gnet->input_gpu, gnet->input, x_size); + //cuda_push_array(gnet->truth_gpu, gnet->truth, y_size); *gnet->seen += gnet->batch; - forward_network_gpu(gnet); + forward_network(gnet); fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1); - fill_gpu(anet->truths*anet->batch, .95, anet->truth_gpu, 1); - copy_gpu(anet->inputs*anet->batch, imlayer.output_gpu, 1, anet->input_gpu, 1); + fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1); + copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1); anet->delta_gpu = imerror; - forward_network_gpu(anet); - backward_network_gpu(anet); + forward_network(anet); + backward_network(anet); float genaloss = *anet->cost / anet->batch; - printf("%f\n", genaloss); + //printf("%f\n", genaloss); scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1); - scal_gpu(imlayer.outputs*imlayer.batch, .00, gnet->layers[gnet->n-1].delta_gpu, 1); + scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1); - printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch)); - printf("features %f\n", cuda_mag_array(gnet->layers[gnet->n-1].delta_gpu, imlayer.outputs*imlayer.batch)); + //printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch)); + //printf("features %f\n", cuda_mag_array(gnet->layers[gnet->n-1].delta_gpu, imlayer.outputs*imlayer.batch)); axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1); - backward_network_gpu(gnet); + backward_network(gnet); + + /* + for(k = 0; k < gnet->n; ++k){ + layer l = gnet->layers[k]; + cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch); + printf("%d: %f %f\n", k, mean_array(l.output, l.outputs*l.batch), variance_array(l.output, l.outputs*l.batch)); + } + */ for(k = 0; k < gnet->batch; ++k){ int index = j*gnet->batch + k; @@ -565,23 +821,25 @@ void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, //scale_image(im, .5); //translate_image(im2, 1); //scale_image(im2, .5); - #ifdef OPENCV +#ifdef OPENCV if(display){ image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]); image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]); show_image(im, "gen"); show_image(im2, "train"); - cvWaitKey(50); + save_image(im, "gen"); + save_image(im2, "train"); + cvWaitKey(1); } - #endif +#endif -/* - if(aloss < .1){ - anet->learning_rate = 0; - } 
else if (aloss > .3){ - anet->learning_rate = orig_rate; - } - */ + /* + if(aloss < .1){ + anet->learning_rate = 0; + } else if (aloss > .3){ + anet->learning_rate = orig_rate; + } + */ update_network_gpu(gnet); @@ -747,15 +1005,15 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle update_network_gpu(net); - #ifdef OPENCV +#ifdef OPENCV if(display){ image im = float_to_image(anet->w, anet->h, anet->c, gray.X.vals[0]); image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]); show_image(im, "gen"); show_image(im2, "train"); - cvWaitKey(50); + cvWaitKey(1); } - #endif +#endif free_data(merge); free_data(train); free_data(gray); @@ -786,259 +1044,259 @@ void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int cle } /* -void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfile, int clear) -{ + void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfile, int clear) + { #ifdef GPU - char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; - char *backup_directory = "/home/pjreddie/backup/"; - srand(time(0)); - char *base = basecfg(cfgfile); - printf("%s\n", base); - network net = parse_network_cfg(cfgfile); - if(weightfile){ - load_weights(&net, weightfile); - } - if(clear) *net->seen = 0; +char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; +char *backup_directory = "/home/pjreddie/backup/"; +srand(time(0)); +char *base = basecfg(cfgfile); +printf("%s\n", base); +network net = parse_network_cfg(cfgfile); +if(weightfile){ +load_weights(&net, weightfile); +} +if(clear) *net->seen = 0; - char *abase = basecfg(acfgfile); - network anet = parse_network_cfg(acfgfile); - if(aweightfile){ - load_weights(&anet, aweightfile); - } - if(clear) *anet->seen = 0; +char *abase = basecfg(acfgfile); +network anet = parse_network_cfg(acfgfile); +if(aweightfile){ +load_weights(&anet, aweightfile); +} +if(clear) *anet->seen = 0; - int i, j, k; - layer imlayer = {0}; - for (i = 0; i < net->n; ++i) { - if (net->layers[i].out_c == 3) { - imlayer = net->layers[i]; - break; +int i, j, k; +layer imlayer = {0}; +for (i = 0; i < net->n; ++i) { +if (net->layers[i].out_c == 3) { +imlayer = net->layers[i]; +break; +} +} + +printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay); +int imgs = net->batch*net->subdivisions; +i = *net->seen/imgs; +data train, buffer; + + +list *plist = get_paths(train_images); +//int N = plist->size; +char **paths = (char **)list_to_array(plist); + +load_args args = {0}; +args.w = net->w; +args.h = net->h; +args.paths = paths; +args.n = imgs; +args.m = plist->size; +args.d = &buffer; + +args.min = net->min_crop; +args.max = net->max_crop; +args.angle = net->angle; +args.aspect = net->aspect; +args.exposure = net->exposure; +args.saturation = net->saturation; +args.hue = net->hue; +args.size = net->w; +args.type = CLASSIFICATION_DATA; +args.classes = 1; +char *ls[1] = {"coco"}; +args.labels = ls; + +pthread_t load_thread = load_data_in_thread(args); +clock_t time; + +network_state gstate = {0}; +gstate.index = 0; +gstate.net = net; +int x_size = get_network_input_size(net)*net->batch; +int y_size = 1*net->batch; +gstate.input = cuda_make_array(0, x_size); +gstate.truth = 0; +gstate.delta = 0; +gstate.train = 1; +float *X = calloc(x_size, sizeof(float)); +float *y = calloc(y_size, sizeof(float)); + +network_state astate = {0}; +astate.index = 0; +astate.net = anet; +int ay_size = 
get_network_output_size(anet)*anet->batch; +astate.input = 0; +astate.truth = 0; +astate.delta = 0; +astate.train = 1; + +float *imerror = cuda_make_array(0, imlayer.outputs); +float *ones_gpu = cuda_make_array(0, ay_size); +fill_gpu(ay_size, 1, ones_gpu, 1); + +float aloss_avg = -1; +float gloss_avg = -1; + +//data generated = copy_data(train); + +while (get_current_batch(net) < net->max_batches) { + i += 1; + time=clock(); + pthread_join(load_thread, 0); + train = buffer; + load_thread = load_data_in_thread(args); + + printf("Loaded: %lf seconds\n", sec(clock()-time)); + + data generated = copy_data(train); + time=clock(); + float gloss = 0; + + for(j = 0; j < net->subdivisions; ++j){ + get_next_batch(train, net->batch, j*net->batch, X, y); + cuda_push_array(gstate.input, X, x_size); + *net->seen += net->batch; + forward_network_gpu(net, gstate); + + fill_gpu(imlayer.outputs, 0, imerror, 1); + astate.input = imlayer.output_gpu; + astate.delta = imerror; + astate.truth = ones_gpu; + forward_network_gpu(anet, astate); + backward_network_gpu(anet, astate); + + scal_gpu(imlayer.outputs, 1, imerror, 1); + axpy_gpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1); + + backward_network_gpu(net, gstate); + + printf("features %f\n", cuda_mag_array(imlayer.delta_gpu, imlayer.outputs)); + printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs)); + + gloss += get_network_cost(net) /(net->subdivisions*net->batch); + + cuda_pull_array(imlayer.output_gpu, imlayer.output, imlayer.outputs*imlayer.batch); + for(k = 0; k < net->batch; ++k){ + int index = j*net->batch + k; + copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1); + generated.y.vals[index][0] = 0; } } + harmless_update_network_gpu(anet); - printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay); - int imgs = net->batch*net->subdivisions; - i = *net->seen/imgs; - data train, buffer; + data merge = concat_data(train, generated); + randomize_data(merge); + float aloss = train_network(anet, merge); + update_network_gpu(net); + update_network_gpu(anet); + free_data(merge); + free_data(train); + free_data(generated); + if (aloss_avg < 0) aloss_avg = aloss; + aloss_avg = aloss_avg*.9 + aloss*.1; + gloss_avg = gloss_avg*.9 + gloss*.1; - list *plist = get_paths(train_images); - //int N = plist->size; - char **paths = (char **)list_to_array(plist); - - load_args args = {0}; - args.w = net->w; - args.h = net->h; - args.paths = paths; - args.n = imgs; - args.m = plist->size; - args.d = &buffer; - - args.min = net->min_crop; - args.max = net->max_crop; - args.angle = net->angle; - args.aspect = net->aspect; - args.exposure = net->exposure; - args.saturation = net->saturation; - args.hue = net->hue; - args.size = net->w; - args.type = CLASSIFICATION_DATA; - args.classes = 1; - char *ls[1] = {"coco"}; - args.labels = ls; - - pthread_t load_thread = load_data_in_thread(args); - clock_t time; - - network_state gstate = {0}; - gstate.index = 0; - gstate.net = net; - int x_size = get_network_input_size(net)*net->batch; - int y_size = 1*net->batch; - gstate.input = cuda_make_array(0, x_size); - gstate.truth = 0; - gstate.delta = 0; - gstate.train = 1; - float *X = calloc(x_size, sizeof(float)); - float *y = calloc(y_size, sizeof(float)); - - network_state astate = {0}; - astate.index = 0; - astate.net = anet; - int ay_size = get_network_output_size(anet)*anet->batch; - astate.input = 0; - astate.truth = 0; - astate.delta = 0; - astate.train = 1; - - float *imerror 
= cuda_make_array(0, imlayer.outputs); - float *ones_gpu = cuda_make_array(0, ay_size); - fill_gpu(ay_size, 1, ones_gpu, 1); - - float aloss_avg = -1; - float gloss_avg = -1; - - //data generated = copy_data(train); - - while (get_current_batch(net) < net->max_batches) { - i += 1; - time=clock(); - pthread_join(load_thread, 0); - train = buffer; - load_thread = load_data_in_thread(args); - - printf("Loaded: %lf seconds\n", sec(clock()-time)); - - data generated = copy_data(train); - time=clock(); - float gloss = 0; - - for(j = 0; j < net->subdivisions; ++j){ - get_next_batch(train, net->batch, j*net->batch, X, y); - cuda_push_array(gstate.input, X, x_size); - *net->seen += net->batch; - forward_network_gpu(net, gstate); - - fill_gpu(imlayer.outputs, 0, imerror, 1); - astate.input = imlayer.output_gpu; - astate.delta = imerror; - astate.truth = ones_gpu; - forward_network_gpu(anet, astate); - backward_network_gpu(anet, astate); - - scal_gpu(imlayer.outputs, 1, imerror, 1); - axpy_gpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1); - - backward_network_gpu(net, gstate); - - printf("features %f\n", cuda_mag_array(imlayer.delta_gpu, imlayer.outputs)); - printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs)); - - gloss += get_network_cost(net) /(net->subdivisions*net->batch); - - cuda_pull_array(imlayer.output_gpu, imlayer.output, imlayer.outputs*imlayer.batch); - for(k = 0; k < net->batch; ++k){ - int index = j*net->batch + k; - copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1); - generated.y.vals[index][0] = 0; - } - } - harmless_update_network_gpu(anet); - - data merge = concat_data(train, generated); - randomize_data(merge); - float aloss = train_network(anet, merge); - - update_network_gpu(net); - update_network_gpu(anet); - free_data(merge); - free_data(train); - free_data(generated); - if (aloss_avg < 0) aloss_avg = aloss; - aloss_avg = aloss_avg*.9 + aloss*.1; - gloss_avg = gloss_avg*.9 + gloss*.1; - - printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs); - if(i%1000==0){ - char buff[256]; - sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); - save_weights(net, buff); - sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i); - save_weights(anet, buff); - } - if(i%100==0){ - char buff[256]; - sprintf(buff, "%s/%s.backup", backup_directory, base); - save_weights(net, buff); - sprintf(buff, "%s/%s.backup", backup_directory, abase); - save_weights(anet, buff); - } + printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs); + if(i%1000==0){ + char buff[256]; + sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); + save_weights(net, buff); + sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i); + save_weights(anet, buff); } - char buff[256]; - sprintf(buff, "%s/%s_final.weights", backup_directory, base); - save_weights(net, buff); + if(i%100==0){ + char buff[256]; + sprintf(buff, "%s/%s.backup", backup_directory, base); + save_weights(net, buff); + sprintf(buff, "%s/%s.backup", backup_directory, abase); + save_weights(anet, buff); + } +} +char buff[256]; +sprintf(buff, "%s/%s_final.weights", backup_directory, base); +save_weights(net, buff); #endif } */ /* -void train_lsd(char *cfgfile, char *weightfile, int clear) -{ - char *train_images = 
"/home/pjreddie/data/coco/trainvalno5k.txt"; - char *backup_directory = "/home/pjreddie/backup/"; - srand(time(0)); - char *base = basecfg(cfgfile); - printf("%s\n", base); - float avg_loss = -1; - network net = parse_network_cfg(cfgfile); - if(weightfile){ - load_weights(&net, weightfile); - } - if(clear) *net->seen = 0; - printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay); - int imgs = net->batch*net->subdivisions; - int i = *net->seen/imgs; - data train, buffer; + void train_lsd(char *cfgfile, char *weightfile, int clear) + { + char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; + char *backup_directory = "/home/pjreddie/backup/"; + srand(time(0)); + char *base = basecfg(cfgfile); + printf("%s\n", base); + float avg_loss = -1; + network net = parse_network_cfg(cfgfile); + if(weightfile){ + load_weights(&net, weightfile); + } + if(clear) *net->seen = 0; + printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay); + int imgs = net->batch*net->subdivisions; + int i = *net->seen/imgs; + data train, buffer; - list *plist = get_paths(train_images); - //int N = plist->size; - char **paths = (char **)list_to_array(plist); + list *plist = get_paths(train_images); +//int N = plist->size; +char **paths = (char **)list_to_array(plist); - load_args args = {0}; - args.w = net->w; - args.h = net->h; - args.paths = paths; - args.n = imgs; - args.m = plist->size; - args.d = &buffer; +load_args args = {0}; +args.w = net->w; +args.h = net->h; +args.paths = paths; +args.n = imgs; +args.m = plist->size; +args.d = &buffer; - args.min = net->min_crop; - args.max = net->max_crop; - args.angle = net->angle; - args.aspect = net->aspect; - args.exposure = net->exposure; - args.saturation = net->saturation; - args.hue = net->hue; - args.size = net->w; - args.type = CLASSIFICATION_DATA; - args.classes = 1; - char *ls[1] = {"coco"}; - args.labels = ls; +args.min = net->min_crop; +args.max = net->max_crop; +args.angle = net->angle; +args.aspect = net->aspect; +args.exposure = net->exposure; +args.saturation = net->saturation; +args.hue = net->hue; +args.size = net->w; +args.type = CLASSIFICATION_DATA; +args.classes = 1; +char *ls[1] = {"coco"}; +args.labels = ls; - pthread_t load_thread = load_data_in_thread(args); - clock_t time; - //while(i*imgs < N*120){ - while(get_current_batch(net) < net->max_batches){ - i += 1; - time=clock(); - pthread_join(load_thread, 0); - train = buffer; - load_thread = load_data_in_thread(args); +pthread_t load_thread = load_data_in_thread(args); +clock_t time; +//while(i*imgs < N*120){ +while(get_current_batch(net) < net->max_batches){ +i += 1; +time=clock(); +pthread_join(load_thread, 0); +train = buffer; +load_thread = load_data_in_thread(args); - printf("Loaded: %lf seconds\n", sec(clock()-time)); +printf("Loaded: %lf seconds\n", sec(clock()-time)); - time=clock(); - float loss = train_network(net, train); - if (avg_loss < 0) avg_loss = loss; - avg_loss = avg_loss*.9 + loss*.1; +time=clock(); +float loss = train_network(net, train); +if (avg_loss < 0) avg_loss = loss; +avg_loss = avg_loss*.9 + loss*.1; - printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs); - if(i%1000==0){ - char buff[256]; - sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); - save_weights(net, buff); - } - if(i%100==0){ - char buff[256]; - sprintf(buff, "%s/%s.backup", backup_directory, base); - save_weights(net, buff); - 
} - free_data(train); - } - char buff[256]; - sprintf(buff, "%s/%s_final.weights", backup_directory, base); - save_weights(net, buff); +printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs); +if(i%1000==0){ +char buff[256]; +sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); +save_weights(net, buff); +} +if(i%100==0){ +char buff[256]; +sprintf(buff, "%s/%s.backup", backup_directory, base); +save_weights(net, buff); +} +free_data(train); +} +char buff[256]; +sprintf(buff, "%s/%s_final.weights", backup_directory, base); +save_weights(net, buff); } */ @@ -1107,6 +1365,7 @@ void run_lsd(int argc, char **argv) int clear = find_arg(argc, argv, "-clear"); int display = find_arg(argc, argv, "-display"); + int batches = find_int_arg(argc, argv, "-b", 0); char *file = find_char_arg(argc, argv, "-file", "/home/pjreddie/data/imagenet/imagenet1k.train.list"); char *cfg = argv[3]; @@ -1118,9 +1377,11 @@ void run_lsd(int argc, char **argv) //else if(0==strcmp(argv[2], "train2")) train_lsd2(cfg, weights, acfg, aweights, clear); //else if(0==strcmp(argv[2], "traincolor")) train_colorizer(cfg, weights, acfg, aweights, clear); //else if(0==strcmp(argv[2], "train3")) train_lsd3(argv[3], argv[4], argv[5], argv[6], argv[7], argv[8], clear); - if(0==strcmp(argv[2], "traingan")) train_dcgan(cfg, weights, acfg, aweights, clear, display, file); + if(0==strcmp(argv[2], "traingan")) train_dcgan(cfg, weights, acfg, aweights, clear, display, file, batches); + else if(0==strcmp(argv[2], "trainprog")) train_prog(cfg, weights, acfg, aweights, clear, display, file, batches); else if(0==strcmp(argv[2], "traincolor")) train_colorizer(cfg, weights, acfg, aweights, clear, display); else if(0==strcmp(argv[2], "gan")) test_dcgan(cfg, weights); + else if(0==strcmp(argv[2], "inter")) inter_dcgan(cfg, weights); else if(0==strcmp(argv[2], "test")) test_lsd(cfg, weights, filename, 0); else if(0==strcmp(argv[2], "color")) test_lsd(cfg, weights, filename, 1); /* diff --git a/examples/nightmare.c b/examples/nightmare.c index 71d38334..8ec6e966 100644 --- a/examples/nightmare.c +++ b/examples/nightmare.c @@ -83,6 +83,10 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa */ //rate = rate / abs_mean(out.data, out.w*out.h*out.c); + image gray = make_image(out.w, out.h, out.c); + fill_image(gray, .5); + axpy_cpu(orig.w*orig.h*orig.c, -1, orig.data, 1, gray.data, 1); + axpy_cpu(orig.w*orig.h*orig.c, .1, gray.data, 1, out.data, 1); if(norm) normalize_array(out.data, out.w*out.h*out.c); axpy_cpu(orig.w*orig.h*orig.c, rate, out.data, 1, orig.data, 1); diff --git a/examples/regressor.c b/examples/regressor.c index 00c55aae..60a9f2b9 100644 --- a/examples/regressor.c +++ b/examples/regressor.c @@ -32,6 +32,7 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus, char *backup_directory = option_find_str(options, "backup", "/backup/"); char *train_list = option_find_str(options, "train", "data/train.list"); + int classes = option_find_int(options, "classes", 1); list *plist = get_paths(train_list); char **paths = (char **)list_to_array(plist); @@ -43,9 +44,10 @@ void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus, args.w = net->w; args.h = net->h; args.threads = 32; + args.classes = classes; - args.min = net->min_crop; - args.max = net->max_crop; + args.min = net->min_ratio*net->w; + args.max = net->max_ratio*net->w; args.angle = net->angle; args.aspect = net->aspect; 
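A note on the optimize_picture() change in examples/nightmare.c above: the added fill_image/axpy_cpu calls mix a small pull toward mid-gray into the gradient before it is applied to the image. The sketch below is standalone C rather than darknet source; it only assumes axpy_cpu has the usual saxpy meaning, y[i] += a*x[i], which matches the definition in src/blas.c.

    /* Standalone sketch of the gray-blend step (not darknet source). */
    #include <stdio.h>

    static void axpy_cpu(int n, float a, float *x, int incx, float *y, int incy)
    {
        int i;
        for(i = 0; i < n; ++i) y[i*incy] += a * x[i*incx];
    }

    int main(void)
    {
        float orig[4] = {0.2f, 0.9f, 0.5f, 0.0f};  /* current image pixels   */
        float out[4]  = {0.3f, -0.1f, 0.0f, 0.6f}; /* gradient w.r.t. pixels */
        float gray[4] = {0.5f, 0.5f, 0.5f, 0.5f};  /* fill_image(gray, .5)   */
        int i;

        axpy_cpu(4, -1, orig, 1, gray, 1); /* gray becomes 0.5 - orig   */
        axpy_cpu(4, .1, gray, 1, out, 1);  /* out += 0.1 * (0.5 - orig) */

        /* The gradient now carries a 10% pull toward mid-gray, which keeps
         * pixel values from drifting to extremes over many iterations. */
        for(i = 0; i < 4; ++i) printf("%f\n", out[i]);
        return 0;
    }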
args.exposure = net->exposure; @@ -160,6 +162,10 @@ void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_inde }else{ cap = cvCaptureFromCAM(cam_index); } + list *options = read_data_cfg(datacfg); + int classes = option_find_int(options, "classes", 1); + char *name_list = option_find_str(options, "names", 0); + char **names = get_labels(name_list); if(!cap) error("Couldn't connect to webcam.\n"); cvNamedWindow("Regressor", CV_WINDOW_NORMAL); @@ -171,19 +177,23 @@ void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_inde gettimeofday(&tval_before, NULL); image in = get_image_from_stream(cap); - image in_s = letterbox_image(in, net->w, net->h); - show_image(in, "Regressor"); + image crop = center_crop_image(in, net->w, net->h); + grayscale_image_3c(crop); + show_image(crop, "Regressor"); - float *predictions = network_predict(net, in_s.data); + float *predictions = network_predict(net, crop.data); printf("\033[2J"); printf("\033[1;1H"); printf("\nFPS:%.0f\n",fps); - printf("People: %f\n", predictions[0]); + int i; + for(i = 0; i < classes; ++i){ + printf("%s: %f\n", names[i], predictions[i]); + } - free_image(in_s); free_image(in); + free_image(crop); cvWaitKey(10); diff --git a/examples/super.c b/examples/super.c index 506b065e..79799d05 100644 --- a/examples/super.c +++ b/examples/super.c @@ -93,6 +93,8 @@ void test_super(char *cfgfile, char *weightfile, char *filename) image out = get_network_image(net); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); save_image(out, "out"); + show_image(out, "out"); + cvWaitKey(0); free_image(im); if (filename) break; diff --git a/examples/yolo.c b/examples/yolo.c index 9174b401..af4b8b50 100644 --- a/examples/yolo.c +++ b/examples/yolo.c @@ -74,14 +74,14 @@ void train_yolo(char *cfgfile, char *weightfile) save_weights(net, buff); } -void print_yolo_detections(FILE **fps, char *id, box *boxes, float **probs, int total, int classes, int w, int h) +void print_yolo_detections(FILE **fps, char *id, int total, int classes, int w, int h, detection *dets) { int i, j; for(i = 0; i < total; ++i){ - float xmin = boxes[i].x - boxes[i].w/2.; - float xmax = boxes[i].x + boxes[i].w/2.; - float ymin = boxes[i].y - boxes[i].h/2.; - float ymax = boxes[i].y + boxes[i].h/2.; + float xmin = dets[i].bbox.x - dets[i].bbox.w/2.; + float xmax = dets[i].bbox.x + dets[i].bbox.w/2.; + float ymin = dets[i].bbox.y - dets[i].bbox.h/2.; + float ymax = dets[i].bbox.y + dets[i].bbox.h/2.; if (xmin < 0) xmin = 0; if (ymin < 0) ymin = 0; @@ -89,7 +89,7 @@ void print_yolo_detections(FILE **fps, char *id, box *boxes, float **probs, int if (ymax > h) ymax = h; for(j = 0; j < classes; ++j){ - if (probs[i][j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, probs[i][j], + if (dets[i].prob[j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, dets[i].prob[j], xmin, ymin, xmax, ymax); } } @@ -118,9 +118,6 @@ void validate_yolo(char *cfg, char *weights) snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]); fps[j] = fopen(buff, "w"); } - box *boxes = calloc(l.side*l.side*l.n, sizeof(box)); - float **probs = calloc(l.side*l.side*l.n, sizeof(float *)); - for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *)); int m = plist->size; int i=0; @@ -136,6 +133,7 @@ void validate_yolo(char *cfg, char *weights) image *buf = calloc(nthreads, sizeof(image)); image *buf_resized = calloc(nthreads, sizeof(image)); pthread_t *thr = calloc(nthreads, sizeof(pthread_t)); + detection *dets = make_network_boxes(net); load_args args = {0}; 
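The validate_yolo, validate_yolo_recall, and test_yolo hunks above move from the parallel boxes/probs arrays to the new detection struct. As a rough guide to the refactored flow, here is a single-image sketch written against the declarations this patch adds to include/darknet.h (make_network_boxes, fill_network_boxes, do_nms_sort, free_detections). The loader name load_network is inferred from the Python binding later in the patch, and the fill_network_boxes arguments mirror the recall path; treat this as hedged pseudocode, not a drop-in tool.

    /* Hedged sketch of the detection-based API introduced in this patch. */
    #include <stdio.h>
    #include "darknet.h"

    void detect_one(char *cfg, char *weights, char *path, float thresh, float nms)
    {
        network *net = load_network(cfg, weights, 0);
        set_batch_network(net, 1);

        image im = load_image_color(path, 0, 0);
        image sized = letterbox_image(im, net->w, net->h);
        network_predict(net, sized.data);

        int n = num_boxes(net);                    /* number of detection slots */
        detection *dets = make_network_boxes(net); /* allocated once, reusable  */
        fill_network_boxes(net, im.w, im.h, thresh, 0, 0, 1, dets);

        layer l = net->layers[net->n - 1];
        if(nms) do_nms_sort(dets, n, l.classes, nms);

        int i, j;
        for(i = 0; i < n; ++i){
            for(j = 0; j < l.classes; ++j){
                if(dets[i].prob[j] > thresh){
                    box b = dets[i].bbox;
                    printf("class %d: %.2f at (%.2f, %.2f, %.2f, %.2f)\n",
                           j, dets[i].prob[j], b.x, b.y, b.w, b.h);
                }
            }
        }

        free_detections(dets, n);
        free_image(sized);
        free_image(im);
        free_network(net);
    }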
args.w = net->w; @@ -169,9 +167,9 @@ void validate_yolo(char *cfg, char *weights) network_predict(net, X); int w = val[t].w; int h = val[t].h; - get_detection_boxes(l, w, h, thresh, probs, boxes, 0); - if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, classes, iou_thresh); - print_yolo_detections(fps, id, boxes, probs, l.side*l.side*l.n, classes, w, h); + fill_network_boxes(net, w, h, thresh, 0, 0, 0, dets); + if (nms) do_nms_sort(dets, l.side*l.side*l.n, classes, iou_thresh); + print_yolo_detections(fps, id, l.side*l.side*l.n, classes, w, h, dets); free(id); free_image(val[t]); free_image(val_resized[t]); @@ -202,9 +200,7 @@ void validate_yolo_recall(char *cfg, char *weights) snprintf(buff, 1024, "%s%s.txt", base, voc_names[j]); fps[j] = fopen(buff, "w"); } - box *boxes = calloc(side*side*l.n, sizeof(box)); - float **probs = calloc(side*side*l.n, sizeof(float *)); - for(j = 0; j < side*side*l.n; ++j) probs[j] = calloc(classes, sizeof(float *)); + detection *dets = make_network_boxes(net); int m = plist->size; int i=0; @@ -224,8 +220,9 @@ void validate_yolo_recall(char *cfg, char *weights) image sized = resize_image(orig, net->w, net->h); char *id = basecfg(path); network_predict(net, sized.data); - get_detection_boxes(l, orig.w, orig.h, thresh, probs, boxes, 1); - if (nms) do_nms(boxes, probs, side*side*l.n, 1, nms); + + fill_network_boxes(net, orig.w, orig.h, thresh, 0, 0, 1, dets); + if (nms) do_nms_obj(dets, side*side*l.n, 1, nms); char labelpath[4096]; find_replace(path, "images", "labels", labelpath); @@ -236,7 +233,7 @@ void validate_yolo_recall(char *cfg, char *weights) int num_labels = 0; box_label *truth = read_boxes(labelpath, &num_labels); for(k = 0; k < side*side*l.n; ++k){ - if(probs[k][0] > thresh){ + if(dets[k].objectness > thresh){ ++proposals; } } @@ -245,8 +242,8 @@ void validate_yolo_recall(char *cfg, char *weights) box t = {truth[j].x, truth[j].y, truth[j].w, truth[j].h}; float best_iou = 0; for(k = 0; k < side*side*l.n; ++k){ - float iou = box_iou(boxes[k], t); - if(probs[k][0] > thresh && iou > best_iou){ + float iou = box_iou(dets[k].bbox, t); + if(dets[k].objectness > thresh && iou > best_iou){ best_iou = iou; } } @@ -273,11 +270,8 @@ void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh) clock_t time; char buff[256]; char *input = buff; - int j; float nms=.4; - box *boxes = calloc(l.side*l.side*l.n, sizeof(box)); - float **probs = calloc(l.side*l.side*l.n, sizeof(float *)); - for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); + detection *dets = make_network_boxes(net); while(1){ if(filename){ strncpy(input, filename, 256); @@ -294,9 +288,11 @@ void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh) time=clock(); network_predict(net, X); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); - get_detection_boxes(l, 1, 1, thresh, probs, boxes, 0); - if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms); - draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, 0, voc_names, alphabet, 20); + + fill_network_boxes(net, 1, 1, thresh, 0, 0, 0, dets); + if (nms) do_nms_sort(dets, l.side*l.side*l.n, l.classes, nms); + + draw_detections(im, dets, l.side*l.side*l.n, thresh, voc_names, alphabet, 20); save_image(im, "predictions"); show_image(im, "predictions"); diff --git a/include/darknet.h b/include/darknet.h index 5fa2ec17..81a01957 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -51,6 +51,7 @@ typedef struct{ int *group_size; int *group_offset; 
} tree; +tree *read_tree(char *filename); typedef enum{ LOGISTIC, RELU, RELIE, LINEAR, RAMP, TANH, PLSE, LEAKY, ELU, LOGGY, STAIR, HARDTAN, LHTAN @@ -85,11 +86,14 @@ typedef enum { XNOR, REGION, REORG, + UPSAMPLE, + LOGXENT, + L2NORM, BLANK } LAYER_TYPE; typedef enum{ - SSE, MASKED, L1, SEG, SMOOTH + SSE, MASKED, L1, SEG, SMOOTH,WGAN } COST_TYPE; typedef struct{ @@ -160,18 +164,20 @@ struct layer{ float shift; float ratio; float learning_rate_scale; + float clip; int softmax; int classes; int coords; int background; int rescore; int objectness; - int does_cost; int joint; int noadjust; int reorg; int log; int tanh; + int *mask; + int total; float alpha; float beta; @@ -184,13 +190,17 @@ struct layer{ float class_scale; int bias_match; int random; + float ignore_thresh; + float truth_thresh; float thresh; + float focus; int classfix; int absolute; int onlyforward; int stopbackward; int dontload; + int dontsave; int dontloadscales; float temperature; @@ -228,6 +238,7 @@ struct layer{ float * delta; float * output; + float * loss; float * squared; float * norms; @@ -389,6 +400,7 @@ struct layer{ float * scale_change_gpu; float * output_gpu; + float * loss_gpu; float * delta_gpu; float * rand_gpu; float * squared_gpu; @@ -470,6 +482,7 @@ typedef struct network{ int train; int index; float *cost; + float clip; #ifdef GPU float *input_gpu; @@ -501,6 +514,15 @@ typedef struct{ float x, y, w, h; } box; +typedef struct detection{ + box bbox; + int classes; + float *prob; + float *mask; + float objectness; + int sort_class; +} detection; + typedef struct matrix{ int rows, cols; float **vals; @@ -590,9 +612,11 @@ void backward_network(network *net); void update_network(network *net); +float dot_cpu(int N, float *X, int INCX, float *Y, int INCY); void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); void copy_cpu(int N, float *X, int INCX, float *Y, int INCY); void scal_cpu(int N, float ALPHA, float *X, int INCX); +void fill_cpu(int N, float ALPHA, float * X, int INCX); void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial); void softmax(float *input, int n, float temp, int stride, float *output); @@ -618,6 +642,8 @@ float train_networks(network **nets, int n, data d, int interval); void sync_nets(network **nets, int n, int interval); void harmless_update_network_gpu(network *net); #endif +image get_label(image **characters, char *string, int size); +void draw_label(image a, int r, int c, image label, const float *rgb); void save_image_png(image im, const char *name); void get_next_batch(data d, int n, int offset, float *X, float *y); void grayscale_image_3c(image im); @@ -644,7 +670,7 @@ void rgbgr_weights(layer l); image *get_weights(layer l); void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, int avg, float hier_thresh, int w, int h, int fps, int fullscreen); -void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness); +void get_detection_detections(layer l, int w, int h, float thresh, detection *dets); char *option_find_str(list *l, char *key, char *def); int option_find_int(list *l, char *key, int def); @@ -656,7 +682,7 @@ void save_weights_upto(network *net, char *filename, int cutoff); void load_weights_upto(network *net, char *filename, int start, int cutoff); void zero_objectness(layer l); -void get_region_boxes(layer l, int w, int h, int netw, int neth, float thresh, float **probs, 
box *boxes, float **masks, int only_objectness, int *map, float tree_thresh, int relative); +void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets); void free_network(network *net); void set_batch_network(network *net, int b); void set_temp_network(network *net, float t); @@ -664,8 +690,10 @@ image load_image(char *filename, int w, int h, int c); image load_image_color(char *filename, int w, int h); image make_image(int w, int h, int c); image resize_image(image im, int w, int h); +void censor_image(image im, int dx, int dy, int w, int h); image letterbox_image(image im, int w, int h); image crop_image(image im, int dx, int dy, int w, int h); +image center_crop_image(image im, int w, int h); image resize_min(image im, int min); image resize_max(image im, int max); image threshold_image(image im, float thresh); @@ -697,11 +725,10 @@ double what_time_is_it_now(); image rotate_image(image m, float rad); void visualize_network(network *net); float box_iou(box a, box b); -void do_nms(box *boxes, float **probs, int total, int classes, float thresh); data load_all_cifar10(); box_label *read_boxes(char *filename, int *n); box float_to_box(float *f, int stride); -void draw_detections(image im, int num, float thresh, box *boxes, float **probs, float **masks, char **names, image **alphabet, int classes); +void draw_detections(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes); matrix network_predict_data(network *net, data test); image **load_alphabet(); @@ -711,15 +738,18 @@ float *network_predict(network *net, float *input); int network_width(network *net); int network_height(network *net); float *network_predict_image(network *net, image im); -void network_detect(network *net, image im, float thresh, float hier_thresh, float nms, box *boxes, float **probs); +void network_detect(network *net, image im, float thresh, float hier_thresh, float nms, detection *dets); int num_boxes(network *net); -box *make_boxes(network *net); +detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative); +void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets); +detection *make_network_boxes(network *net); +void free_detections(detection *dets, int n); void reset_network_state(network *net, int b); char **get_labels(char *filename); -void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh); -void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh); +void do_nms_obj(detection *dets, int total, int classes, float thresh); +void do_nms_sort(detection *dets, int total, int classes, float thresh); matrix make_matrix(int rows, int cols); @@ -758,11 +788,13 @@ void free_list(list *l); float mse_array(float *a, int n); float variance_array(float *a, int n); float mag_array(float *a, int n); +void scale_array(float *a, int n, float s); float mean_array(float *a, int n); float sum_array(float *a, int n); void normalize_array(float *a, int n); int *read_intlist(char *s, int *n, int d); size_t rand_size_t(); float rand_normal(); +float rand_uniform(float min, float max); #endif diff --git a/python/darknet.py b/python/darknet.py index 398a2913..203096b5 100644 --- a/python/darknet.py +++ b/python/darknet.py @@ -23,6 +23,15 @@ class BOX(Structure): ("w", c_float), ("h", c_float)] +class DETECTION(Structure): + _fields_ = [("bbox", BOX), + 
("classes", c_int), + ("prob", POINTER(c_float)), + ("mask", POINTER(c_float)), + ("objectness", c_float), + ("sort_class", c_int)] + + class IMAGE(Structure): _fields_ = [("w", c_int), ("h", c_int), @@ -53,9 +62,16 @@ make_image = lib.make_image make_image.argtypes = [c_int, c_int, c_int] make_image.restype = IMAGE -make_boxes = lib.make_boxes -make_boxes.argtypes = [c_void_p] -make_boxes.restype = POINTER(BOX) +get_network_boxes = lib.get_network_boxes +get_network_boxes.argtypes = [c_void_p, c_int, c_int, c_float, c_float, POINTER(c_int), c_int] +get_network_boxes.restype = POINTER(DETECTION) + +make_network_boxes = lib.make_network_boxes +make_network_boxes.argtypes = [c_void_p] +make_network_boxes.restype = POINTER(DETECTION) + +free_detections = lib.free_detections +free_detections.argtypes = [POINTER(DETECTION), c_int] free_ptrs = lib.free_ptrs free_ptrs.argtypes = [POINTER(c_void_p), c_int] @@ -64,12 +80,8 @@ num_boxes = lib.num_boxes num_boxes.argtypes = [c_void_p] num_boxes.restype = c_int -make_probs = lib.make_probs -make_probs.argtypes = [c_void_p] -make_probs.restype = POINTER(POINTER(c_float)) - -detect = lib.network_predict -detect.argtypes = [c_void_p, IMAGE, c_float, c_float, c_float, POINTER(BOX), POINTER(POINTER(c_float))] +network_predict = lib.network_predict +network_predict.argtypes = [c_void_p, POINTER(c_float)] reset_rnn = lib.reset_rnn reset_rnn.argtypes = [c_void_p] @@ -78,6 +90,12 @@ load_net = lib.load_network load_net.argtypes = [c_char_p, c_char_p, c_int] load_net.restype = c_void_p +do_nms_obj = lib.do_nms_obj +do_nms_obj.argtypes = [POINTER(DETECTION), c_int, c_int, c_float] + +do_nms_sort = lib.do_nms_sort +do_nms_sort.argtypes = [POINTER(DETECTION), c_int, c_int, c_float] + free_image = lib.free_image free_image.argtypes = [IMAGE] @@ -100,21 +118,6 @@ predict_image = lib.network_predict_image predict_image.argtypes = [c_void_p, IMAGE] predict_image.restype = POINTER(c_float) -network_detect = lib.network_detect -network_detect.argtypes = [c_void_p, IMAGE, c_float, c_float, c_float, POINTER(BOX), POINTER(POINTER(c_float))] - -import numpy -def array_to_image(arr): - arr = arr.copy() - arr = arr.transpose(2,0,1) - c = arr.shape[0] - h = arr.shape[1] - w = arr.shape[2] - arr = (arr.astype(numpy.float32)/255.0).flatten() - data = c_array(c_float, arr) - im = IMAGE(w,h,c,data) - return im - def classify(net, meta, im): out = predict_image(net, im) res = [] @@ -124,24 +127,21 @@ def classify(net, meta, im): return res def detect(net, meta, image, thresh=.5, hier_thresh=.5, nms=.45): - if type(image) == numpy.ndarray: - im = array_to_image(image) - else: - im = load_image(image, 0, 0) - boxes = make_boxes(net) - probs = make_probs(net) + im = load_image(image, 0, 0) num = num_boxes(net) - network_detect(net, im, thresh, hier_thresh, nms, boxes, probs) + predict_image(net, im) + dets = get_network_boxes(net, im.w, im.h, thresh, hier_thresh, None, 0) + if (nms): do_nms_obj(dets, num, meta.classes, nms); + res = [] for j in range(num): for i in range(meta.classes): - if probs[j][i] > 0: - res.append((meta.names[i], probs[j][i], (boxes[j].x, boxes[j].y, boxes[j].w, boxes[j].h))) + if dets[j].prob[i] > 0: + b = dets[j].bbox + res.append((meta.names[i], dets[j].prob[i], (b.x, b.y, b.w, b.h))) res = sorted(res, key=lambda x: -x[1]) - - if type(image) != numpy.ndarray: - free_image(im) - free_ptrs(cast(probs, POINTER(c_void_p)), num) + free_image(im) + free_detections(dets, num) return res if __name__ == "__main__": @@ -153,6 +153,4 @@ if __name__ == "__main__": net = 
load_net("cfg/tiny-yolo.cfg", "tiny-yolo.weights", 0) meta = load_meta("cfg/coco.data") r = detect(net, meta, "data/dog.jpg") - print(r) - - + print r diff --git a/src/blas.c b/src/blas.c index d25c1969..a1bb5554 100644 --- a/src/blas.c +++ b/src/blas.c @@ -65,7 +65,7 @@ void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, floa } } -void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out) +void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out) { int stride = w1/w2; int sample = w2/w1; @@ -84,7 +84,7 @@ void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, for(i = 0; i < minw; ++i){ int out_index = i*sample + w2*(j*sample + h2*(k + c2*b)); int add_index = i*stride + w1*(j*stride + h1*(k + c1*b)); - out[out_index] += add[add_index]; + out[out_index] = s1*out[out_index] + s2*add[add_index]; } } } @@ -123,6 +123,27 @@ void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, fl } } +void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial) +{ + int b,f,i; + for(b = 0; b < batch; ++b){ + for(i = 0; i < spatial; ++i){ + float sum = 0; + for(f = 0; f < filters; ++f){ + int index = b*filters*spatial + f*spatial + i; + sum += powf(x[index], 2); + } + sum = sqrtf(sum); + for(f = 0; f < filters; ++f){ + int index = b*filters*spatial + f*spatial + i; + x[index] /= sum; + dx[index] = (1 - x[index]) / sum; + } + } + } +} + + void normalize_cpu(float *x, float *mean, float *variance, int batch, int filters, int spatial) { int b, f, i; @@ -241,6 +262,28 @@ void l1_cpu(int n, float *pred, float *truth, float *delta, float *error) } } +void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error) +{ + int i; + for(i = 0; i < n; ++i){ + float t = truth[i]; + float p = pred[i]; + error[i] = (t) ? 
-log(p) : 0; + delta[i] = t-p; + } +} + +void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error) +{ + int i; + for(i = 0; i < n; ++i){ + float t = truth[i]; + float p = pred[i]; + error[i] = -t*log(p) - (1-t)*log(1-p); + delta[i] = t-p; + } +} + void l2_cpu(int n, float *pred, float *truth, float *delta, float *error) { int i; @@ -288,3 +331,21 @@ void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, i } } +void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out) +{ + int i, j, k, b; + for(b = 0; b < batch; ++b){ + for(k = 0; k < c; ++k){ + for(j = 0; j < h*stride; ++j){ + for(i = 0; i < w*stride; ++i){ + int in_index = b*w*h*c + k*w*h + (j/stride)*w + i/stride; + int out_index = b*w*h*c + k*w*h + j*w + i; + if(forward) out[out_index] = scale*in[in_index]; + else in[in_index] += scale*out[out_index]; + } + } + } + } +} + + diff --git a/src/blas.h b/src/blas.h index a8408f32..707291de 100644 --- a/src/blas.h +++ b/src/blas.h @@ -19,10 +19,8 @@ void constrain_gpu(int N, float ALPHA, float * X, int INCX); void pow_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); void mul_cpu(int N, float *X, int INCX, float *Y, int INCY); -void fill_cpu(int N, float ALPHA, float * X, int INCX); -float dot_cpu(int N, float *X, int INCX, float *Y, int INCY); int test_gpu_blas(); -void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out); +void shortcut_cpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out); void mean_cpu(float *x, int batch, int filters, int spatial, float *mean); void variance_cpu(float *x, float *mean, int batch, int filters, int spatial, float *variance); @@ -32,15 +30,19 @@ void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, void mean_delta_cpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta); void variance_delta_cpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta); void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta); +void l2normalize_cpu(float *x, float *dx, int batch, int filters, int spatial); void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error); void l2_cpu(int n, float *pred, float *truth, float *delta, float *error); void l1_cpu(int n, float *pred, float *truth, float *delta, float *error); +void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error); +void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error); void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c); void weighted_delta_cpu(float *a, float *b, float *s, float *da, float *db, float *ds, int n, float *dc); void softmax(float *input, int n, float temp, int stride, float *output); void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output); +void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out); #ifdef GPU #include "cuda.h" @@ -52,7 +54,7 @@ void copy_gpu(int N, float * X, int INCX, float * Y, int INCY); void copy_gpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); void add_gpu(int N, float ALPHA, float * X, int INCX); 
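A quick numeric check of the new cross-entropy helpers added to src/blas.c above. logistic_x_ent_cpu is copied verbatim; the toy main() and sample values are mine. The point is the sign convention: error[] holds the per-element loss, while delta[] = t - p is the negative gradient of that loss with respect to the pre-sigmoid logit, which is what darknet's backward passes expect to accumulate.

    #include <math.h>
    #include <stdio.h>

    void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
    {
        int i;
        for(i = 0; i < n; ++i){
            float t = truth[i];
            float p = pred[i];
            error[i] = -t*log(p) - (1-t)*log(1-p);
            delta[i] = t-p;
        }
    }

    int main(void)
    {
        float pred[3]  = {0.9f, 0.2f, 0.5f};   /* sigmoid outputs */
        float truth[3] = {1.0f, 0.0f, 1.0f};
        float delta[3], error[3];
        int i;

        logistic_x_ent_cpu(3, pred, truth, delta, error);
        for(i = 0; i < 3; ++i)
            printf("p=%.1f t=%.0f  loss=%.3f  delta=%.3f\n",
                   pred[i], truth[i], error[i], delta[i]);
        /* e.g. p=0.9, t=1 -> loss=0.105, delta=+0.1: a small push to raise the logit. */
        return 0;
    }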
void supp_gpu(int N, float ALPHA, float * X, int INCX); -void mask_gpu(int N, float * X, float mask_num, float * mask); +void mask_gpu(int N, float * X, float mask_num, float * mask, float val); void scale_mask_gpu(int N, float * X, float mask_num, float * mask, float scale); void const_gpu(int N, float ALPHA, float *X, int INCX); void pow_gpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY); @@ -61,6 +63,7 @@ void mul_gpu(int N, float *X, int INCX, float *Y, int INCY); void mean_gpu(float *x, int batch, int filters, int spatial, float *mean); void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance); void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial); +void l2normalize_gpu(float *x, float *dx, int batch, int filters, int spatial); void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta); @@ -69,16 +72,19 @@ void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *varianc void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance); void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean); -void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out); +void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out); void scale_bias_gpu(float *output, float *biases, int batch, int n, int size); void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates); void scale_bias_gpu(float *output, float *biases, int batch, int n, int size); void add_bias_gpu(float *output, float *biases, int batch, int n, int size); void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size); +void logistic_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error); +void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error); void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error); void l2_gpu(int n, float *pred, float *truth, float *delta, float *error); void l1_gpu(int n, float *pred, float *truth, float *delta, float *error); +void wgan_gpu(int n, float *pred, float *truth, float *delta, float *error); void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc); void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c); void mult_add_into_gpu(int num, float *a, float *b, float *c); @@ -93,6 +99,7 @@ void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rat void flatten_gpu(float *x, int spatial, int layers, int batch, int forward, float *out); void softmax_tree(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier); +void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out); #endif #endif diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index a483f2eb..47e82179 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -164,8 +164,11 @@ __global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float { int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if (index >= N) return; + + float mhat = m[index] / (1.f - powf(B1, t)); + float vhat = v[index] / (1.f - powf(B2, t)); - x[index] = x[index] + (rate * 
sqrtf(1.f-powf(B2, t)) / (1.f-powf(B1, t)) * m[index] / (sqrtf(v[index]) + eps));
+    x[index] = x[index] + rate * mhat / (sqrtf(vhat) + eps);
 }
 
 extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t)
@@ -446,12 +449,6 @@ __global__ void fill_kernel(int N, float ALPHA, float *X, int INCX)
     if(i < N) X[i*INCX] = ALPHA;
 }
 
-__global__ void mask_kernel(int n, float *x, float mask_num, float *mask)
-{
-    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(i < n && mask[i] == mask_num) x[i] = mask_num;
-}
-
 __global__ void copy_kernel(int N, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -472,6 +469,35 @@ extern "C" void normalize_gpu(float *x, float *mean, float *variance, int batch,
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void l2norm_kernel(int N, float *x, float *dx, int batch, int filters, int spatial)
+{
+    int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (index >= N) return;
+    int b = index / spatial;
+    int i = index % spatial;
+    int f;
+    float sum = 0;
+    for(f = 0; f < filters; ++f){
+        int index = b*filters*spatial + f*spatial + i;
+        sum += powf(x[index], 2);
+    }
+    sum = sqrtf(sum);
+    if(sum == 0) sum = 1;
+    //printf("%f\n", sum);
+    for(f = 0; f < filters; ++f){
+        int index = b*filters*spatial + f*spatial + i;
+        x[index] /= sum;
+        dx[index] = (1 - x[index]) / sum;
+    }
+}
+
+extern "C" void l2normalize_gpu(float *x, float *dx, int batch, int filters, int spatial)
+{
+    size_t N = batch*spatial;
+    l2norm_kernel<<<cuda_gridsize(N), BLOCK>>>(N, x, dx, batch, filters, spatial);
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void fast_mean_kernel(float *x, int batch, int filters, int spatial, float *mean)
 {
     const int threads = BLOCK;
@@ -621,6 +647,18 @@ extern "C" void reorg_gpu(float *x, int w, int h, int c, int batch, int stride,
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void mask_kernel(int n, float *x, float mask_num, float *mask, float val)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(i < n && mask[i] == mask_num) x[i] = val;
+}
+
+extern "C" void mask_gpu(int N, float * X, float mask_num, float * mask, float val)
+{
+    mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask, val);
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void scale_mask_kernel(int n, float *x, float mask_num, float *mask, float scale)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -633,12 +671,6 @@ extern "C" void scale_mask_gpu(int N, float * X, float mask_num, float * mask, f
     check_error(cudaPeekAtLastError());
 }
 
-extern "C" void mask_gpu(int N, float * X, float mask_num, float * mask)
-{
-    mask_kernel<<<cuda_gridsize(N), BLOCK>>>(N, X, mask_num, mask);
-    check_error(cudaPeekAtLastError());
-}
-
 extern "C" void const_gpu(int N, float ALPHA, float * X, int INCX)
 {
     const_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX);
     check_error(cudaPeekAtLastError());
 }
@@ -676,7 +708,7 @@ extern "C" void fill_gpu(int N, float ALPHA, float * X, int INCX)
     check_error(cudaPeekAtLastError());
 }
 
-__global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out)
+__global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stride, int sample, int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out)
 {
     int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
     if (id >= size) return;
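The adam_kernel hunk above rewrites the update with explicit bias-corrected moments. The standalone C check below (arbitrary sample numbers, not CUDA, not darknet defaults) shows the old and new expressions agree to printed precision and differ only in where eps enters; the new form is the textbook Adam step x += rate * mhat / (sqrt(vhat) + eps).

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
        float m = 0.02f, v = 0.0005f;  /* running first/second moment estimates */
        float B1 = 0.9f, B2 = 0.999f, rate = 0.001f, eps = 1e-7f;
        int t = 10;                    /* update step */

        /* Old expression: bias correction folded into one scale factor. */
        float old_step = rate * sqrtf(1.f - powf(B2, t)) / (1.f - powf(B1, t))
                         * m / (sqrtf(v) + eps);

        /* New expression: explicit mhat/vhat, eps added to sqrt(vhat). */
        float mhat = m / (1.f - powf(B1, t));
        float vhat = v / (1.f - powf(B2, t));
        float new_step = rate * mhat / (sqrtf(vhat) + eps);

        printf("old: %.6f  new: %.6f\n", old_step, new_step);
        return 0;
    }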
@@ -690,10 +722,11 @@ __global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stri int out_index = i*sample + w2*(j*sample + h2*(k + c2*b)); int add_index = i*stride + w1*(j*stride + h1*(k + c1*b)); - out[out_index] += add[add_index]; + out[out_index] = s1*out[out_index] + s2*add[add_index]; + //out[out_index] += add[add_index]; } -extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out) +extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float s1, float s2, float *out) { int minw = (w1 < w2) ? w1 : w2; int minh = (h1 < h2) ? h1 : h2; @@ -707,7 +740,7 @@ extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int if(sample < 1) sample = 1; int size = batch * minw * minh * minc; - shortcut_kernel<<>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, out); + shortcut_kernel<<>>(size, minw, minh, minc, stride, sample, batch, w1, h1, c1, add, w2, h2, c2, s1, s2, out); check_error(cudaPeekAtLastError()); } @@ -734,6 +767,40 @@ extern "C" void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, fl check_error(cudaPeekAtLastError()); } +__global__ void softmax_x_ent_kernel(int n, float *pred, float *truth, float *delta, float *error) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < n){ + float t = truth[i]; + float p = pred[i]; + error[i] = (t) ? -log(p) : 0; + delta[i] = t-p; + } +} + +extern "C" void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error) +{ + softmax_x_ent_kernel<<>>(n, pred, truth, delta, error); + check_error(cudaPeekAtLastError()); +} + +__global__ void logistic_x_ent_kernel(int n, float *pred, float *truth, float *delta, float *error) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < n){ + float t = truth[i]; + float p = pred[i]; + error[i] = -t*log(p+.0000001) - (1-t)*log(1-p+.0000001); + delta[i] = t-p; + } +} + +extern "C" void logistic_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error) +{ + logistic_x_ent_kernel<<>>(n, pred, truth, delta, error); + check_error(cudaPeekAtLastError()); +} + __global__ void l2_kernel(int n, float *pred, float *truth, float *delta, float *error) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; @@ -766,6 +833,21 @@ extern "C" void l1_gpu(int n, float *pred, float *truth, float *delta, float *er check_error(cudaPeekAtLastError()); } +__global__ void wgan_kernel(int n, float *pred, float *truth, float *delta, float *error) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < n){ + error[i] = truth[i] ? -pred[i] : pred[i]; + delta[i] = (truth[i] > 0) ? 
1 : -1; + } +} + +extern "C" void wgan_gpu(int n, float *pred, float *truth, float *delta, float *error) +{ + wgan_kernel<<>>(n, pred, truth, delta, error); + check_error(cudaPeekAtLastError()); +} + @@ -892,13 +974,13 @@ extern "C" void softmax_tree(float *input, int spatial, int batch, int stride, f int *tree_groups_size = cuda_make_int_array(hier.group_size, hier.groups); int *tree_groups_offset = cuda_make_int_array(hier.group_offset, hier.groups); /* - static int *tree_groups_size = 0; - static int *tree_groups_offset = 0; - if(!tree_groups_size){ - tree_groups_size = cuda_make_int_array(hier.group_size, hier.groups); - tree_groups_offset = cuda_make_int_array(hier.group_offset, hier.groups); - } - */ + static int *tree_groups_size = 0; + static int *tree_groups_offset = 0; + if(!tree_groups_size){ + tree_groups_size = cuda_make_int_array(hier.group_size, hier.groups); + tree_groups_offset = cuda_make_int_array(hier.group_offset, hier.groups); + } + */ int num = spatial*batch*hier.groups; softmax_tree_kernel<<>>(input, spatial, batch, stride, temp, output, hier.groups, tree_groups_size, tree_groups_offset); check_error(cudaPeekAtLastError()); @@ -920,3 +1002,34 @@ extern "C" void softmax_gpu(float *input, int n, int batch, int batch_offset, in softmax_kernel<<>>(input, n, batch, batch_offset, groups, group_offset, stride, temp, output); check_error(cudaPeekAtLastError()); } + + +__global__ void upsample_kernel(size_t N, float *x, int w, int h, int c, int batch, int stride, int forward, float scale, float *out) +{ + size_t i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i >= N) return; + int out_index = i; + int out_w = i%(w*stride); + i = i/(w*stride); + int out_h = i%(h*stride); + i = i/(h*stride); + int out_c = i%c; + i = i/c; + int b = i%batch; + + int in_w = out_w / stride; + int in_h = out_h / stride; + int in_c = out_c; + + int in_index = b*w*h*c + in_c*w*h + in_h*w + in_w; + + + if(forward) out[out_index] += scale * x[in_index]; + else atomicAdd(x+in_index, scale * out[out_index]); +} +extern "C" void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out) +{ + size_t size = w*h*c*batch*stride*stride; + upsample_kernel<<>>(size, in, w, h, c, batch, stride, forward, scale, out); + check_error(cudaPeekAtLastError()); +} diff --git a/src/box.c b/src/box.c index 88ca71ac..f0a3504d 100644 --- a/src/box.c +++ b/src/box.c @@ -3,9 +3,83 @@ #include #include +int nms_comparator(const void *pa, const void *pb) +{ + detection a = *(detection *)pa; + detection b = *(detection *)pb; + float diff = 0; + if(b.sort_class >= 0){ + diff = a.prob[b.sort_class] - b.prob[b.sort_class]; + } else { + diff = a.objectness - b.objectness; + } + if(diff < 0) return 1; + else if(diff > 0) return -1; + return 0; +} + +void do_nms_obj(detection *dets, int total, int classes, float thresh) +{ + int i, j, k; + + for(i = 0; i < total; ++i){ + dets[i].sort_class = -1; + } + + qsort(dets, total, sizeof(detection), nms_comparator); + for(i = 0; i < total; ++i){ + if(dets[i].objectness == 0) continue; + box a = dets[i].bbox; + for(j = i+1; j < total; ++j){ + if(dets[j].objectness == 0) continue; + box b = dets[j].bbox; + if (box_iou(a, b) > thresh){ + dets[j].objectness = 0; + for(k = 0; k < classes; ++k){ + dets[j].prob[k] = 0; + } + } + } + } +} + + +void do_nms_sort(detection *dets, int total, int classes, float thresh) +{ + int i, j, k; + k = total-1; + for(i = 0; i <= k; ++i){ + if(dets[i].objectness == 0){ + detection swap = dets[i]; 
+ dets[i] = dets[k]; + dets[k] = swap; + --k; + --i; + } + } + total = k+1; + + for(k = 0; k < classes; ++k){ + for(i = 0; i < total; ++i){ + dets[i].sort_class = k; + } + qsort(dets, total, sizeof(detection), nms_comparator); + for(i = 0; i < total; ++i){ + if(dets[i].prob[k] == 0) continue; + box a = dets[i].bbox; + for(j = i+1; j < total; ++j){ + box b = dets[j].bbox; + if (box_iou(a, b) > thresh){ + dets[j].prob[k] = 0; + } + } + } + } +} + box float_to_box(float *f, int stride) { - box b; + box b = {0}; b.x = f[0]; b.y = f[1*stride]; b.w = f[2*stride]; @@ -230,79 +304,6 @@ dbox diou(box a, box b) return dd; } -typedef struct{ - int index; - int class; - float **probs; -} sortable_bbox; - -int nms_comparator(const void *pa, const void *pb) -{ - sortable_bbox a = *(sortable_bbox *)pa; - sortable_bbox b = *(sortable_bbox *)pb; - float diff = a.probs[a.index][b.class] - b.probs[b.index][b.class]; - if(diff < 0) return 1; - else if(diff > 0) return -1; - return 0; -} - -void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh) -{ - int i, j, k; - sortable_bbox *s = calloc(total, sizeof(sortable_bbox)); - - for(i = 0; i < total; ++i){ - s[i].index = i; - s[i].class = classes; - s[i].probs = probs; - } - - qsort(s, total, sizeof(sortable_bbox), nms_comparator); - for(i = 0; i < total; ++i){ - if(probs[s[i].index][classes] == 0) continue; - box a = boxes[s[i].index]; - for(j = i+1; j < total; ++j){ - box b = boxes[s[j].index]; - if (box_iou(a, b) > thresh){ - for(k = 0; k < classes+1; ++k){ - probs[s[j].index][k] = 0; - } - } - } - } - free(s); -} - - -void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh) -{ - int i, j, k; - sortable_bbox *s = calloc(total, sizeof(sortable_bbox)); - - for(i = 0; i < total; ++i){ - s[i].index = i; - s[i].class = 0; - s[i].probs = probs; - } - - for(k = 0; k < classes; ++k){ - for(i = 0; i < total; ++i){ - s[i].class = k; - } - qsort(s, total, sizeof(sortable_bbox), nms_comparator); - for(i = 0; i < total; ++i){ - if(probs[s[i].index][k] == 0) continue; - box a = boxes[s[i].index]; - for(j = i+1; j < total; ++j){ - box b = boxes[s[j].index]; - if (box_iou(a, b) > thresh){ - probs[s[j].index][k] = 0; - } - } - } - } - free(s); -} void do_nms(box *boxes, float **probs, int total, int classes, float thresh) { diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 56043e78..8fa2ab2e 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -314,6 +314,9 @@ void update_convolutional_layer_gpu(layer l, update_args a) scal_gpu(l.n, momentum, l.scale_updates_gpu, 1); } } + if(l.clip){ + constrain_gpu(l.nweights, l.clip, l.weights_gpu, 1); + } } diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index f197bcfe..e4fb9bde 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -203,6 +203,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int // float scale = 1./sqrt(size*size*c); float scale = sqrt(2./(size*size*c/l.groups)); + //printf("convscale %f\n", scale); //scale = .02; //for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1); for(i = 0; i < l.nweights; ++i) l.weights[i] = scale*rand_normal(); @@ -321,7 +322,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.workspace_size = get_workspace_size(l); l.activation = activation; - fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, 
l.out_c); + fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BFLOPs\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, (2.0 * l.n * l.size*l.size*l.c/l.groups * l.out_h*l.out_w)/1000000000.); return l; } diff --git a/src/cost_layer.c b/src/cost_layer.c index 7ef1094b..2138ff26 100644 --- a/src/cost_layer.c +++ b/src/cost_layer.c @@ -14,6 +14,7 @@ COST_TYPE get_cost_type(char *s) if (strcmp(s, "masked")==0) return MASKED; if (strcmp(s, "smooth")==0) return SMOOTH; if (strcmp(s, "L1")==0) return L1; + if (strcmp(s, "wgan")==0) return WGAN; fprintf(stderr, "Couldn't find cost type %s, going with SSE\n", s); return SSE; } @@ -31,6 +32,8 @@ char *get_cost_string(COST_TYPE a) return "smooth"; case L1: return "L1"; + case WGAN: + return "wgan"; } return "sse"; } @@ -123,19 +126,18 @@ int float_abs_compare (const void * a, const void * b) void forward_cost_layer_gpu(cost_layer l, network net) { - if (!net.truth_gpu) return; + if (!net.truth) return; if(l.smooth){ scal_gpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1); add_gpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1); } - if (l.cost_type == MASKED) { - mask_gpu(l.batch*l.inputs, net.input_gpu, SECRET_NUM, net.truth_gpu); - } if(l.cost_type == SMOOTH){ smooth_l1_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu); } else if (l.cost_type == L1){ l1_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu); + } else if (l.cost_type == WGAN){ + wgan_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu); } else { l2_gpu(l.batch*l.inputs, net.input_gpu, net.truth_gpu, l.delta_gpu, l.output_gpu); } @@ -144,6 +146,9 @@ void forward_cost_layer_gpu(cost_layer l, network net) scale_mask_gpu(l.batch*l.inputs, l.delta_gpu, 0, net.truth_gpu, l.noobject_scale); scale_mask_gpu(l.batch*l.inputs, l.output_gpu, 0, net.truth_gpu, l.noobject_scale); } + if (l.cost_type == MASKED) { + mask_gpu(l.batch*l.inputs, net.delta_gpu, SECRET_NUM, net.truth_gpu, 0); + } if(l.ratio){ cuda_pull_array(l.delta_gpu, l.delta, l.batch*l.inputs); diff --git a/src/data.c b/src/data.c index 935e6389..51900f26 100644 --- a/src/data.c +++ b/src/data.c @@ -230,7 +230,7 @@ void fill_truth_swag(char *path, float *truth, int classes, int flip, float dx, int id; int i; - for (i = 0; i < count && i < 30; ++i) { + for (i = 0; i < count && i < 90; ++i) { x = boxes[i].x; y = boxes[i].y; w = boxes[i].w; @@ -424,6 +424,7 @@ void fill_truth_detection(char *path, int num_boxes, float *truth, int classes, float x,y,w,h; int id; int i; + int sub = 0; for (i = 0; i < count; ++i) { x = boxes[i].x; @@ -432,13 +433,16 @@ void fill_truth_detection(char *path, int num_boxes, float *truth, int classes, h = boxes[i].h; id = boxes[i].id; - if ((w < .001 || h < .001)) continue; + if ((w < .001 || h < .001)) { + ++sub; + continue; + } - truth[i*5+0] = x; - truth[i*5+1] = y; - truth[i*5+2] = w; - truth[i*5+3] = h; - truth[i*5+4] = id; + truth[(i-sub)*5+0] = x; + truth[(i-sub)*5+1] = y; + truth[(i-sub)*5+2] = w; + truth[(i-sub)*5+3] = h; + truth[(i-sub)*5+4] = id; } free(boxes); } @@ -506,6 +510,7 @@ void fill_truth(char *path, char **labels, int k, float *truth) if(strstr(path, labels[i])){ truth[i] = 1; ++count; + //printf("%s %s %d\n", path, labels[i], i); } } if(count != 1 && (k != 1 || count != 0)) printf("Too many or too few labels: %d, %s\n", count, path); @@ -543,19 +548,31 @@ void fill_hierarchy(float *truth, int k, tree *hierarchy) } } -matrix 
load_regression_labels_paths(char **paths, int n) +matrix load_regression_labels_paths(char **paths, int n, int k) { - matrix y = make_matrix(n, 1); - int i; + matrix y = make_matrix(n, k); + int i,j; for(i = 0; i < n; ++i){ char labelpath[4096]; - find_replace(paths[i], "images", "targets", labelpath); - find_replace(labelpath, "JPEGImages", "targets", labelpath); + find_replace(paths[i], "images", "labels", labelpath); + find_replace(labelpath, "JPEGImages", "labels", labelpath); + find_replace(labelpath, ".BMP", ".txt", labelpath); + find_replace(labelpath, ".JPEG", ".txt", labelpath); + find_replace(labelpath, ".JPG", ".txt", labelpath); + find_replace(labelpath, ".JPeG", ".txt", labelpath); + find_replace(labelpath, ".Jpeg", ".txt", labelpath); + find_replace(labelpath, ".PNG", ".txt", labelpath); + find_replace(labelpath, ".TIF", ".txt", labelpath); + find_replace(labelpath, ".bmp", ".txt", labelpath); + find_replace(labelpath, ".jpeg", ".txt", labelpath); find_replace(labelpath, ".jpg", ".txt", labelpath); find_replace(labelpath, ".png", ".txt", labelpath); + find_replace(labelpath, ".tif", ".txt", labelpath); FILE *file = fopen(labelpath, "r"); - fscanf(file, "%f", &(y.vals[i][0])); + for(j = 0; j < k; ++j){ + fscanf(file, "%f", &(y.vals[i][j])); + } fclose(file); } return y; @@ -578,18 +595,14 @@ matrix load_tags_paths(char **paths, int n, int k) { matrix y = make_matrix(n, k); int i; - int count = 0; + //int count = 0; for(i = 0; i < n; ++i){ char label[4096]; - find_replace(paths[i], "imgs", "labels", label); - find_replace(label, "_iconl.jpeg", ".txt", label); + find_replace(paths[i], "images", "labels", label); + find_replace(label, ".jpg", ".txt", label); FILE *file = fopen(label, "r"); - if(!file){ - find_replace(label, "labels", "labels2", label); - file = fopen(label, "r"); - if(!file) continue; - } - ++count; + if (!file) continue; + //++count; int tag; while(fscanf(file, "%d", &tag) == 1){ if(tag < k){ @@ -598,7 +611,7 @@ matrix load_tags_paths(char **paths, int n, int k) } fclose(file); } - printf("%d/%d\n", count, n); + //printf("%d/%d\n", count, n); return y; } @@ -906,7 +919,7 @@ data load_data_swag(char **paths, int n, int classes, float jitter) d.X.vals = calloc(d.X.rows, sizeof(float*)); d.X.cols = h*w*3; - int k = (4+classes)*30; + int k = (4+classes)*90; d.y = make_matrix(1, k); int dw = w*jitter; @@ -1005,7 +1018,7 @@ void *load_thread(void *ptr) if (a.type == OLD_CLASSIFICATION_DATA){ *a.d = load_data_old(a.paths, a.n, a.m, a.labels, a.classes, a.w, a.h); } else if (a.type == REGRESSION_DATA){ - *a.d = load_data_regression(a.paths, a.n, a.m, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure); + *a.d = load_data_regression(a.paths, a.n, a.m, a.classes, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure); } else if (a.type == CLASSIFICATION_DATA){ *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure, a.center); } else if (a.type == SUPER_DATA){ @@ -1161,13 +1174,13 @@ data load_data_super(char **paths, int n, int m, int w, int h, int scale) return d; } -data load_data_regression(char **paths, int n, int m, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure) +data load_data_regression(char **paths, int n, int m, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure) { if(m) paths = get_random_paths(paths, n, m); data d = 
{0}; d.shallow = 0; d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure, 0); - d.y = load_regression_labels_paths(paths, n); + d.y = load_regression_labels_paths(paths, n, k); if(m) free(paths); return d; } diff --git a/src/data.h b/src/data.h index 931b99b2..781906f8 100644 --- a/src/data.h +++ b/src/data.h @@ -29,7 +29,7 @@ data load_data_tag(char **paths, int n, int m, int k, int min, int max, int size matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center); data load_data_super(char **paths, int n, int m, int w, int h, int scale); data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure, int center); -data load_data_regression(char **paths, int n, int m, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure); +data load_data_regression(char **paths, int n, int m, int classes, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure); data load_go(char *filename); diff --git a/src/deconvolutional_kernels.cu b/src/deconvolutional_kernels.cu index d7e29462..8267dcfa 100644 --- a/src/deconvolutional_kernels.cu +++ b/src/deconvolutional_kernels.cu @@ -45,7 +45,7 @@ extern "C" void backward_deconvolutional_layer_gpu(layer l, network net) { int i; - constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1); + //constrain_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1); gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); if(l.batch_normalize){ @@ -116,18 +116,16 @@ void update_deconvolutional_layer_gpu(layer l, update_args a) float decay = a.decay; int batch = a.batch; - int size = l.size*l.size*l.c*l.n; - if(a.adam){ - adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, size, batch, a.t); + adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t); adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t); if(l.scales_gpu){ adam_update_gpu(l.scales_gpu, l.scale_updates_gpu, l.scale_m_gpu, l.scale_v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.n, batch, a.t); } }else{ - axpy_gpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); - axpy_gpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); - scal_gpu(size, momentum, l.weight_updates_gpu, 1); + axpy_gpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_gpu(l.nweights, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_gpu(l.nweights, momentum, l.weight_updates_gpu, 1); axpy_gpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); scal_gpu(l.n, momentum, l.bias_updates_gpu, 1); diff --git a/src/deconvolutional_layer.c b/src/deconvolutional_layer.c index 674ce6b3..00c0e857 100644 --- a/src/deconvolutional_layer.c +++ b/src/deconvolutional_layer.c @@ -15,6 +15,22 @@ static size_t get_workspace_size(layer l){ return (size_t)l.h*l.w*l.size*l.size*l.n*sizeof(float); } +void bilinear_init(layer l) +{ + int i,j,f; + float center = (l.size-1) / 2.; + for(f = 0; f < l.n; ++f){ + for(j = 0; j < l.size; ++j){ + for(i = 0; i < l.size; ++i){ + float val = (1 - fabs(i 
- center)) * (1 - fabs(j - center)); + int c = f%l.c; + int ind = f*l.size*l.size*l.c + c*l.size*l.size + j*l.size + i; + l.weights[ind] = val; + } + } + } +} + layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int adam) { @@ -38,8 +54,11 @@ layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size l.biases = calloc(n, sizeof(float)); l.bias_updates = calloc(n, sizeof(float)); + //float scale = n/(size*size*c); + //printf("scale: %f\n", scale); float scale = .02; for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_normal(); + //bilinear_init(l); for(i = 0; i < n; ++i){ l.biases[i] = 0; } @@ -51,6 +70,8 @@ layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size l.outputs = l.out_w * l.out_h * l.out_c; l.inputs = l.w * l.h * l.c; + scal_cpu(l.nweights, (float)l.out_w*l.out_h/(l.w*l.h), l.weights, 1); + l.output = calloc(l.batch*l.outputs, sizeof(float)); l.delta = calloc(l.batch*l.outputs, sizeof(float)); @@ -121,7 +142,7 @@ layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size l.mean_delta_gpu = cuda_make_array(0, n); l.variance_delta_gpu = cuda_make_array(0, n); - l.scales_gpu = cuda_make_array(0, n); + l.scales_gpu = cuda_make_array(l.scales, n); l.scale_updates_gpu = cuda_make_array(0, n); l.x_gpu = cuda_make_array(0, l.batch*l.out_h*l.out_w*n); diff --git a/src/demo.c b/src/demo.c index ec73abd5..68294b43 100644 --- a/src/demo.c +++ b/src/demo.c @@ -17,8 +17,6 @@ static char **demo_names; static image **demo_alphabet; static int demo_classes; -static float **probs; -static box *boxes; static network *net; static image buff [3]; static image buff_letter[3]; @@ -31,13 +29,19 @@ static float demo_hier = .5; static int running = 0; static int demo_frame = 3; -static int demo_detections = 0; -static float **predictions; static int demo_index = 0; +static int demo_detections = 0; +//static float **predictions; +static detection **dets; +static detection *avg; +//static float *avg; static int demo_done = 0; -static float *avg; double demo_time; +detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative); +detection *make_network_boxes(network *net); +void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets); + void *detect_in_thread(void *ptr) { running = 1; @@ -45,26 +49,45 @@ void *detect_in_thread(void *ptr) layer l = net->layers[net->n-1]; float *X = buff_letter[(buff_index+2)%3].data; - float *prediction = network_predict(net, X); + network_predict(net, X); - memcpy(predictions[demo_index], prediction, l.outputs*sizeof(float)); - mean_arrays(predictions, demo_frame, l.outputs, avg); - l.output = avg; + /* if(l.type == DETECTION){ get_detection_boxes(l, 1, 1, demo_thresh, probs, boxes, 0); - } else if (l.type == REGION){ - get_region_boxes(l, buff[0].w, buff[0].h, net->w, net->h, demo_thresh, probs, boxes, 0, 0, 0, demo_hier, 1); + } else */ + if (l.type == REGION){ + fill_network_boxes(net, buff[0].w, buff[0].h, demo_thresh, demo_hier, 0, 1, dets[demo_index]); } else { error("Last layer must produce detections\n"); } - if (nms > 0) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms); + + int i,j; + box zero = {0}; + int classes = l.classes; + for(i = 0; i < demo_detections; ++i){ + avg[i].objectness = 0; + avg[i].bbox = zero; + memset(avg[i].prob, 0, classes*sizeof(float)); + for(j = 0; j < 
demo_frame; ++j){ + axpy_cpu(classes, 1./demo_frame, dets[j][i].prob, 1, avg[i].prob, 1); + avg[i].objectness += dets[j][i].objectness * 1./demo_frame; + avg[i].bbox.x += dets[j][i].bbox.x * 1./demo_frame; + avg[i].bbox.y += dets[j][i].bbox.y * 1./demo_frame; + avg[i].bbox.w += dets[j][i].bbox.w * 1./demo_frame; + avg[i].bbox.h += dets[j][i].bbox.h * 1./demo_frame; + } + //copy_cpu(classes, dets[0][i].prob, 1, avg[i].prob, 1); + //avg[i].objectness = dets[0][i].objectness; + } + + if (nms > 0) do_nms_obj(avg, demo_detections, l.classes, nms); printf("\033[2J"); printf("\033[1;1H"); printf("\nFPS:%.1f\n",fps); printf("Objects:\n\n"); image display = buff[(buff_index+2) % 3]; - draw_detections(display, demo_detections, demo_thresh, boxes, probs, 0, demo_names, demo_alphabet, demo_classes); + draw_detections(display, avg, demo_detections, demo_thresh, demo_names, demo_alphabet, demo_classes); demo_index = (demo_index + 1)%demo_frame; running = 0; @@ -117,8 +140,7 @@ void *detect_loop(void *ptr) void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg_frames, float hier, int w, int h, int frames, int fullscreen) { - demo_frame = avg_frames; - predictions = calloc(demo_frame, sizeof(float*)); + //demo_frame = avg_frames; image **alphabet = load_alphabet(); demo_names = names; demo_alphabet = alphabet; @@ -152,16 +174,11 @@ void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const ch if(!cap) error("Couldn't connect to webcam.\n"); - layer l = net->layers[net->n-1]; - demo_detections = l.n*l.w*l.h; - int j; - - avg = (float *) calloc(l.outputs, sizeof(float)); - for(j = 0; j < demo_frame; ++j) predictions[j] = (float *) calloc(l.outputs, sizeof(float)); - - boxes = (box *)calloc(l.w*l.h*l.n, sizeof(box)); - probs = (float **)calloc(l.w*l.h*l.n, sizeof(float *)); - for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = (float *)calloc(l.classes+1, sizeof(float)); + demo_detections = num_boxes(net); + avg = make_network_boxes(net); + dets = calloc(demo_frame, sizeof(detection*)); + int i; + for(i = 0; i < demo_frame; ++i) dets[i] = make_network_boxes(net); buff[0] = get_image_from_stream(cap); buff[1] = copy_image(buff[0]); @@ -203,6 +220,7 @@ void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const ch } } +/* void demo_compare(char *cfg1, char *weight1, char *cfg2, char *weight2, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg_frames, float hier, int w, int h, int frames, int fullscreen) { demo_frame = avg_frames; @@ -290,6 +308,7 @@ void demo_compare(char *cfg1, char *weight1, char *cfg2, char *weight2, float th ++count; } } +*/ #else void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int delay, char *prefix, int avg, float hier, int w, int h, int frames, int fullscreen) { diff --git a/src/detection_layer.c b/src/detection_layer.c index 5c8a1cea..015ee317 100644 --- a/src/detection_layer.c +++ b/src/detection_layer.c @@ -222,7 +222,7 @@ void backward_detection_layer(const detection_layer l, network net) axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1); } -void get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness) +void get_detection_detections(layer l, int w, int h, float thresh, detection *dets) { int i,j,n; float *predictions = l.output; @@ -235,17 +235,17 @@ void 
get_detection_boxes(layer l, int w, int h, float thresh, float **probs, box int p_index = l.side*l.side*l.classes + i*l.n + n; float scale = predictions[p_index]; int box_index = l.side*l.side*(l.classes + l.n) + (i*l.n + n)*4; - boxes[index].x = (predictions[box_index + 0] + col) / l.side * w; - boxes[index].y = (predictions[box_index + 1] + row) / l.side * h; - boxes[index].w = pow(predictions[box_index + 2], (l.sqrt?2:1)) * w; - boxes[index].h = pow(predictions[box_index + 3], (l.sqrt?2:1)) * h; + box b; + b.x = (predictions[box_index + 0] + col) / l.side * w; + b.y = (predictions[box_index + 1] + row) / l.side * h; + b.w = pow(predictions[box_index + 2], (l.sqrt?2:1)) * w; + b.h = pow(predictions[box_index + 3], (l.sqrt?2:1)) * h; + dets[index].bbox = b; + dets[index].objectness = scale; for(j = 0; j < l.classes; ++j){ int class_index = i*l.classes; float prob = scale*predictions[class_index+j]; - probs[index][j] = (prob > thresh) ? prob : 0; - } - if(only_objectness){ - probs[index][0] = scale; + dets[index].prob[j] = (prob > thresh) ? prob : 0; } } } diff --git a/src/image.c b/src/image.c index ac1b6296..b78d4020 100644 --- a/src/image.c +++ b/src/image.c @@ -131,6 +131,7 @@ image tile_images(image a, image b, int dx) image get_label(image **characters, char *string, int size) { + size = size/10; if(size > 7) size = 7; image label = make_empty_image(0,0,0); while(*string){ @@ -235,7 +236,7 @@ image **load_alphabet() return alphabets; } -void draw_detections(image im, int num, float thresh, box *boxes, float **probs, float **masks, char **names, image **alphabet, int classes) +void draw_detections(image im, detection *dets, int num, float thresh, char **names, image **alphabet, int classes) { int i,j; @@ -243,7 +244,7 @@ void draw_detections(image im, int num, float thresh, box *boxes, float **probs, char labelstr[4096] = {0}; int class = -1; for(j = 0; j < classes; ++j){ - if (probs[i][j] > thresh){ + if (dets[i].prob[j] > thresh){ if (class < 0) { strcat(labelstr, names[j]); class = j; @@ -251,7 +252,7 @@ void draw_detections(image im, int num, float thresh, box *boxes, float **probs, strcat(labelstr, ", "); strcat(labelstr, names[j]); } - printf("%s: %.0f%%\n", names[j], probs[i][j]*100); + printf("%s: %.0f%%\n", names[j], dets[i].prob[j]*100); } } if(class >= 0){ @@ -276,7 +277,8 @@ void draw_detections(image im, int num, float thresh, box *boxes, float **probs, rgb[0] = red; rgb[1] = green; rgb[2] = blue; - box b = boxes[i]; + box b = dets[i].bbox; + //printf("%f %f %f %f\n", b.x, b.y, b.w, b.h); int left = (b.x-b.w/2.)*im.w; int right = (b.x+b.w/2.)*im.w; @@ -290,12 +292,12 @@ void draw_detections(image im, int num, float thresh, box *boxes, float **probs, draw_box_width(im, left, top, right, bot, width, red, green, blue); if (alphabet) { - image label = get_label(alphabet, labelstr, (im.h*.03)/10); + image label = get_label(alphabet, labelstr, (im.h*.03)); draw_label(im, top + width, left, label, rgb); free_image(label); } - if (masks){ - image mask = float_to_image(14, 14, 1, masks[i]); + if (dets[i].mask){ + image mask = float_to_image(14, 14, 1, dets[i].mask); image resized_mask = resize_image(mask, b.w*im.w, b.h*im.h); image tmask = threshold_image(resized_mask, .5); embed_image(tmask, im, left, top); @@ -394,6 +396,35 @@ void ghost_image(image source, image dest, int dx, int dy) } } +void blocky_image(image im, int s) +{ + int i,j,k; + for(k = 0; k < im.c; ++k){ + for(j = 0; j < im.h; ++j){ + for(i = 0; i < im.w; ++i){ + im.data[i + im.w*(j + im.h*k)] = im.data[i/s*s + 
im.w*(j/s*s + im.h*k)]; + } + } + } +} + +void censor_image(image im, int dx, int dy, int w, int h) +{ + int i,j,k; + int s = 32; + if(dx < 0) dx = 0; + if(dy < 0) dy = 0; + + for(k = 0; k < im.c; ++k){ + for(j = dy; j < dy + h && j < im.h; ++j){ + for(i = dx; i < dx + w && i < im.w; ++i){ + im.data[i + im.w*(j + im.h*k)] = im.data[i/s*s + im.w*(j/s*s + im.h*k)]; + //im.data[i + j*im.w + k*im.w*im.h] = 0; + } + } + } +} + void embed_image(image source, image dest, int dx, int dy) { int x,y,k; diff --git a/src/image.h b/src/image.h index 4ff0eacb..789cf18d 100644 --- a/src/image.h +++ b/src/image.h @@ -22,12 +22,10 @@ void show_image_cv(image p, const char *name, IplImage *disp); float get_color(int c, int x, int max); void draw_box(image a, int x1, int y1, int x2, int y2, float r, float g, float b); void draw_bbox(image a, box bbox, int w, float r, float g, float b); -void draw_label(image a, int r, int c, image label, const float *rgb); void write_label(image a, int r, int c, image *characters, char *string, float *rgb); image image_distance(image a, image b); void scale_image(image m, float s); image rotate_crop_image(image im, float rad, float s, int w, int h, float dx, float dy, float aspect); -image center_crop_image(image im, int w, int h); image random_crop_image(image im, int w, int h); image random_augment_image(image im, float angle, float aspect, int low, int high, int w, int h); augment_args random_augment_args(image im, float angle, float aspect, int low, int high, int w, int h); diff --git a/src/l2norm_layer.c b/src/l2norm_layer.c new file mode 100644 index 00000000..d099479b --- /dev/null +++ b/src/l2norm_layer.c @@ -0,0 +1,63 @@ +#include "l2norm_layer.h" +#include "activations.h" +#include "blas.h" +#include "cuda.h" + +#include +#include +#include +#include +#include + +layer make_l2norm_layer(int batch, int inputs) +{ + fprintf(stderr, "l2norm %4d\n", inputs); + layer l = {0}; + l.type = L2NORM; + l.batch = batch; + l.inputs = inputs; + l.outputs = inputs; + l.output = calloc(inputs*batch, sizeof(float)); + l.scales = calloc(inputs*batch, sizeof(float)); + l.delta = calloc(inputs*batch, sizeof(float)); + + l.forward = forward_l2norm_layer; + l.backward = backward_l2norm_layer; + #ifdef GPU + l.forward_gpu = forward_l2norm_layer_gpu; + l.backward_gpu = backward_l2norm_layer_gpu; + + l.output_gpu = cuda_make_array(l.output, inputs*batch); + l.scales_gpu = cuda_make_array(l.output, inputs*batch); + l.delta_gpu = cuda_make_array(l.delta, inputs*batch); + #endif + return l; +} + +void forward_l2norm_layer(const layer l, network net) +{ + copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1); + l2normalize_cpu(l.output, l.scales, l.batch, l.out_c, l.out_w*l.out_h); +} + +void backward_l2norm_layer(const layer l, network net) +{ + //axpy_cpu(l.inputs*l.batch, 1, l.scales, 1, l.delta, 1); + axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1); +} + +#ifdef GPU + +void forward_l2norm_layer_gpu(const layer l, network net) +{ + copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); + l2normalize_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_w*l.out_h); +} + +void backward_l2norm_layer_gpu(const layer l, network net) +{ + axpy_gpu(l.batch*l.inputs, 1, l.scales_gpu, 1, l.delta_gpu, 1); + axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); +} + +#endif diff --git a/src/l2norm_layer.h b/src/l2norm_layer.h new file mode 100644 index 00000000..1ca6f710 --- /dev/null +++ b/src/l2norm_layer.h @@ -0,0 +1,15 @@ +#ifndef L2NORM_LAYER_H +#define 
L2NORM_LAYER_H +#include "layer.h" +#include "network.h" + +layer make_l2norm_layer(int batch, int inputs); +void forward_l2norm_layer(const layer l, network net); +void backward_l2norm_layer(const layer l, network net); + +#ifdef GPU +void forward_l2norm_layer_gpu(const layer l, network net); +void backward_l2norm_layer_gpu(const layer l, network net); +#endif + +#endif diff --git a/src/logistic_layer.c b/src/logistic_layer.c new file mode 100644 index 00000000..b2b3d6b1 --- /dev/null +++ b/src/logistic_layer.c @@ -0,0 +1,71 @@ +#include "logistic_layer.h" +#include "activations.h" +#include "blas.h" +#include "cuda.h" + +#include +#include +#include +#include +#include + +layer make_logistic_layer(int batch, int inputs) +{ + fprintf(stderr, "logistic x entropy %4d\n", inputs); + layer l = {0}; + l.type = LOGXENT; + l.batch = batch; + l.inputs = inputs; + l.outputs = inputs; + l.loss = calloc(inputs*batch, sizeof(float)); + l.output = calloc(inputs*batch, sizeof(float)); + l.delta = calloc(inputs*batch, sizeof(float)); + l.cost = calloc(1, sizeof(float)); + + l.forward = forward_logistic_layer; + l.backward = backward_logistic_layer; + #ifdef GPU + l.forward_gpu = forward_logistic_layer_gpu; + l.backward_gpu = backward_logistic_layer_gpu; + + l.output_gpu = cuda_make_array(l.output, inputs*batch); + l.loss_gpu = cuda_make_array(l.loss, inputs*batch); + l.delta_gpu = cuda_make_array(l.delta, inputs*batch); + #endif + return l; +} + +void forward_logistic_layer(const layer l, network net) +{ + copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1); + activate_array(l.output, l.outputs*l.batch, LOGISTIC); + if(net.truth){ + logistic_x_ent_cpu(l.batch*l.inputs, l.output, net.truth, l.delta, l.loss); + l.cost[0] = sum_array(l.loss, l.batch*l.inputs); + } +} + +void backward_logistic_layer(const layer l, network net) +{ + axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1); +} + +#ifdef GPU + +void forward_logistic_layer_gpu(const layer l, network net) +{ + copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); + activate_array_gpu(l.output_gpu, l.outputs*l.batch, LOGISTIC); + if(net.truth){ + logistic_x_ent_gpu(l.batch*l.inputs, l.output_gpu, net.truth_gpu, l.delta_gpu, l.loss_gpu); + cuda_pull_array(l.loss_gpu, l.loss, l.batch*l.inputs); + l.cost[0] = sum_array(l.loss, l.batch*l.inputs); + } +} + +void backward_logistic_layer_gpu(const layer l, network net) +{ + axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); +} + +#endif diff --git a/src/logistic_layer.h b/src/logistic_layer.h new file mode 100644 index 00000000..9c25bee3 --- /dev/null +++ b/src/logistic_layer.h @@ -0,0 +1,15 @@ +#ifndef LOGISTIC_LAYER_H +#define LOGISTIC_LAYER_H +#include "layer.h" +#include "network.h" + +layer make_logistic_layer(int batch, int inputs); +void forward_logistic_layer(const layer l, network net); +void backward_logistic_layer(const layer l, network net); + +#ifdef GPU +void forward_logistic_layer_gpu(const layer l, network net); +void backward_logistic_layer_gpu(const layer l, network net); +#endif + +#endif diff --git a/src/network.c b/src/network.c index 1b4df6bc..f4966211 100644 --- a/src/network.c +++ b/src/network.c @@ -26,6 +26,7 @@ #include "softmax_layer.h" #include "dropout_layer.h" #include "route_layer.h" +#include "upsample_layer.h" #include "shortcut_layer.h" #include "parser.h" #include "data.h" @@ -377,6 +378,10 @@ int resize_network(network *net, int w, int h) resize_region_layer(&l, w, h); }else if(l.type == ROUTE){ resize_route_layer(&l, net); + }else if(l.type 
== SHORTCUT){ + resize_shortcut_layer(&l, w, h); + }else if(l.type == UPSAMPLE){ + resize_upsample_layer(&l, w, h); }else if(l.type == REORG){ resize_reorg_layer(&l, w, h); }else if(l.type == AVGPOOL){ @@ -412,7 +417,9 @@ int resize_network(network *net, int w, int h) cuda_free(net->truth_gpu); net->input_gpu = cuda_make_array(net->input, net->inputs*net->batch); net->truth_gpu = cuda_make_array(net->truth, net->truths*net->batch); - net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1); + if(workspace_size){ + net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1); + } }else { free(net->workspace); net->workspace = calloc(1, workspace_size); @@ -497,34 +504,62 @@ float *network_predict(network *net, float *input) int num_boxes(network *net) { - layer l = net->layers[net->n-1]; - return l.w*l.h*l.n; + int i; + int s = 0; + for(i = 0; i < net->n; ++i){ + layer l = net->layers[i]; + if(l.type == REGION || l.type == DETECTION){ + s += l.w*l.h*l.n; + } + } + return s; } -box *make_boxes(network *net) +detection *make_network_boxes(network *net) { - layer l = net->layers[net->n-1]; - box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); - return boxes; + layer l = net->layers[net->n - 1]; + int i; + int nboxes = num_boxes(net); + detection *dets = calloc(nboxes, sizeof(detection)); + for(i = 0; i < nboxes; ++i){ + dets[i].prob = calloc(l.classes, sizeof(float)); + if(l.coords > 4){ + dets[i].mask = calloc(l.coords-4, sizeof(float)); + } + } + return dets; } - -float **make_probs(network *net) +void fill_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative, detection *dets) { int j; - layer l = net->layers[net->n-1]; - float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); - for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes + 1, sizeof(float *)); - return probs; + for(j = 0; j < net->n; ++j){ + layer l = net->layers[j]; + if(l.type == REGION){ + get_region_detections(l, w, h, net->w, net->h, thresh, map, hier, relative, dets); + dets += l.w*l.h*l.n; + } + if(l.type == DETECTION){ + get_detection_detections(l, w, h, thresh, dets); + dets += l.w*l.h*l.n; + } + } } -void network_detect(network *net, image im, float thresh, float hier_thresh, float nms, box *boxes, float **probs) +detection *get_network_boxes(network *net, int w, int h, float thresh, float hier, int *map, int relative) { - network_predict_image(net, im); - layer l = net->layers[net->n-1]; - if(l.type == REGION){ - get_region_boxes(l, im.w, im.h, net->w, net->h, thresh, probs, boxes, 0, 0, 0, hier_thresh, 0); - if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms); + detection *dets = make_network_boxes(net); + fill_network_boxes(net, w, h, thresh, hier, map, relative, dets); + return dets; +} + +void free_detections(detection *dets, int n) +{ + int i; + for(i = 0; i < n; ++i){ + free(dets[i].prob); + if(dets[i].mask) free(dets[i].mask); } + free(dets); } float *network_predict_image(network *net, image im) diff --git a/src/parser.c b/src/parser.c index da7487b0..e6dd2380 100644 --- a/src/parser.c +++ b/src/parser.c @@ -4,6 +4,8 @@ #include #include "activation_layer.h" +#include "logistic_layer.h" +#include "l2norm_layer.h" #include "activations.h" #include "avgpool_layer.h" #include "batchnorm_layer.h" @@ -27,6 +29,7 @@ #include "reorg_layer.h" #include "rnn_layer.h" #include "route_layer.h" +#include "upsample_layer.h" #include "shortcut_layer.h" #include "softmax_layer.h" #include "lstm_layer.h" @@ -53,6 +56,8 @@ LAYER_TYPE 
string_to_layer_type(char * type) if (strcmp(type, "[deconv]")==0 || strcmp(type, "[deconvolutional]")==0) return DECONVOLUTIONAL; if (strcmp(type, "[activation]")==0) return ACTIVE; + if (strcmp(type, "[logistic]")==0) return LOGXENT; + if (strcmp(type, "[l2norm]")==0) return L2NORM; if (strcmp(type, "[net]")==0 || strcmp(type, "[network]")==0) return NETWORK; if (strcmp(type, "[crnn]")==0) return CRNN; @@ -73,6 +78,7 @@ LAYER_TYPE string_to_layer_type(char * type) if (strcmp(type, "[soft]")==0 || strcmp(type, "[softmax]")==0) return SOFTMAX; if (strcmp(type, "[route]")==0) return ROUTE; + if (strcmp(type, "[upsample]")==0) return UPSAMPLE; return BLANK; } @@ -275,9 +281,27 @@ layer parse_region(list *options, size_params params) { int coords = option_find_int(options, "coords", 4); int classes = option_find_int(options, "classes", 20); - int num = option_find_int(options, "num", 1); + int total = option_find_int(options, "num", 1); + int num = total; - layer l = make_region_layer(params.batch, params.w, params.h, num, classes, coords); + char *a = option_find_str(options, "mask", 0); + int *mask = 0; + if(a){ + int len = strlen(a); + int n = 1; + int i; + for(i = 0; i < len; ++i){ + if (a[i] == ',') ++n; + } + mask = calloc(n, sizeof(int)); + for(i = 0; i < n; ++i){ + int val = atoi(a); + mask[i] = val; + a = strchr(a, ',')+1; + } + num = n; + } + layer l = make_region_layer(params.batch, params.w, params.h, num, total, mask, classes, coords); assert(l.outputs == params.inputs); l.log = option_find_int_quiet(options, "log", 0); @@ -285,11 +309,12 @@ layer parse_region(list *options, size_params params) l.softmax = option_find_int(options, "softmax", 0); l.background = option_find_int_quiet(options, "background", 0); - l.max_boxes = option_find_int_quiet(options, "max",30); + l.max_boxes = option_find_int_quiet(options, "max",90); l.jitter = option_find_float(options, "jitter", .2); l.rescore = option_find_int_quiet(options, "rescore",0); - l.thresh = option_find_float(options, "thresh", .5); + l.ignore_thresh = option_find_float(options, "ignore_thresh", .5); + l.truth_thresh = option_find_float(options, "truth_thresh", 1); l.classfix = option_find_int_quiet(options, "classfix", 0); l.absolute = option_find_int_quiet(options, "absolute", 0); l.random = option_find_int_quiet(options, "random", 0); @@ -297,16 +322,17 @@ layer parse_region(list *options, size_params params) l.coord_scale = option_find_float(options, "coord_scale", 1); l.object_scale = option_find_float(options, "object_scale", 1); l.noobject_scale = option_find_float(options, "noobject_scale", 1); - l.mask_scale = option_find_float(options, "mask_scale", 1); + l.mask_scale = option_find_float_quiet(options, "mask_scale", 1); l.class_scale = option_find_float(options, "class_scale", 1); l.bias_match = option_find_int_quiet(options, "bias_match",0); + l.focus = option_find_float_quiet(options, "focus", 0); char *tree_file = option_find_str(options, "tree", 0); if (tree_file) l.softmax_tree = read_tree(tree_file); char *map_file = option_find_str(options, "map", 0); if (map_file) l.map = read_map(map_file); - char *a = option_find_str(options, "anchors", 0); + a = option_find_str(options, "anchors", 0); if(a){ int len = strlen(a); int n = 1; @@ -334,7 +360,7 @@ detection_layer parse_detection(list *options, size_params params) layer.softmax = option_find_int(options, "softmax", 0); layer.sqrt = option_find_int(options, "sqrt", 0); - layer.max_boxes = option_find_int_quiet(options, "max",30); + layer.max_boxes = 
option_find_int_quiet(options, "max",90); layer.coord_scale = option_find_float(options, "coord_scale", 1); layer.forced = option_find_int(options, "forced", 0); layer.object_scale = option_find_float(options, "object_scale", 1); @@ -470,10 +496,31 @@ layer parse_shortcut(list *options, size_params params, network *net) char *activation_s = option_find_str(options, "activation", "linear"); ACTIVATION activation = get_activation(activation_s); s.activation = activation; + s.alpha = option_find_float_quiet(options, "alpha", 1); + s.beta = option_find_float_quiet(options, "beta", 1); return s; } +layer parse_l2norm(list *options, size_params params) +{ + layer l = make_l2norm_layer(params.batch, params.inputs); + l.h = l.out_h = params.h; + l.w = l.out_w = params.w; + l.c = l.out_c = params.c; + return l; +} + + +layer parse_logistic(list *options, size_params params) +{ + layer l = make_logistic_layer(params.batch, params.inputs); + l.h = l.out_h = params.h; + l.w = l.out_w = params.w; + l.c = l.out_c = params.c; + return l; +} + layer parse_activation(list *options, size_params params) { char *activation_s = option_find_str(options, "activation", "linear"); @@ -481,16 +528,22 @@ layer parse_activation(list *options, size_params params) layer l = make_activation_layer(params.batch, params.inputs, activation); - l.out_h = params.h; - l.out_w = params.w; - l.out_c = params.c; - l.h = params.h; - l.w = params.w; - l.c = params.c; + l.h = l.out_h = params.h; + l.w = l.out_w = params.w; + l.c = l.out_c = params.c; return l; } +layer parse_upsample(list *options, size_params params, network *net) +{ + + int stride = option_find_int(options, "stride",2); + layer l = make_upsample_layer(params.batch, params.w, params.h, params.c, stride); + l.scale = option_find_float_quiet(options, "scale", 1); + return l; +} + route_layer parse_route(list *options, size_params params, network *net) { char *l = option_find(options, "layers"); @@ -575,6 +628,7 @@ void parse_net_options(list *options, network *net) net->max_ratio = option_find_float_quiet(options, "max_ratio", (float) net->max_crop / net->w); net->min_ratio = option_find_float_quiet(options, "min_ratio", (float) net->min_crop / net->w); net->center = option_find_int_quiet(options, "center",0); + net->clip = option_find_float_quiet(options, "clip", 0); net->angle = option_find_float_quiet(options, "angle", 0); net->aspect = option_find_float_quiet(options, "aspect", 1); @@ -673,6 +727,10 @@ network *parse_network_cfg(char *filename) l = parse_local(options, params); }else if(lt == ACTIVE){ l = parse_activation(options, params); + }else if(lt == LOGXENT){ + l = parse_logistic(options, params); + }else if(lt == L2NORM){ + l = parse_l2norm(options, params); }else if(lt == RNN){ l = parse_rnn(options, params); }else if(lt == GRU){ @@ -706,6 +764,8 @@ network *parse_network_cfg(char *filename) l = parse_avgpool(options, params); }else if(lt == ROUTE){ l = parse_route(options, params, net); + }else if(lt == UPSAMPLE){ + l = parse_upsample(options, params, net); }else if(lt == SHORTCUT){ l = parse_shortcut(options, params, net); }else if(lt == DROPOUT){ @@ -719,9 +779,11 @@ network *parse_network_cfg(char *filename) }else{ fprintf(stderr, "Type not recognized: %s\n", s->type); } + l.clip = net->clip; l.truth = option_find_int_quiet(options, "truth", 0); l.onlyforward = option_find_int_quiet(options, "onlyforward", 0); l.stopbackward = option_find_int_quiet(options, "stopbackward", 0); + l.dontsave = option_find_int_quiet(options, "dontsave", 0); l.dontload = 
option_find_int_quiet(options, "dontload", 0); l.dontloadscales = option_find_int_quiet(options, "dontloadscales", 0); l.learning_rate_scale = option_find_float_quiet(options, "learning_rate", 1); @@ -905,6 +967,7 @@ void save_weights_upto(network *net, char *filename, int cutoff) int i; for(i = 0; i < net->n && i < cutoff; ++i){ layer l = net->layers[i]; + if (l.dontsave) continue; if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){ save_convolutional_weights(l, fp); } if(l.type == CONNECTED){ diff --git a/src/region_layer.c b/src/region_layer.c index 449957c3..5ad7bc0a 100644 --- a/src/region_layer.c +++ b/src/region_layer.c @@ -10,12 +10,14 @@ #include #include -layer make_region_layer(int batch, int w, int h, int n, int classes, int coords) +layer make_region_layer(int batch, int w, int h, int n, int total, int *mask, int classes, int coords) { + int i; layer l = {0}; l.type = REGION; l.n = n; + l.total = total; l.batch = batch; l.h = h; l.w = w; @@ -26,15 +28,21 @@ layer make_region_layer(int batch, int w, int h, int n, int classes, int coords) l.classes = classes; l.coords = coords; l.cost = calloc(1, sizeof(float)); - l.biases = calloc(n*2, sizeof(float)); + l.biases = calloc(total*2, sizeof(float)); + if(mask) l.mask = mask; + else{ + l.mask = calloc(n, sizeof(int)); + for(i = 0; i < n; ++i){ + l.mask[i] = i; + } + } l.bias_updates = calloc(n*2, sizeof(float)); l.outputs = h*w*n*(classes + coords + 1); l.inputs = l.outputs; - l.truths = 30*(l.coords + 1); + l.truths = 90*(l.coords + 1); l.delta = calloc(batch*l.outputs, sizeof(float)); l.output = calloc(batch*l.outputs, sizeof(float)); - int i; - for(i = 0; i < n*2; ++i){ + for(i = 0; i < total*2; ++i){ l.biases[i] = .5; } @@ -73,30 +81,37 @@ void resize_region_layer(layer *l, int w, int h) #endif } -box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h, int stride) +box get_region_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride) { box b; - b.x = (i + x[index + 0*stride]) / w; - b.y = (j + x[index + 1*stride]) / h; + b.x = (i + x[index + 0*stride]) / lw; + b.y = (j + x[index + 1*stride]) / lh; b.w = exp(x[index + 2*stride]) * biases[2*n] / w; b.h = exp(x[index + 3*stride]) * biases[2*n+1] / h; return b; } -float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale, int stride) +float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride) { - box pred = get_region_box(x, biases, n, index, i, j, w, h, stride); + box pred = get_region_box(x, biases, n, index, i, j, lw, lh, w, h, stride); float iou = box_iou(pred, truth); - float tx = (truth.x*w - i); - float ty = (truth.y*h - j); + float tx = (truth.x*lw - i); + float ty = (truth.y*lh - j); float tw = log(truth.w*w / biases[2*n]); float th = log(truth.h*h / biases[2*n + 1]); + //printf("%f %f %f %f\n", tx, ty, tw, th); + delta[index + 0*stride] = scale * (tx - x[index + 0*stride]); delta[index + 1*stride] = scale * (ty - x[index + 1*stride]); delta[index + 2*stride] = scale * (tw - x[index + 2*stride]); delta[index + 3*stride] = scale * (th - x[index + 3*stride]); + //printf("x: %f %f\n",tx , x[index + 0*stride]); + //printf("y: %f %f\n",ty , x[index + 1*stride]); + //printf("w: %f %f\n",tw , x[index + 2*stride]); + //printf("h: %f %f\n\n",th , x[index + 3*stride]); + //printf("%f %f %f %f\n", x[index + 0*stride], x[index + 
1*stride], x[index + 2*stride], x[index + 3*stride]); return iou; } @@ -109,7 +124,7 @@ void delta_region_mask(float *truth, float *x, int n, int index, float *delta, i } -void delta_region_class(float *output, float *delta, int index, int class, int classes, tree *hier, float scale, int stride, float *avg_cat, int tag) +void delta_region_class(float *output, float *delta, int index, int class, int classes, tree *hier, float scale, int stride, float *avg_cat, int tag, float focus) { int i, n; if(hier){ @@ -125,15 +140,30 @@ void delta_region_class(float *output, float *delta, int index, int class, int c class = hier->parent[class]; } - *avg_cat += pred; + if(avg_cat) *avg_cat += pred; } else { if (delta[index] && tag){ - delta[index + stride*class] = scale * (1 - output[index + stride*class]); + if(focus){ + float y = -1; + float p = output[index + stride*class]; + float lg = p > .0000000001 ? log(p) : -10; + delta[index + stride*class] = y * pow(1-p, focus) * (focus*p*lg + p - 1); + }else{ + delta[index + stride*class] = scale * (1 - output[index + stride*class]); + if(avg_cat) *avg_cat += output[index + stride*class]; + } return; } for(n = 0; n < classes; ++n){ - delta[index + stride*n] = scale * (((n == class)?1 : 0) - output[index + stride*n]); - if(n == class) *avg_cat += output[index + stride*n]; + if(focus){ + float y = (n == class) ? -1 : 1; + float p = (n == class) ? output[index + stride*n] : 1 - output[index + stride*n]; + float lg = p > .0000000001 ? log(p) : -10; + delta[index + stride*n] = y * pow(1-p, focus) * (focus*p*lg + p - 1); + }else{ + delta[index + stride*n] = scale * (((n == class)?1 : 0) - output[index + stride*n]); + } + if(n == class && avg_cat) *avg_cat += output[index + stride*n]; } } } @@ -189,6 +219,7 @@ void forward_region_layer(const layer l, network net) if(!net.train) return; float avg_iou = 0; float recall = 0; + float recall75 = 0; float avg_cat = 0; float avg_obj = 0; float avg_anyobj = 0; @@ -198,7 +229,7 @@ void forward_region_layer(const layer l, network net) for (b = 0; b < l.batch; ++b) { if(l.softmax_tree){ int onlyclass = 0; - for(t = 0; t < 30; ++t){ + for(t = 0; t < l.max_boxes; ++t){ box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1); if(!truth.x) break; int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords]; @@ -218,7 +249,7 @@ void forward_region_layer(const layer l, network net) } int class_index = entry_index(l, b, maxi, l.coords + 1); int obj_index = entry_index(l, b, maxi, l.coords); - delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat, !l.softmax); + delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat, !l.softmax, l.focus); if(l.output[obj_index] < .3) l.delta[obj_index] = l.object_scale * (.3 - l.output[obj_index]); else l.delta[obj_index] = 0; l.delta[obj_index] = 0; @@ -233,36 +264,50 @@ void forward_region_layer(const layer l, network net) for (i = 0; i < l.w; ++i) { for (n = 0; n < l.n; ++n) { int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); - box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h); + box pred = get_region_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h); float best_iou = 0; - for(t = 0; t < 30; ++t){ + int best_t = 0; + for(t = 0; t < l.max_boxes; ++t){ box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1); if(!truth.x) break; float iou = box_iou(pred, truth); 
if (iou > best_iou) { best_iou = iou; + best_t = t; } } int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, l.coords); avg_anyobj += l.output[obj_index]; l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]); if(l.background) l.delta[obj_index] = l.noobject_scale * (1 - l.output[obj_index]); - if (best_iou > l.thresh) { + if (best_iou > l.ignore_thresh) { l.delta[obj_index] = 0; } + if (best_iou > l.truth_thresh) { + l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]); - if(*(net.seen) < 12800){ - box truth = {0}; - truth.x = (i + .5)/l.w; - truth.y = (j + .5)/l.h; - truth.w = l.biases[2*n]/l.w; - truth.h = l.biases[2*n+1]/l.h; - delta_region_box(truth, l.output, l.biases, n, box_index, i, j, l.w, l.h, l.delta, .01, l.w*l.h); + int class = net.truth[best_t*(l.coords + 1) + b*l.truths + l.coords]; + if (l.map) class = l.map[class]; + int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, l.coords + 1); + delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, 0, !l.softmax, l.focus); + box truth = float_to_box(net.truth + best_t*(l.coords + 1) + b*l.truths, 1); + delta_region_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, l.coord_scale*(2-truth.w*truth.h), l.w*l.h); } + + /* + if(*(net.seen) < 12800){ + box truth = {0}; + truth.x = (i + .5)/l.w; + truth.y = (j + .5)/l.h; + truth.w = l.biases[2*l.mask[n]]/net.w; + truth.h = l.biases[2*l.mask[n]+1]/net.h; + delta_region_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, .01, l.w*l.h); + } + */ } } } - for(t = 0; t < 30; ++t){ + for(t = 0; t < l.max_boxes; ++t){ box truth = float_to_box(net.truth + t*(l.coords + 1) + b*l.truths, 1); if(!truth.x) break; @@ -275,16 +320,11 @@ void forward_region_layer(const layer l, network net) truth_shift.x = 0; truth_shift.y = 0; //printf("index %d %d\n",i, j); - for(n = 0; n < l.n; ++n){ - int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); - box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h); - if(l.bias_match){ - pred.w = l.biases[2*n]/l.w; - pred.h = l.biases[2*n+1]/l.h; - } + for(n = 0; n < l.total; ++n){ + box pred = {0}; + pred.w = l.biases[2*n]/net.w; + pred.h = l.biases[2*n+1]/net.h; //printf("pred: (%f, %f) %f x %f\n", pred.x, pred.y, pred.w, pred.h); - pred.x = 0; - pred.y = 0; float iou = box_iou(pred, truth_shift); if (iou > best_iou){ best_iou = iou; @@ -293,37 +333,42 @@ void forward_region_layer(const layer l, network net) } //printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h); - int box_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 0); - float iou = delta_region_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, l.delta, l.coord_scale * (2 - truth.w*truth.h), l.w*l.h); - if(l.coords > 4){ - int mask_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 4); - delta_region_mask(net.truth + t*(l.coords + 1) + b*l.truths + 5, l.output, l.coords - 4, mask_index, l.delta, l.w*l.h, l.mask_scale); - } - if(iou > .5) recall += 1; - avg_iou += iou; + int mask_n = int_index(l.mask, best_n, l.n); + //printf("%d %d\n", best_n, mask_n); + if(mask_n >= 0){ + int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); + float iou = delta_region_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, l.coord_scale*(2-truth.w*truth.h), l.w*l.h); + if(l.coords > 4){ + int mask_index = entry_index(l, b, 
mask_n*l.w*l.h + j*l.w + i, 4); + delta_region_mask(net.truth + t*(l.coords + 1) + b*l.truths + 5, l.output, l.coords - 4, mask_index, l.delta, l.w*l.h, l.mask_scale); + } + if(iou > .5) recall += 1; + if(iou > .75) recall75 += 1; + avg_iou += iou; - //l.delta[best_index + 4] = iou - l.output[best_index + 4]; - int obj_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords); - avg_obj += l.output[obj_index]; - l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]); - if (l.rescore) { - l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]); - } - if(l.background){ - l.delta[obj_index] = l.object_scale * (0 - l.output[obj_index]); - } + //l.delta[best_index + 4] = iou - l.output[best_index + 4]; + int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, l.coords); + avg_obj += l.output[obj_index]; + l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]); + if (l.rescore) { + l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]); + } + if(l.background){ + l.delta[obj_index] = l.object_scale * (0 - l.output[obj_index]); + } - int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords]; - if (l.map) class = l.map[class]; - int class_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, l.coords + 1); - delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat, !l.softmax); - ++count; - ++class_count; + int class = net.truth[t*(l.coords + 1) + b*l.truths + l.coords]; + if (l.map) class = l.map[class]; + int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, l.coords + 1); + delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat, !l.softmax, l.focus); + ++count; + ++class_count; + } } } //printf("\n"); *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); - printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f, count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count); + printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count); } void backward_region_layer(const layer l, network net) @@ -339,7 +384,7 @@ void backward_region_layer(const layer l, network net) */ } -void correct_region_boxes(box *boxes, int n, int w, int h, int netw, int neth, int relative) +void correct_region_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative) { int i; int new_w=0; @@ -352,7 +397,7 @@ void correct_region_boxes(box *boxes, int n, int w, int h, int netw, int neth, i new_w = (w * neth)/h; } for (i = 0; i < n; ++i){ - box b = boxes[i]; + box b = dets[i].bbox; b.x = (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw); b.y = (b.y - (neth - new_h)/2./neth) / ((float)new_h/neth); b.w *= (float)netw/new_w; @@ -363,11 +408,11 @@ void correct_region_boxes(box *boxes, int n, int w, int h, int netw, int neth, i b.y *= h; b.h *= h; } - boxes[i] = b; + dets[i].bbox = b; } } -void get_region_boxes(layer l, int w, int h, int netw, int neth, float thresh, float **probs, box *boxes, float **masks, int only_objectness, int *map, float tree_thresh, int relative) +void get_region_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, float tree_thresh, int relative, detection *dets) { int i,j,n,z; float *predictions = l.output; @@ -399,17 
+444,19 @@ void get_region_boxes(layer l, int w, int h, int netw, int neth, float thresh, f int col = i % l.w; for(n = 0; n < l.n; ++n){ int index = n*l.w*l.h + i; - for(j = 0; j < l.classes; ++j){ - probs[index][j] = 0; + for (j = 0; j < l.classes; ++j) { + dets[index].prob[j] = 0; } int obj_index = entry_index(l, 0, n*l.w*l.h + i, l.coords); int box_index = entry_index(l, 0, n*l.w*l.h + i, 0); int mask_index = entry_index(l, 0, n*l.w*l.h + i, 4); float scale = l.background ? 1 : predictions[obj_index]; - boxes[index] = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h, l.w*l.h); - if(masks){ + dets[index].bbox = get_region_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h); + dets[index].objectness = scale > thresh ? scale : 0; + dets[index].classes = l.classes; + if(dets[index].mask){ for(j = 0; j < l.coords - 4; ++j){ - masks[index][j] = l.output[mask_index + j*l.w*l.h]; + dets[index].mask[j] = l.output[mask_index + j*l.w*l.h]; } } @@ -421,39 +468,24 @@ void get_region_boxes(layer l, int w, int h, int netw, int neth, float thresh, f for(j = 0; j < 200; ++j){ int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + map[j]); float prob = scale*predictions[class_index]; - probs[index][j] = (prob > thresh) ? prob : 0; + dets[index].prob[j] = (prob > thresh) ? prob : 0; } } else { int j = hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh, l.w*l.h); - probs[index][j] = (scale > thresh) ? scale : 0; - probs[index][l.classes] = scale; + dets[index].prob[j] = (scale > thresh) ? scale : 0; } } else { - float max = 0; - for(j = 0; j < l.classes; ++j){ - int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + j); - float prob = scale*predictions[class_index]; - probs[index][j] = (prob > thresh) ? prob : 0; - if(prob > max) max = prob; - // TODO REMOVE - // if (j == 56 ) probs[index][j] = 0; - /* - if (j != 0) probs[index][j] = 0; - int blacklist[] = {121, 497, 482, 504, 122, 518,481, 418, 542, 491, 914, 478, 120, 510,500}; - int bb; - for (bb = 0; bb < sizeof(blacklist)/sizeof(int); ++bb){ - if(index == blacklist[bb]) probs[index][j] = 0; - } - */ + if(dets[index].objectness){ + for(j = 0; j < l.classes; ++j){ + int class_index = entry_index(l, 0, n*l.w*l.h + i, l.coords + 1 + j); + float prob = scale*predictions[class_index]; + dets[index].prob[j] = (prob > thresh) ? 
prob : 0; + } } - probs[index][l.classes] = max; - } - if(only_objectness){ - probs[index][0] = scale; } } } - correct_region_boxes(boxes, l.w*l.h*l.n, w, h, netw, neth, relative); + correct_region_boxes(dets, l.w*l.h*l.n, w, h, netw, neth, relative); } #ifdef GPU @@ -479,17 +511,17 @@ void forward_region_layer_gpu(const layer l, network net) if (l.softmax_tree){ int index = entry_index(l, 0, 0, l.coords + 1); softmax_tree(net.input_gpu + index, l.w*l.h, l.batch*l.n, l.inputs/l.n, 1, l.output_gpu + index, *l.softmax_tree); - /* - int mmin = 9000; - int mmax = 0; - int i; - for(i = 0; i < l.softmax_tree->groups; ++i){ - int group_size = l.softmax_tree->group_size[i]; - if (group_size < mmin) mmin = group_size; - if (group_size > mmax) mmax = group_size; - } + /* + int mmin = 9000; + int mmax = 0; + int i; + for(i = 0; i < l.softmax_tree->groups; ++i){ + int group_size = l.softmax_tree->group_size[i]; + if (group_size < mmin) mmin = group_size; + if (group_size > mmax) mmax = group_size; + } //printf("%d %d %d \n", l.softmax_tree->groups, mmin, mmax); - */ + */ /* // TIMING CODE int zz; @@ -573,13 +605,13 @@ void backward_region_layer_gpu(const layer l, network net) for (b = 0; b < l.batch; ++b){ for(n = 0; n < l.n; ++n){ int index = entry_index(l, b, n*l.w*l.h, 0); - gradient_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index); + //gradient_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index); if(l.coords > 4){ index = entry_index(l, b, n*l.w*l.h, 4); gradient_array_gpu(l.output_gpu + index, (l.coords - 4)*l.w*l.h, LOGISTIC, l.delta_gpu + index); } index = entry_index(l, b, n*l.w*l.h, l.coords); - if(!l.background) gradient_array_gpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index); + //if(!l.background) gradient_array_gpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index); } } axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); diff --git a/src/region_layer.h b/src/region_layer.h index f67f9016..5aafe249 100644 --- a/src/region_layer.h +++ b/src/region_layer.h @@ -5,7 +5,7 @@ #include "layer.h" #include "network.h" -layer make_region_layer(int batch, int h, int w, int n, int classes, int coords); +layer make_region_layer(int batch, int h, int w, int n, int total, int *mask, int classes, int coords); void forward_region_layer(const layer l, network net); void backward_region_layer(const layer l, network net); void resize_region_layer(layer *l, int w, int h); diff --git a/src/shortcut_layer.c b/src/shortcut_layer.c index 0818ca7e..49d17f56 100644 --- a/src/shortcut_layer.c +++ b/src/shortcut_layer.c @@ -8,7 +8,7 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2) { - fprintf(stderr,"Shortcut Layer: %d\n", index); + fprintf(stderr, "res %3d %4d x%4d x%4d -> %4d x%4d x%4d\n",index, w2,h2,c2, w,h,c); layer l = {0}; l.type = SHORTCUT; l.batch = batch; @@ -38,32 +38,53 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int return l; } +void resize_shortcut_layer(layer *l, int w, int h) +{ + assert(l->w == l->out_w); + assert(l->h == l->out_h); + l->w = l->out_w = w; + l->h = l->out_h = h; + l->outputs = w*h*l->out_c; + l->inputs = l->outputs; + l->delta = realloc(l->delta, l->outputs*l->batch*sizeof(float)); + l->output = realloc(l->output, l->outputs*l->batch*sizeof(float)); + +#ifdef GPU + cuda_free(l->output_gpu); + cuda_free(l->delta_gpu); + l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch); + l->delta_gpu = 
cuda_make_array(l->delta, l->outputs*l->batch); +#endif + +} + + void forward_shortcut_layer(const layer l, network net) { copy_cpu(l.outputs*l.batch, net.input, 1, l.output, 1); - shortcut_cpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.output); + shortcut_cpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output, l.out_w, l.out_h, l.out_c, l.alpha, l.beta, l.output); activate_array(l.output, l.outputs*l.batch, l.activation); } void backward_shortcut_layer(const layer l, network net) { gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); - axpy_cpu(l.outputs*l.batch, 1, l.delta, 1, net.delta, 1); - shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, net.layers[l.index].delta); + axpy_cpu(l.outputs*l.batch, l.alpha, l.delta, 1, net.delta, 1); + shortcut_cpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta, l.w, l.h, l.c, 1, l.beta, net.layers[l.index].delta); } #ifdef GPU void forward_shortcut_layer_gpu(const layer l, network net) { copy_gpu(l.outputs*l.batch, net.input_gpu, 1, l.output_gpu, 1); - shortcut_gpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.output_gpu); + shortcut_gpu(l.batch, l.w, l.h, l.c, net.layers[l.index].output_gpu, l.out_w, l.out_h, l.out_c, l.alpha, l.beta, l.output_gpu); activate_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation); } void backward_shortcut_layer_gpu(const layer l, network net) { gradient_array_gpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); - axpy_gpu(l.outputs*l.batch, 1, l.delta_gpu, 1, net.delta_gpu, 1); - shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, net.layers[l.index].delta_gpu); + axpy_gpu(l.outputs*l.batch, l.alpha, l.delta_gpu, 1, net.delta_gpu, 1); + shortcut_gpu(l.batch, l.out_w, l.out_h, l.out_c, l.delta_gpu, l.w, l.h, l.c, 1, l.beta, net.layers[l.index].delta_gpu); } #endif diff --git a/src/shortcut_layer.h b/src/shortcut_layer.h index 32e4ebdc..5f684fc1 100644 --- a/src/shortcut_layer.h +++ b/src/shortcut_layer.h @@ -7,6 +7,7 @@ layer make_shortcut_layer(int batch, int index, int w, int h, int c, int w2, int h2, int c2); void forward_shortcut_layer(const layer l, network net); void backward_shortcut_layer(const layer l, network net); +void resize_shortcut_layer(layer *l, int w, int h); #ifdef GPU void forward_shortcut_layer_gpu(const layer l, network net); diff --git a/src/softmax_layer.c b/src/softmax_layer.c index 372b037c..afcc6342 100644 --- a/src/softmax_layer.c +++ b/src/softmax_layer.c @@ -18,8 +18,10 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups) l.groups = groups; l.inputs = inputs; l.outputs = inputs; + l.loss = calloc(inputs*batch, sizeof(float)); l.output = calloc(inputs*batch, sizeof(float)); l.delta = calloc(inputs*batch, sizeof(float)); + l.cost = calloc(1, sizeof(float)); l.forward = forward_softmax_layer; l.backward = backward_softmax_layer; @@ -28,6 +30,7 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups) l.backward_gpu = backward_softmax_layer_gpu; l.output_gpu = cuda_make_array(l.output, inputs*batch); + l.loss_gpu = cuda_make_array(l.loss, inputs*batch); l.delta_gpu = cuda_make_array(l.delta, inputs*batch); #endif return l; @@ -46,6 +49,11 @@ void forward_softmax_layer(const softmax_layer l, network net) } else { softmax_cpu(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output); } + + if(net.truth){ + softmax_x_ent_cpu(l.batch*l.inputs, l.output, net.truth, l.delta, 
l.loss); + l.cost[0] = sum_array(l.loss, l.batch*l.inputs); + } } void backward_softmax_layer(const softmax_layer l, network net) @@ -63,6 +71,8 @@ void pull_softmax_layer_output(const softmax_layer layer) void forward_softmax_layer_gpu(const softmax_layer l, network net) { if(l.softmax_tree){ + softmax_tree(net.input_gpu, 1, l.batch, l.inputs, l.temperature, l.output_gpu, *l.softmax_tree); + /* int i; int count = 0; for (i = 0; i < l.softmax_tree->groups; ++i) { @@ -70,6 +80,7 @@ void forward_softmax_layer_gpu(const softmax_layer l, network net) softmax_gpu(net.input_gpu + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count); count += group_size; } + */ } else { if(l.spatial){ softmax_gpu(net.input_gpu, l.c, l.batch*l.c, l.inputs/l.c, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu); @@ -77,6 +88,15 @@ void forward_softmax_layer_gpu(const softmax_layer l, network net) softmax_gpu(net.input_gpu, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu); } } + if(net.truth){ + softmax_x_ent_gpu(l.batch*l.inputs, l.output_gpu, net.truth_gpu, l.delta_gpu, l.loss_gpu); + if(l.softmax_tree){ + mask_gpu(l.batch*l.inputs, l.delta_gpu, SECRET_NUM, net.truth_gpu, 0); + mask_gpu(l.batch*l.inputs, l.loss_gpu, SECRET_NUM, net.truth_gpu, 0); + } + cuda_pull_array(l.loss_gpu, l.loss, l.batch*l.inputs); + l.cost[0] = sum_array(l.loss, l.batch*l.inputs); + } } void backward_softmax_layer_gpu(const softmax_layer layer, network net) diff --git a/src/tree.h b/src/tree.h index 100858a3..3802b8ea 100644 --- a/src/tree.h +++ b/src/tree.h @@ -2,7 +2,6 @@ #define TREE_H #include "darknet.h" -tree *read_tree(char *filename); int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride); float get_hierarchy_probability(float *x, tree *hier, int c, int stride); diff --git a/src/upsample_layer.c b/src/upsample_layer.c new file mode 100644 index 00000000..605f21f8 --- /dev/null +++ b/src/upsample_layer.c @@ -0,0 +1,106 @@ +#include "upsample_layer.h" +#include "cuda.h" +#include "blas.h" + +#include + +layer make_upsample_layer(int batch, int w, int h, int c, int stride) +{ + layer l = {0}; + l.type = UPSAMPLE; + l.batch = batch; + l.w = w; + l.h = h; + l.c = c; + l.out_w = w*stride; + l.out_h = h*stride; + l.out_c = c; + if(stride < 0){ + stride = -stride; + l.reverse=1; + l.out_w = w/stride; + l.out_h = h/stride; + } + l.stride = stride; + l.outputs = l.out_w*l.out_h*l.out_c; + l.inputs = l.w*l.h*l.c; + l.delta = calloc(l.outputs*batch, sizeof(float)); + l.output = calloc(l.outputs*batch, sizeof(float));; + + l.forward = forward_upsample_layer; + l.backward = backward_upsample_layer; + #ifdef GPU + l.forward_gpu = forward_upsample_layer_gpu; + l.backward_gpu = backward_upsample_layer_gpu; + + l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch); + l.output_gpu = cuda_make_array(l.output, l.outputs*batch); + #endif + if(l.reverse) fprintf(stderr, "downsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); + else fprintf(stderr, "upsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); + return l; +} + +void resize_upsample_layer(layer *l, int w, int h) +{ + l->w = w; + l->h = h; + l->out_w = w*l->stride; + l->out_h = h*l->stride; + if(l->reverse){ + l->out_w = w/l->stride; + l->out_h = h/l->stride; + } + l->outputs = l->out_w*l->out_h*l->out_c; + l->inputs = l->h*l->w*l->c; + l->delta = realloc(l->delta, l->outputs*l->batch*sizeof(float)); + l->output = 
realloc(l->output, l->outputs*l->batch*sizeof(float)); + +#ifdef GPU + cuda_free(l->output_gpu); + cuda_free(l->delta_gpu); + l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch); + l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch); +#endif + +} + +void forward_upsample_layer(const layer l, network net) +{ + fill_cpu(l.outputs*l.batch, 0, l.output, 1); + if(l.reverse){ + upsample_cpu(l.output, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input); + }else{ + upsample_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output); + } +} + +void backward_upsample_layer(const layer l, network net) +{ + if(l.reverse){ + upsample_cpu(l.delta, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, net.delta); + }else{ + upsample_cpu(net.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta); + } +} + +#ifdef GPU +void forward_upsample_layer_gpu(const layer l, network net) +{ + fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1); + if(l.reverse){ + upsample_gpu(l.output_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input_gpu); + }else{ + upsample_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output_gpu); + } +} + +void backward_upsample_layer_gpu(const layer l, network net) +{ + if(l.reverse){ + upsample_gpu(l.delta_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, net.delta_gpu); + }else{ + upsample_gpu(net.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta_gpu); + } +} +#endif diff --git a/src/upsample_layer.h b/src/upsample_layer.h new file mode 100644 index 00000000..86790d10 --- /dev/null +++ b/src/upsample_layer.h @@ -0,0 +1,15 @@ +#ifndef UPSAMPLE_LAYER_H +#define UPSAMPLE_LAYER_H +#include "darknet.h" + +layer make_upsample_layer(int batch, int w, int h, int c, int stride); +void forward_upsample_layer(const layer l, network net); +void backward_upsample_layer(const layer l, network net); +void resize_upsample_layer(layer *l, int w, int h); + +#ifdef GPU +void forward_upsample_layer_gpu(const layer l, network net); +void backward_upsample_layer_gpu(const layer l, network net); +#endif + +#endif diff --git a/src/utils.c b/src/utils.c index 9f1af1df..4e4efc2c 100644 --- a/src/utils.c +++ b/src/utils.c @@ -627,6 +627,15 @@ int max_index(float *a, int n) return max_i; } +int int_index(int *a, int val, int n) +{ + int i; + for(i = 0; i < n; ++i){ + if(a[i] == val) return i; + } + return -1; +} + int rand_int(int min, int max) { if (max < min){ diff --git a/src/utils.h b/src/utils.h index b0db7abf..ef24da79 100644 --- a/src/utils.h +++ b/src/utils.h @@ -37,11 +37,9 @@ list *parse_csv_line(char *line); char *copy_string(char *s); int count_fields(char *line); float *parse_fields(char *line, int n); -void scale_array(float *a, int n, float s); void translate_array(float *a, int n, float s); float constrain(float min, float max, float a); int constrain_int(int a, int min, int max); -float rand_uniform(float min, float max); float rand_scale(float s); int rand_int(int min, int max); void mean_arrays(float **a, int n, int els, float *avg); @@ -49,6 +47,7 @@ float dist_array(float *a, float *b, int n, int sub); float **one_hot_encode(float *a, int n, int k); float sec(clock_t clocks); void print_statistics(float *a, int n); +int int_index(int *a, int val, int n); #endif
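
The demo.c, network.c and image.c hunks above replace the old box*/float** probability buffers with heap-allocated detection arrays (num_boxes, make_network_boxes, fill_network_boxes, get_network_boxes, free_detections, and the reworked draw_detections/do_nms_obj calls). A minimal caller sketch, assuming these functions are exported through darknet.h (demo.c still re-declares a few of them locally), with the do_nms_obj prototype inferred from its call site and placeholder thresholds:

#include "darknet.h"

/* Hedged sketch of the detection API introduced by this patch; the
 * 0.24 / 0.5 / 0.4 thresholds are illustrative, not canonical. */
void detect_one_image(network *net, image im, char **names, image **alphabet)
{
    network_predict_image(net, im);              /* resizes to net->w x net->h internally and runs a forward pass */
    int nboxes = num_boxes(net);                 /* sums w*h*n over all REGION/DETECTION layers */
    detection *dets = get_network_boxes(net, im.w, im.h, .24, .5, 0, 1);

    layer l = net->layers[net->n - 1];
    do_nms_obj(dets, nboxes, l.classes, .4);     /* NMS now operates on detection structs */
    draw_detections(im, dets, nboxes, .24, names, alphabet, l.classes);
    free_detections(dets, nboxes);               /* also frees per-detection prob/mask arrays */
}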
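
get_region_box now receives both the layer grid size (lw, lh) and the network input size (w, h): the x/y offsets are still normalized by the grid, but the anchor priors stored in l.biases are interpreted in input pixels. That is why the anchors in tiny-yolo.cfg and yolo.cfg change from grid-cell units (roughly 0.5-10) to pixel units (roughly 18-313). For cell (i, j), raw prediction (t_x, t_y, t_w, t_h) and prior (p_w, p_h), the decoded box is

\[
b_x = \frac{i + t_x}{l_w}, \qquad
b_y = \frac{j + t_y}{l_h}, \qquad
b_w = \frac{p_w\,e^{t_w}}{w_{\text{net}}}, \qquad
b_h = \frac{p_h\,e^{t_h}}{h_{\text{net}}},
\]

and delta_region_box simply inverts these equations to form the regression targets.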
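
delta_region_class gains an optional focal-loss mode, enabled by the new focus= option read in parse_region. Writing p for the predicted probability of the ground-truth class and gamma for focus, the branch computes y * pow(1-p, focus) * (focus*p*log(p) + p - 1) with y = -1, which is the negative derivative of the focal loss with respect to the pre-sigmoid logit z (darknet's delta buffers hold negative gradients, e.g. 1 - p in the non-focal branch):

\[
FL(p) = -(1-p)^{\gamma}\log p, \qquad
\frac{\partial FL}{\partial z} = (1-p)^{\gamma}\bigl(\gamma\,p\log p + p - 1\bigr), \qquad p = \sigma(z),
\]

so the stored delta is \((1-p)^{\gamma}(1 - p - \gamma\,p\log p)\), which reduces to the usual \(1-p\) when \(\gamma = 0\). The remaining classes are handled symmetrically with y = +1 and p replaced by one minus their predicted probability, and log(p) is clamped to -10 for numerical safety.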
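
The new logistic and l2norm layers are thin wrappers: forward_logistic_layer applies a sigmoid and, when truth is present, accumulates the cross-entropy loss via logistic_x_ent_cpu, while forward_l2norm_layer calls l2normalize_cpu(l.output, l.scales, l.batch, l.out_c, l.out_w*l.out_h), which, judging by its name and arguments, normalizes the channel vector at every spatial position to unit length, i.e.

\[
x_{b,f,i} \leftarrow \frac{x_{b,f,i}}{\sqrt{\sum_{f'} x_{b,f',i}^{2} + \varepsilon}}
\]

(the stabilizing \(\varepsilon\) is an assumption; the implementation lives in blas.c and is not part of this diff).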
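
The new [upsample] layer delegates the actual work to upsample_cpu/upsample_gpu, which this patch does not show. From make_upsample_layer (out_w = w*stride, unchanged channel count, a reverse flag that turns the layer into a downsample, and a scale factor) the operation is presumably a nearest-neighbour copy; below is a sketch of that assumed behaviour, with the argument order mirrored from the call sites above (forward == 1 writes the enlarged tensor, forward == 0 accumulates it back into the small one, as the backward pass requires):

/* Assumed (not shown in this patch): nearest-neighbour upsampling over a
 * batch of CHW tensors. `in` is w x h x c, `out` is (w*stride) x (h*stride) x c. */
void upsample_cpu(float *in, int w, int h, int c, int batch, int stride,
                  int forward, float scale, float *out)
{
    int i, j, k, b;
    for (b = 0; b < batch; ++b) {
        for (k = 0; k < c; ++k) {
            for (j = 0; j < h * stride; ++j) {
                for (i = 0; i < w * stride; ++i) {
                    int in_index  = b*w*h*c + k*w*h + (j/stride)*w + i/stride;
                    int out_index = b*w*h*c*stride*stride + k*w*h*stride*stride
                                    + j*w*stride + i;
                    if (forward) out[out_index]  = scale * in[in_index];   /* broadcast each pixel to a stride x stride block */
                    else         in[in_index]   += scale * out[out_index]; /* gather the block back (gradient accumulation) */
                }
            }
        }
    }
}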