adding yolo9000

2023-08-10 21:13:14 +03:00 · 2017-01-04 04:44:00 -08:00
parent 2710d63257
commit d2dece3df7
29 changed files with 19591 additions and 596 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,6 +3,8 @@
 *.csv
 *.out
 *.png
+*.jpg
+old/
 mnist/
 data/
 caffe/
--- a/cfg/coco.data
+++ b/cfg/coco.data
@ -1,7 +1,7 @@
 classes= 80
 train  = /home/pjreddie/data/coco/trainvalno5k.txt
-valid  = coco_testdev
-#valid = data/coco_val_5k.list
+#valid  = coco_testdev
+valid = data/coco_val_5k.list
 names = data/coco.names
 backup = /home/pjreddie/backup/
 eval=coco
--- a/cfg/voc.data
+++ b/cfg/voc.data
@ -2,5 +2,5 @@ classes= 20
 train  = /home/pjreddie/data/voc/train.txt
 valid  = /home/pjreddie/data/voc/2007_test.txt
 names = data/voc.names
-backup = /home/pjreddie/backup/
+backup = backup

--- a/cfg/yolo9000.cfg
+++ b/cfg/yolo9000.cfg
@ -0,0 +1,211 @@
+[net]
+batch=1
+subdivisions=1
+height=416
+width=416
+channels=3
+momentum=0.9
+decay=0.0005
+
+learning_rate=0.00001
+max_batches = 242200
+policy=steps
+steps=500,200000,240000
+scales=10,.1,.1
+
+hue=.1
+saturation=.75
+exposure=.75
+
+[convolutional]
+batch_normalize=1
+filters=32
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=64
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=128
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=256
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[maxpool]
+size=2
+stride=2
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=512
+size=1
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+batch_normalize=1
+filters=1024
+size=3
+stride=1
+pad=1
+activation=leaky
+
+[convolutional]
+filters=28269
+size=1
+stride=1
+pad=1
+activation=linear
+
+[region]
+anchors = 0.77871, 1.14074, 3.00525, 4.31277, 9.22725, 9.61974
+bias_match=1
+classes=9418
+coords=4
+num=3
+softmax=1
+jitter=.2
+rescore=1
+
+object_scale=5
+noobject_scale=1
+class_scale=1
+coord_scale=1
+
+thresh = .6
+absolute=1
+random=1
+
+tree=data/9k.tree
+map = data/coco9k.map
--- a/data/9k.labels
+++ b/data/9k.labels
--- a/data/9k.names
+++ b/data/9k.names
--- a/data/coco9k.map
+++ b/data/coco9k.map
@ -0,0 +1,80 @@
+5177
+3768
+3802
+3800
+4107
+4072
+4071
+3797
+4097
+2645
+5150
+2644
+3257
+2523
+6527
+6866
+6912
+7342
+7255
+7271
+7217
+6858
+7343
+7233
+3704
+4374
+3641
+5001
+3899
+2999
+2631
+5141
+2015
+1133
+1935
+1930
+5144
+5143
+2371
+3916
+3745
+3640
+4749
+4736
+4735
+3678
+58
+42
+771
+81
+152
+141
+786
+700
+218
+791
+2518
+2521
+3637
+2458
+2505
+2519
+3499
+2837
+3503
+2597
+3430
+2080
+5103
+5111
+5102
+3013
+5096
+1102
+3218
+4010
+2266
+1127
+5122
+2360
--- a/data/inet9k.map
+++ b/data/inet9k.map
@ -0,0 +1,200 @@
+2687
+4107
+8407
+7254
+42
+6797
+127
+2268
+2442
+3704
+260
+1970
+58
+4443
+2661
+2043
+2039
+4858
+4007
+6858
+8408
+166
+2523
+3768
+4347
+6527
+2446
+5005
+3274
+3678
+4918
+709
+4072
+8428
+7223
+2251
+3802
+3848
+7271
+2677
+8267
+2849
+2518
+2738
+3746
+5105
+3430
+3503
+2249
+1841
+2032
+2358
+122
+3984
+4865
+3246
+5095
+6912
+6878
+8467
+2741
+1973
+3057
+7217
+1872
+44
+2452
+3637
+2704
+6917
+2715
+6734
+2325
+6864
+6677
+2035
+1949
+338
+2664
+5122
+1844
+784
+2223
+7188
+2719
+2670
+4830
+158
+4818
+7228
+1965
+7342
+786
+2095
+8281
+8258
+7406
+3915
+8382
+2437
+2837
+82
+6871
+1876
+7447
+8285
+5007
+2740
+3463
+5103
+3755
+4910
+6809
+3800
+118
+3396
+3092
+2709
+81
+7105
+4036
+2366
+1846
+5177
+2684
+64
+2041
+3919
+700
+3724
+1742
+39
+807
+7184
+2256
+235
+2778
+2996
+2030
+3714
+7167
+2369
+6705
+6861
+5096
+2597
+2166
+2036
+3228
+3747
+2711
+8300
+2226
+7153
+7255
+2631
+7109
+8242
+7445
+3776
+3803
+3690
+2025
+2521
+2316
+7190
+8249
+3352
+2639
+2887
+100
+4219
+3344
+5008
+7224
+3351
+2434
+2074
+2034
+8304
+5004
+6868
+5102
+2645
+4071
+2716
+2717
+7420
+3499
+3763
+5084
+2676
+2046
+5107
+5097
+3944
+4097
+7132
+3956
+7343
--- a/src/box.c
+++ b/src/box.c
@ -246,6 +246,34 @@ int nms_comparator(const void *pa, const void *pb)
    return 0;
 }

+void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh) // Non-max suppression keyed on the score in column `classes` of each probs row: an overlapping box later in sorted order gets its whole row zeroed.
+{ // boxes/probs hold `total` entries; every probs[i] must provide at least classes+1 floats (column `classes` is read and cleared below); thresh is the IoU cutoff.
+    int i, j, k;
+    sortable_bbox *s = calloc(total, sizeof(sortable_bbox)); // NOTE(review): result is not checked for NULL before use
+
+    for(i = 0; i < total; ++i){
+        s[i].index = i;       // position of this entry in boxes/probs
+        s[i].class = classes; // column index carried in the sort record handed to nms_comparator
+        s[i].probs = probs;
+    }
+
+    qsort(s, total, sizeof(sortable_bbox), nms_comparator); // ordering comes from nms_comparator (defined earlier in box.c; presumably highest score first -- confirm)
+    for(i = 0; i < total; ++i){
+        if(probs[s[i].index][classes] == 0) continue; // zero score (including rows cleared below): never acts as a suppressor
+        box a = boxes[s[i].index];
+        for(j = i+1; j < total; ++j){ // only entries later in sorted order can be suppressed by a
+            box b = boxes[s[j].index];
+            if (box_iou(a, b) > thresh){
+                for(k = 0; k < classes+1; ++k){ // clears columns 0..classes inclusive
+                    probs[s[j].index][k] = 0;
+                }
+            }
+        }
+    }
+    free(s);
+}
+
+
 void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh)
 {
    int i, j, k;
--- a/src/box.h
+++ b/src/box.h
@ -15,6 +15,7 @@ float box_rmse(box a, box b);
 dbox diou(box a, box b);
 void do_nms(box *boxes, float **probs, int total, int classes, float thresh);
 void do_nms_sort(box *boxes, float **probs, int total, int classes, float thresh);
+void do_nms_obj(box *boxes, float **probs, int total, int classes, float thresh);
 box decode_box(box b, box anchor);
 box encode_box(box b, box anchor);

--- a/src/coco.c
+++ b/src/coco.c
@ -384,5 +384,5 @@ void run_coco(int argc, char **argv)
    else if(0==strcmp(argv[2], "train")) train_coco(cfg, weights);
    else if(0==strcmp(argv[2], "valid")) validate_coco(cfg, weights);
    else if(0==strcmp(argv[2], "recall")) validate_coco_recall(cfg, weights);
-    else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, coco_classes, 80, frame_skip, prefix);
+    else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, coco_classes, 80, frame_skip, prefix, .5);
 }
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@ -127,6 +127,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)

 void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 {
+    //constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);

    backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h);
--- a/src/cpu_gemm.c
+++ b/src/cpu_gemm.c
@ -1,91 +0,0 @@
-#include "mini_blas.h"
-
-void cpu_gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA, // Accumulates C[i*ldc+j] += ALPHA * A[i*lda+k] * B[k*ldb+j] for i<M, k<K, j<N (neither operand read transposed).
-        float *A, int lda, 
-        float *B, int ldb,
-        float BETA, // TA, TB and BETA are accepted but never read in this body
-        float *C, int ldc)
-{
-    int i,j,k;
-    for(i = 0; i < M; ++i){
-        for(k = 0; k < K; ++k){
-            register float A_PART = ALPHA*A[i*lda+k]; // hoisted: does not depend on j
-            for(j = 0; j < N; ++j){
-                C[i*ldc+j] += A_PART*B[k*ldb+j];
-            }
-        }
-    }
-}
-
-void cpu_gemm_nt(int TA, int TB, int M, int N, int K, float ALPHA, // Accumulates C[i*ldc+j] += sum over k of ALPHA * A[i*lda+k] * B[k+j*ldb] (B is indexed with j as the row, i.e. read transposed).
-        float *A, int lda, 
-        float *B, int ldb,
-        float BETA, // TA, TB and BETA are accepted but never read in this body
-        float *C, int ldc)
-{
-    int i,j,k;
-    for(i = 0; i < M; ++i){
-        for(j = 0; j < N; ++j){
-            register float sum = 0; // per-output accumulator, added to C once after the k loop
-            for(k = 0; k < K; ++k){
-                sum += ALPHA*A[i*lda+k]*B[k+j*ldb];
-            }
-            C[i*ldc+j] += sum;
-        }
-    }
-}
-
-void cpu_gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA, // Accumulates C[i*ldc+j] += ALPHA * A[k*lda+i] * B[k*ldb+j] for i<M, k<K, j<N (A is indexed with k as the row, i.e. read transposed).
-        float *A, int lda, 
-        float *B, int ldb,
-        float BETA, // TA, TB and BETA are accepted but never read in this body
-        float *C, int ldc)
-{
-    int i,j,k;
-    for(i = 0; i < M; ++i){
-        for(k = 0; k < K; ++k){
-            register float A_PART = ALPHA*A[k*lda+i]; // hoisted: does not depend on j
-            for(j = 0; j < N; ++j){
-                C[i*ldc+j] += A_PART*B[k*ldb+j];
-            }
-        }
-    }
-}
-void cpu_gemm_tt(int TA, int TB, int M, int N, int K, float ALPHA, // Accumulates C[i*ldc+j] += ALPHA * A[i+k*lda] * B[k+j*ldb] for i<M, j<N, k<K (both A and B are read transposed).
-        float *A, int lda, 
-        float *B, int ldb,
-        float BETA, // TA, TB and BETA are accepted but never read in this body
-        float *C, int ldc)
-{
-    int i,j,k;
-    for(i = 0; i < M; ++i){
-        for(j = 0; j < N; ++j){
-            for(k = 0; k < K; ++k){
-                C[i*ldc+j] += ALPHA*A[i+k*lda]*B[k+j*ldb]; // writes straight into C on every k step (no local accumulator)
-            }
-        }
-    }
-}
-
-
-void cpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA, // Entry point: scales the M x N block of C by BETA, then adds the ALPHA-weighted product via the variant selected by the TA/TB flags.
-        float *A, int lda, 
-        float *B, int ldb,
-        float BETA,
-        float *C, int ldc)
-{
-    int i, j;
-    for(i = 0; i < M; ++i){
-        for(j = 0; j < N; ++j){
-            C[i*ldc + j] *= BETA; // BETA is applied here, before any product terms are accumulated
-        }
-    }
-    if(!TA && !TB) // TA/TB nonzero selects the variant that reads A/B transposed
-        cpu_gemm_nn( TA,  TB,  M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
-    else if(TA && !TB)
-        cpu_gemm_tn( TA,  TB,  M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
-    else if(!TA && TB)
-        cpu_gemm_nt( TA,  TB,  M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
-    else
-        cpu_gemm_tt( TA,  TB,  M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
-}
--- a/src/darknet.c
+++ b/src/darknet.c
@ -13,7 +13,7 @@
 #endif

 extern void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename, int top);
-extern void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh);
+extern void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh);
 extern void run_voxel(int argc, char **argv);
 extern void run_yolo(int argc, char **argv);
 extern void run_detector(int argc, char **argv);
@ -129,7 +129,9 @@ void oneoff(char *cfgfile, char *weightfile, char *outfile)
    network net = parse_network_cfg(cfgfile);
    int oldn = net.layers[net.n - 2].n;
    int c = net.layers[net.n - 2].c;
-    net.layers[net.n - 2].n = 9372;
+    scal_cpu(oldn*c, .1, net.layers[net.n - 2].weights, 1);
+    scal_cpu(oldn, 0, net.layers[net.n - 2].biases, 1);
+    net.layers[net.n - 2].n = 9418;
    net.layers[net.n - 2].biases += 5;
    net.layers[net.n - 2].weights += 5*c;
    if(weightfile){
@ -383,7 +385,7 @@ int main(int argc, char **argv)
    } else if (0 == strcmp(argv[1], "detect")){
        float thresh = find_float_arg(argc, argv, "-thresh", .24);
        char *filename = (argc > 4) ? argv[4]: 0;
-        test_detector("cfg/coco.data", argv[2], argv[3], filename, thresh);
+        test_detector("cfg/coco.data", argv[2], argv[3], filename, thresh, .5);
    } else if (0 == strcmp(argv[1], "cifar")){
        run_cifar(argc, argv);
    } else if (0 == strcmp(argv[1], "go")){
--- a/src/data.c
+++ b/src/data.c
@ -267,7 +267,7 @@ void fill_truth_region(char *path, float *truth, int classes, int num_boxes, int
        h =  boxes[i].h;
        id = boxes[i].id;

-        if (w < .01 || h < .01) continue;
+        if (w < .005 || h < .005) continue;

        int col = (int)(x*num_boxes);
        int row = (int)(y*num_boxes);
@ -317,7 +317,7 @@ void fill_truth_detection(char *path, int num_boxes, float *truth, int classes,
        h =  boxes[i].h;
        id = boxes[i].id;

-        if ((w < .01 || h < .01)) continue;
+        if ((w < .005 || h < .005)) continue;

        truth[i*5+0] = x;
        truth[i*5+1] = y;
--- a/src/demo.c
+++ b/src/demo.c
@ -31,6 +31,7 @@ static image disp = {0};
 static CvCapture * cap;
 static float fps = 0;
 static float demo_thresh = 0;
+static float demo_hier_thresh = .5;

 static float *predictions[FRAMES];
 static int demo_index = 0;
@ -63,7 +64,7 @@ void *detect_in_thread(void *ptr)
    if(l.type == DETECTION){
        get_detection_boxes(l, 1, 1, demo_thresh, probs, boxes, 0);
    } else if (l.type == REGION){
-        get_region_boxes(l, 1, 1, demo_thresh, probs, boxes, 0, 0);
+        get_region_boxes(l, 1, 1, demo_thresh, probs, boxes, 0, 0, demo_hier_thresh);
    } else {
        error("Last layer must produce detections\n");
    }
@ -91,7 +92,7 @@ double get_wall_time()
    return (double)time.tv_sec + (double)time.tv_usec * .000001;
 }

-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix)
+void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, float hier_thresh)
 {
    //skip = frame_skip;
    image **alphabet = load_alphabet();
@ -100,6 +101,7 @@ void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const ch
    demo_alphabet = alphabet;
    demo_classes = classes;
    demo_thresh = thresh;
+    demo_hier_thresh = hier_thresh;
    printf("Demo\n");
    net = parse_network_cfg(cfgfile);
    if(weightfile){
@ -127,7 +129,7 @@ void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const ch

    boxes = (box *)calloc(l.w*l.h*l.n, sizeof(box));
    probs = (float **)calloc(l.w*l.h*l.n, sizeof(float *));
-    for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = (float *)calloc(l.classes, sizeof(float *));
+    for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = (float *)calloc(l.classes, sizeof(float));

    pthread_t fetch_thread;
    pthread_t detect_thread;
@ -213,7 +215,7 @@ void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const ch
    }
 }
 #else
-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix)
+void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, float hier_thresh) // Fallback definition in the #else branch: same signature as the real demo, but no argument is used.
 {
    fprintf(stderr, "Demo needs OpenCV for webcam images.\n"); // only effect: report on stderr that the demo is unavailable in this build
 }
--- a/src/demo.h
+++ b/src/demo.h
@ -2,6 +2,6 @@
 #define DEMO

 #include "image.h"
-void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix);
+void demo(char *cfgfile, char *weightfile, float thresh, int cam_index, const char *filename, char **names, int classes, int frame_skip, char *prefix, float hier_thresh);

 #endif
--- a/src/detector.c
+++ b/src/detector.c
@ -81,7 +81,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
        if(l.random && count++%10 == 0){
            printf("Resizing\n");
            int dim = (rand() % 10 + 10) * 32;
-            if (get_current_batch(net)+100 > net.max_batches) dim = 544;
+            if (get_current_batch(net)+200 > net.max_batches) dim = 608;
            //int dim = (rand() % 4 + 16) * 32;
            printf("%d\n", dim);
            args.w = dim;
@ -231,7 +231,7 @@ void print_imagenet_detections(FILE *fp, int id, box *boxes, float **probs, int
    }
 }

-void validate_detector(char *datacfg, char *cfgfile, char *weightfile)
+void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *outfile)
 {
    int j;
    list *options = read_data_cfg(datacfg);
@ -251,7 +251,6 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile)
    fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
    srand(time(0));

-    char *base = "comp4_det_test_";
    list *plist = get_paths(valid_images);
    char **paths = (char **)list_to_array(plist);

@ -265,19 +264,22 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile)
    int coco = 0;
    int imagenet = 0;
    if(0==strcmp(type, "coco")){
-        snprintf(buff, 1024, "%s/coco_results.json", prefix);
+        if(!outfile) outfile = "coco_results";
+        snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
        fp = fopen(buff, "w");
        fprintf(fp, "[\n");
        coco = 1;
    } else if(0==strcmp(type, "imagenet")){
-        snprintf(buff, 1024, "%s/imagenet-detection.txt", prefix);
+        if(!outfile) outfile = "imagenet-detection";
+        snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
        fp = fopen(buff, "w");
        imagenet = 1;
        classes = 200;
    } else {
+        if(!outfile) outfile = "comp4_det_test_";
        fps = calloc(classes, sizeof(FILE *));
        for(j = 0; j < classes; ++j){
-            snprintf(buff, 1024, "%s/%s%s.txt", prefix, base, names[j]);
+            snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
            fps[j] = fopen(buff, "w");
        }
    }
@ -333,7 +335,7 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile)
            network_predict(net, X);
            int w = val[t].w;
            int h = val[t].h;
-            get_region_boxes(l, w, h, thresh, probs, boxes, 0, map);
+            get_region_boxes(l, w, h, thresh, probs, boxes, 0, map, .5);
            if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, classes, nms);
            if (coco){
                print_cocos(fp, path, boxes, probs, l.w*l.h*l.n, classes, w, h);
@ -397,7 +399,7 @@ void validate_detector_recall(char *cfgfile, char *weightfile)
        image sized = resize_image(orig, net.w, net.h);
        char *id = basecfg(path);
        network_predict(net, sized.data);
-        get_region_boxes(l, 1, 1, thresh, probs, boxes, 1, 0);
+        get_region_boxes(l, 1, 1, thresh, probs, boxes, 1, 0, .5);
        if (nms) do_nms(boxes, probs, l.w*l.h*l.n, 1, nms);

        char labelpath[4096];
@ -436,7 +438,7 @@ void validate_detector_recall(char *cfgfile, char *weightfile)
    }
 }

-void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh)
+void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filename, float thresh, float hier_thresh)
 {
    list *options = read_data_cfg(datacfg);
    char *name_list = option_find_str(options, "names", "data/names.list");
@ -470,14 +472,15 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam

        box *boxes = calloc(l.w*l.h*l.n, sizeof(box));
        float **probs = calloc(l.w*l.h*l.n, sizeof(float *));
-        for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *));
+        for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes + 1, sizeof(float *));

        float *X = sized.data;
        time=clock();
        network_predict(net, X);
        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
-        get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0);
-        if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms);
+        get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0, hier_thresh);
+        if (l.softmax_tree && nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms);
+        else if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms);
        draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes);
        save_image(im, "predictions");
        show_image(im, "predictions");
@ -498,6 +501,7 @@ void run_detector(int argc, char **argv)
 {
    char *prefix = find_char_arg(argc, argv, "-prefix", 0);
    float thresh = find_float_arg(argc, argv, "-thresh", .24);
+    float hier_thresh = find_float_arg(argc, argv, "-hier", .5);
    int cam_index = find_int_arg(argc, argv, "-c", 0);
    int frame_skip = find_int_arg(argc, argv, "-s", 0);
    if(argc < 4){
@ -505,6 +509,7 @@ void run_detector(int argc, char **argv)
        return;
    }
    char *gpu_list = find_char_arg(argc, argv, "-gpus", 0);
+    char *outfile = find_char_arg(argc, argv, "-out", 0);
    int *gpus = 0;
    int gpu = 0;
    int ngpus = 0;
@ -533,15 +538,15 @@ void run_detector(int argc, char **argv)
    char *cfg = argv[4];
    char *weights = (argc > 5) ? argv[5] : 0;
    char *filename = (argc > 6) ? argv[6]: 0;
-    if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh);
+    if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh);
    else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear);
-    else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights);
+    else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile);
    else if(0==strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights);
    else if(0==strcmp(argv[2], "demo")) {
        list *options = read_data_cfg(datacfg);
        int classes = option_find_int(options, "classes", 20);
        char *name_list = option_find_str(options, "names", "data/names.list");
        char **names = get_labels(name_list);
-        demo(cfg, weights, thresh, cam_index, filename, names, classes, frame_skip, prefix);
+        demo(cfg, weights, thresh, cam_index, filename, names, classes, frame_skip, prefix, hier_thresh);
    }
 }
--- a/src/layer.c
+++ b/src/layer.c
@ -11,34 +11,88 @@ void free_layer(layer l)
 #endif
        return;
    }
-    if(l.indexes)        free(l.indexes);
-    if(l.rand)           free(l.rand);
-    if(l.cost)           free(l.cost);
-    if(l.biases)         free(l.biases);
-    if(l.bias_updates)   free(l.bias_updates);
-    if(l.weights)        free(l.weights);
-    if(l.weight_updates) free(l.weight_updates);
-    if(l.col_image)      free(l.col_image);
-    if(l.input_layers)   free(l.input_layers);
-    if(l.input_sizes)    free(l.input_sizes);
-    if(l.delta)          free(l.delta);
-    if(l.output)         free(l.output);
-    if(l.squared)        free(l.squared);
-    if(l.norms)          free(l.norms);
+    if(l.cweights)           free(l.cweights);
+    if(l.indexes)            free(l.indexes);
+    if(l.input_layers)       free(l.input_layers);
+    if(l.input_sizes)        free(l.input_sizes);
+    if(l.map)                free(l.map);
+    if(l.rand)               free(l.rand);
+    if(l.cost)               free(l.cost);
+    if(l.state)              free(l.state);
+    if(l.prev_state)         free(l.prev_state);
+    if(l.forgot_state)       free(l.forgot_state);
+    if(l.forgot_delta)       free(l.forgot_delta);
+    if(l.state_delta)        free(l.state_delta);
+    if(l.concat)             free(l.concat);
+    if(l.concat_delta)       free(l.concat_delta);
+    if(l.binary_weights)     free(l.binary_weights);
+    if(l.biases)             free(l.biases);
+    if(l.bias_updates)       free(l.bias_updates);
+    if(l.scales)             free(l.scales);
+    if(l.scale_updates)      free(l.scale_updates);
+    if(l.weights)            free(l.weights);
+    if(l.weight_updates)     free(l.weight_updates);
+    if(l.col_image)          free(l.col_image);
+    if(l.delta)              free(l.delta);
+    if(l.output)             free(l.output);
+    if(l.squared)            free(l.squared);
+    if(l.norms)              free(l.norms);
+    if(l.spatial_mean)       free(l.spatial_mean);
+    if(l.mean)               free(l.mean);
+    if(l.variance)           free(l.variance);
+    if(l.mean_delta)         free(l.mean_delta);
+    if(l.variance_delta)     free(l.variance_delta);
+    if(l.rolling_mean)       free(l.rolling_mean);
+    if(l.rolling_variance)   free(l.rolling_variance);
+    if(l.x)                  free(l.x);
+    if(l.x_norm)             free(l.x_norm);
+    if(l.m)                  free(l.m);
+    if(l.v)                  free(l.v);
+    if(l.z_cpu)              free(l.z_cpu);
+    if(l.r_cpu)              free(l.r_cpu);
+    if(l.h_cpu)              free(l.h_cpu);
+    if(l.binary_input)       free(l.binary_input);

 #ifdef GPU
-    if(l.indexes_gpu)          cuda_free((float *)l.indexes_gpu);
-    if(l.weights_gpu)          cuda_free(l.weights_gpu);
-    if(l.weight_updates_gpu)   cuda_free(l.weight_updates_gpu);
-    if(l.col_image_gpu)        cuda_free(l.col_image_gpu);
-    if(l.weights_gpu)          cuda_free(l.weights_gpu);
-    if(l.biases_gpu)           cuda_free(l.biases_gpu);
-    if(l.weight_updates_gpu)   cuda_free(l.weight_updates_gpu);
-    if(l.bias_updates_gpu)     cuda_free(l.bias_updates_gpu);
-    if(l.output_gpu)           cuda_free(l.output_gpu);
-    if(l.delta_gpu)            cuda_free(l.delta_gpu);
-    if(l.rand_gpu)             cuda_free(l.rand_gpu);
-    if(l.squared_gpu)          cuda_free(l.squared_gpu);
-    if(l.norms_gpu)            cuda_free(l.norms_gpu);
+    if(l.indexes_gpu)           cuda_free((float *)l.indexes_gpu);
+
+    if(l.z_gpu)                   cuda_free(l.z_gpu);
+    if(l.r_gpu)                   cuda_free(l.r_gpu);
+    if(l.h_gpu)                   cuda_free(l.h_gpu);
+    if(l.m_gpu)                   cuda_free(l.m_gpu);
+    if(l.v_gpu)                   cuda_free(l.v_gpu);
+    if(l.prev_state_gpu)          cuda_free(l.prev_state_gpu);
+    if(l.forgot_state_gpu)        cuda_free(l.forgot_state_gpu);
+    if(l.forgot_delta_gpu)        cuda_free(l.forgot_delta_gpu);
+    if(l.state_gpu)               cuda_free(l.state_gpu);
+    if(l.state_delta_gpu)         cuda_free(l.state_delta_gpu);
+    if(l.gate_gpu)                cuda_free(l.gate_gpu);
+    if(l.gate_delta_gpu)          cuda_free(l.gate_delta_gpu);
+    if(l.save_gpu)                cuda_free(l.save_gpu);
+    if(l.save_delta_gpu)          cuda_free(l.save_delta_gpu);
+    if(l.concat_gpu)              cuda_free(l.concat_gpu);
+    if(l.concat_delta_gpu)        cuda_free(l.concat_delta_gpu);
+    if(l.binary_input_gpu)        cuda_free(l.binary_input_gpu);
+    if(l.binary_weights_gpu)      cuda_free(l.binary_weights_gpu);
+    if(l.mean_gpu)                cuda_free(l.mean_gpu);
+    if(l.variance_gpu)            cuda_free(l.variance_gpu);
+    if(l.rolling_mean_gpu)        cuda_free(l.rolling_mean_gpu);
+    if(l.rolling_variance_gpu)    cuda_free(l.rolling_variance_gpu);
+    if(l.variance_delta_gpu)      cuda_free(l.variance_delta_gpu);
+    if(l.mean_delta_gpu)          cuda_free(l.mean_delta_gpu);
+    if(l.col_image_gpu)           cuda_free(l.col_image_gpu);
+    if(l.x_gpu)                   cuda_free(l.x_gpu);
+    if(l.x_norm_gpu)              cuda_free(l.x_norm_gpu);
+    if(l.weights_gpu)             cuda_free(l.weights_gpu);
+    if(l.weight_updates_gpu)      cuda_free(l.weight_updates_gpu);
+    if(l.biases_gpu)              cuda_free(l.biases_gpu);
+    if(l.bias_updates_gpu)        cuda_free(l.bias_updates_gpu);
+    if(l.scales_gpu)              cuda_free(l.scales_gpu);
+    if(l.scale_updates_gpu)       cuda_free(l.scale_updates_gpu);
+    if(l.output_gpu)              cuda_free(l.output_gpu);
+    if(l.delta_gpu)               cuda_free(l.delta_gpu);
+    if(l.rand_gpu)                cuda_free(l.rand_gpu);
+    if(l.squared_gpu)             cuda_free(l.squared_gpu);
+    if(l.norms_gpu)               cuda_free(l.norms_gpu);
 #endif
 }
--- a/src/layer.h
+++ b/src/layer.h
@ -99,14 +99,7 @@ struct layer{
    float B1;
    float B2;
    float eps;
-    float *m_gpu;
-    float *v_gpu;
    int t;
-    float *m;
-    float *v;
-
-    tree *softmax_tree;
-    int  *map;

    float alpha;
    float beta;
@ -129,33 +122,34 @@ struct layer{
    float probability;
    float scale;

-    int *indexes;
-    float *rand;
-    float *cost;
-    char  *cweights;
-    float *state;
-    float *prev_state;
-    float *forgot_state;
-    float *forgot_delta;
-    float *state_delta;
-
-    float *concat;
-    float *concat_delta;
-
-    float *binary_weights;
-
-    float *biases;
-    float *bias_updates;
-
-    float *scales;
-    float *scale_updates;
-
-    float *weights;
-    float *weight_updates;
-
-    float *col_image;
+    char  * cweights;
+    int   * indexes;
    int   * input_layers;
    int   * input_sizes;
+    int   * map;
+    float * rand;
+    float * cost;
+    float * state;
+    float * prev_state;
+    float * forgot_state;
+    float * forgot_delta;
+    float * state_delta;
+
+    float * concat;
+    float * concat_delta;
+
+    float * binary_weights;
+
+    float * biases;
+    float * bias_updates;
+
+    float * scales;
+    float * scale_updates;
+
+    float * weights;
+    float * weight_updates;
+
+    float * col_image;
    float * delta;
    float * output;
    float * squared;
@ -174,6 +168,15 @@ struct layer{
    float * x;
    float * x_norm;

+    float * m;
+    float * v;
+
+    float * z_cpu;
+    float * r_cpu;
+    float * h_cpu;
+
+    float * binary_input;
+
    struct layer *input_layer;
    struct layer *self_layer;
    struct layer *output_layer;
@ -194,20 +197,20 @@ struct layer{
    struct layer *input_h_layer;
    struct layer *state_h_layer;

-    float *z_cpu;
-    float *r_cpu;
-    float *h_cpu;
-
-    float *binary_input;
+    tree *softmax_tree;

    size_t workspace_size;

    #ifdef GPU
+    int *indexes_gpu;
+
    float *z_gpu;
    float *r_gpu;
    float *h_gpu;

-    int *indexes_gpu;
+    float *m_gpu;
+    float *v_gpu;
+
    float * prev_state_gpu;
    float * forgot_state_gpu;
    float * forgot_delta_gpu;
--- a/src/parser.c
+++ b/src/parser.c
@ -826,7 +826,7 @@ void save_weights_upto(network net, char *filename, int cutoff)
    }
 #endif
    fprintf(stderr, "Saving weights to %s\n", filename);
-    FILE *fp = fopen(filename, "w");
+    FILE *fp = fopen(filename, "wb");
    if(!fp) file_error(filename);

    int major = 0;
--- a/src/region_layer.c
+++ b/src/region_layer.c
@ -9,11 +9,9 @@
 #include <string.h>
 #include <stdlib.h>

-#define DOABS 1
-
-region_layer make_region_layer(int batch, int w, int h, int n, int classes, int coords)
+layer make_region_layer(int batch, int w, int h, int n, int classes, int coords)
 {
-    region_layer l = {0};
+    layer l = {0};
    l.type = REGION;

    l.n = n;
@ -75,12 +73,8 @@ box get_region_box(float *x, float *biases, int n, int index, int i, int j, int
    box b;
    b.x = (i + logistic_activate(x[index + 0])) / w;
    b.y = (j + logistic_activate(x[index + 1])) / h;
-    b.w = exp(x[index + 2]) * biases[2*n];
-    b.h = exp(x[index + 3]) * biases[2*n+1];
-    if(DOABS){
-        b.w = exp(x[index + 2]) * biases[2*n]   / w;
-        b.h = exp(x[index + 3]) * biases[2*n+1] / h;
-    }
+    b.w = exp(x[index + 2]) * biases[2*n]   / w;
+    b.h = exp(x[index + 3]) * biases[2*n+1] / h;
    return b;
 }

@ -91,12 +85,8 @@ float delta_region_box(box truth, float *x, float *biases, int n, int index, int

    float tx = (truth.x*w - i);
    float ty = (truth.y*h - j);
-    float tw = log(truth.w / biases[2*n]);
-    float th = log(truth.h / biases[2*n + 1]);
-    if(DOABS){
-        tw = log(truth.w*w / biases[2*n]);
-        th = log(truth.h*h / biases[2*n + 1]);
-    }
+    float tw = log(truth.w*w / biases[2*n]);
+    float th = log(truth.h*h / biases[2*n + 1]);

    delta[index + 0] = scale * (tx - logistic_activate(x[index + 0])) * logistic_gradient(logistic_activate(x[index + 0]));
    delta[index + 1] = scale * (ty - logistic_activate(x[index + 1])) * logistic_gradient(logistic_activate(x[index + 1]));
@ -141,14 +131,14 @@ float tisnan(float x)
 }

 void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output);
-void forward_region_layer(const region_layer l, network_state state)
+void forward_region_layer(const layer l, network_state state)
 {
    int i,j,b,t,n;
    int size = l.coords + l.classes + 1;
    memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
-    #ifndef GPU
+#ifndef GPU
    flatten(l.output, l.w*l.h, size*l.n, l.batch, 1);
-    #endif
+#endif
    for (b = 0; b < l.batch; ++b){
        for(i = 0; i < l.h*l.w*l.n; ++i){
            int index = size*i + b*l.outputs;
@ -197,6 +187,7 @@ void forward_region_layer(const region_layer l, network_state state)
                    for(n = 0; n < l.n*l.w*l.h; ++n){
                        int index = size*n + b*l.outputs + 5;
                        float scale =  l.output[index-1];
+                        l.delta[index - 1] = l.noobject_scale * ((0 - l.output[index - 1]) * logistic_gradient(l.output[index - 1]));
                        float p = scale*get_hierarchy_probability(l.output + index, l.softmax_tree, class);
                        if(p > maxp){
                            maxp = p;
@ -205,6 +196,8 @@ void forward_region_layer(const region_layer l, network_state state)
                    }
                    int index = size*maxi + b*l.outputs + 5;
                    delta_region_class(l.output, l.delta, index, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat);
+                    if(l.output[index - 1] < .3) l.delta[index - 1] = l.object_scale * ((.3 - l.output[index - 1]) * logistic_gradient(l.output[index - 1]));
+                    else  l.delta[index - 1] = 0;
                    ++class_count;
                    onlyclass = 1;
                    break;
@ -218,39 +211,26 @@ void forward_region_layer(const region_layer l, network_state state)
                    int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
                    box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
                    float best_iou = 0;
-                    int best_class = -1;
                    for(t = 0; t < 30; ++t){
                        box truth = float_to_box(state.truth + t*5 + b*l.truths);
                        if(!truth.x) break;
                        float iou = box_iou(pred, truth);
                        if (iou > best_iou) {
-                            best_class = state.truth[t*5 + b*l.truths + 4];
                            best_iou = iou;
                        }
                    }
                    avg_anyobj += l.output[index + 4];
                    l.delta[index + 4] = l.noobject_scale * ((0 - l.output[index + 4]) * logistic_gradient(l.output[index + 4]));
-                    if(l.classfix == -1) l.delta[index + 4] = l.noobject_scale * ((best_iou - l.output[index + 4]) * logistic_gradient(l.output[index + 4]));
-                    else{
-                        if (best_iou > l.thresh) {
-                            l.delta[index + 4] = 0;
-                            if(l.classfix > 0){
-                                delta_region_class(l.output, l.delta, index + 5, best_class, l.classes, l.softmax_tree, l.class_scale*(l.classfix == 2 ? l.output[index + 4] : 1), &avg_cat);
-                                ++class_count;
-                            }
-                        }
+                    if (best_iou > l.thresh) {
+                        l.delta[index + 4] = 0;
                    }

                    if(*(state.net.seen) < 12800){
                        box truth = {0};
                        truth.x = (i + .5)/l.w;
                        truth.y = (j + .5)/l.h;
-                        truth.w = l.biases[2*n];
-                        truth.h = l.biases[2*n+1];
-                        if(DOABS){
-                            truth.w = l.biases[2*n]/l.w;
-                            truth.h = l.biases[2*n+1]/l.h;
-                        }
+                        truth.w = l.biases[2*n]/l.w;
+                        truth.h = l.biases[2*n+1]/l.h;
                        delta_region_box(truth, l.output, l.biases, n, index, i, j, l.w, l.h, l.delta, .01);
                    }
                }
@ -274,12 +254,8 @@ void forward_region_layer(const region_layer l, network_state state)
                int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs;
                box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h);
                if(l.bias_match){
-                    pred.w = l.biases[2*n];
-                    pred.h = l.biases[2*n+1];
-                    if(DOABS){
-                        pred.w = l.biases[2*n]/l.w;
-                        pred.h = l.biases[2*n+1]/l.h;
-                    }
+                    pred.w = l.biases[2*n]/l.w;
+                    pred.h = l.biases[2*n+1]/l.h;
                }
                //printf("pred: (%f, %f) %f x %f\n", pred.x, pred.y, pred.w, pred.h);
                pred.x = 0;
@ -313,19 +289,19 @@ void forward_region_layer(const region_layer l, network_state state)
        }
    }
    //printf("\n");
-    #ifndef GPU
+#ifndef GPU
    flatten(l.delta, l.w*l.h, size*l.n, l.batch, 0);
-    #endif
+#endif
    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f,  count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count);
 }

-void backward_region_layer(const region_layer l, network_state state)
+void backward_region_layer(const layer l, network_state state)
 {
    axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1);
 }

-void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map)
+void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh)
 {
    int i,j,n;
    float *predictions = l.output;
@ -336,7 +312,6 @@ void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *b
            int index = i*l.n + n;
            int p_index = index * (l.classes + 5) + 4;
            float scale = predictions[p_index];
-            if(l.classfix == -1 && scale < .5) scale = 0;
            int box_index = index * (l.classes + 5);
            boxes[index] = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h);
            boxes[index].x *= w;
@ -348,22 +323,15 @@ void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *b
            if(l.softmax_tree){

                hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0);
-                int found = 0;
                if(map){
                    for(j = 0; j < 200; ++j){
                        float prob = scale*predictions[class_index+map[j]];
                        probs[index][j] = (prob > thresh) ? prob : 0;
                    }
                } else {
-                    for(j = l.classes - 1; j >= 0; --j){
-                        if(!found && predictions[class_index + j] > .5){
-                            found = 1;
-                        } else {
-                            predictions[class_index + j] = 0;
-                        }
-                        float prob = predictions[class_index+j];
-                        probs[index][j] = (scale > thresh) ? prob : 0;
-                    }
+                    int j =  hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh);
+                    probs[index][j] = (scale > thresh) ? scale : 0;
+                    probs[index][l.classes] = scale;
                }
            } else {
                for(j = 0; j < l.classes; ++j){
@ -380,7 +348,7 @@ void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *b

 #ifdef GPU

-void forward_region_layer_gpu(const region_layer l, network_state state)
+void forward_region_layer_gpu(const layer l, network_state state)
 {
    /*
       if(!state.train){
@ -421,7 +389,7 @@ void forward_region_layer_gpu(const region_layer l, network_state state)
    if(cpu_state.truth) free(cpu_state.truth);
 }

-void backward_region_layer_gpu(region_layer l, network_state state)
+void backward_region_layer_gpu(layer l, network_state state)
 {
    flatten_ongpu(l.delta_gpu, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 0, state.delta);
 }
--- a/src/region_layer.h
+++ b/src/region_layer.h
@ -4,17 +4,15 @@
 #include "layer.h"
 #include "network.h"

-typedef layer region_layer;
-
-region_layer make_region_layer(int batch, int h, int w, int n, int classes, int coords);
-void forward_region_layer(const region_layer l, network_state state);
-void backward_region_layer(const region_layer l, network_state state);
-void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map);
+layer make_region_layer(int batch, int h, int w, int n, int classes, int coords);
+void forward_region_layer(const layer l, network_state state);
+void backward_region_layer(const layer l, network_state state);
+void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh);
 void resize_region_layer(layer *l, int w, int h);

 #ifdef GPU
-void forward_region_layer_gpu(const region_layer l, network_state state);
-void backward_region_layer_gpu(region_layer l, network_state state);
+void forward_region_layer_gpu(const layer l, network_state state);
+void backward_region_layer_gpu(layer l, network_state state);
 #endif

 #endif
--- a/src/server.c
+++ b/src/server.c
@ -1,205 +0,0 @@
-#include <stdio.h> /* needed for sockaddr_in */
-#include <string.h> /* needed for sockaddr_in */
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <netinet/in.h> /* needed for sockaddr_in */
-#include <netdb.h>
-#include <pthread.h>
-#include <time.h>
-
-#include "mini_blas.h"
-#include "utils.h"
-#include "parser.h"
-#include "server.h"
-#include "connected_layer.h"
-#include "convolutional_layer.h"
-
-#define SERVER_PORT 9423
-#define STR(x) #x
-
-int socket_setup(int server)
-{
-    int fd = 0;                         /* our socket */
-    struct sockaddr_in me;      /* our address */
-
-    /* create a UDP socket */
-
-    if ((fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
-        error("cannot create socket");
-    }
-
-    /* bind the socket to any valid IP address and a specific port */
-    if (server == 1){
-        bzero((char *) &me, sizeof(me));
-        me.sin_family = AF_INET;
-        me.sin_addr.s_addr = htonl(INADDR_ANY);
-        me.sin_port = htons(SERVER_PORT);
-
-        if (bind(fd, (struct sockaddr *)&me, sizeof(me)) < 0) {
-            error("bind failed");
-        }
-    }
-
-    return fd;
-}
-
-typedef struct{
-    int fd;
-    int counter;
-    network net;
-} connection_info;
-
-void read_and_add_into(int fd, float *a, int n)
-{
-    float *buff = calloc(n, sizeof(float));
-    read_all(fd, (char*) buff, n*sizeof(float));
-    axpy_cpu(n, 1, buff, 1, a, 1);
-    free(buff);
-}
-
-void handle_connection(void *pointer)
-{
-    connection_info info = *(connection_info *) pointer;
-    free(pointer);
-    //printf("New Connection\n");
-    if(info.counter%100==0){
-        char buff[256];
-        sprintf(buff, "unikitty/net_%d.part", info.counter);
-        save_network(info.net, buff);
-    }
-    int fd = info.fd;
-    network net = info.net;
-    int i;
-    for(i = 0; i < net.n; ++i){
-        if(net.types[i] == CONVOLUTIONAL){
-            convolutional_layer layer = *(convolutional_layer *) net.layers[i];
-
-            read_and_add_into(fd, layer.bias_updates, layer.n);
-            int num = layer.n*layer.c*layer.size*layer.size;
-            read_and_add_into(fd, layer.filter_updates, num);
-        }
-        if(net.types[i] == CONNECTED){
-            connected_layer layer = *(connected_layer *) net.layers[i];
-
-            read_and_add_into(fd, layer.bias_updates, layer.outputs);
-            read_and_add_into(fd, layer.weight_updates, layer.inputs*layer.outputs);
-        }
-    }
-    for(i = 0; i < net.n; ++i){
-        if(net.types[i] == CONVOLUTIONAL){
-            convolutional_layer layer = *(convolutional_layer *) net.layers[i];
-            update_convolutional_layer(layer);
-
-            write_all(fd, (char*) layer.biases, layer.n*sizeof(float));
-            int num = layer.n*layer.c*layer.size*layer.size;
-            write_all(fd, (char*) layer.filters, num*sizeof(float));
-        }
-        if(net.types[i] == CONNECTED){
-            connected_layer layer = *(connected_layer *) net.layers[i];
-            update_connected_layer(layer);
-            write_all(fd, (char *)layer.biases, layer.outputs*sizeof(float));
-            write_all(fd, (char *)layer.weights, layer.outputs*layer.inputs*sizeof(float));
-        }
-    }
-    //printf("Received updates\n");
-    close(fd);
-}
-
-void server_update(network net)
-{
-    int fd = socket_setup(1);
-    int counter = 18000;
-    listen(fd, 64);
-    struct sockaddr_in client;     /* remote address */
-    socklen_t client_size = sizeof(client);   /* length of addresses */
-    time_t t=0;
-    while(1){
-        connection_info *info = calloc(1, sizeof(connection_info));
-        info->net = net;
-        info->counter = counter;
-        pthread_t worker;
-        int connection = accept(fd, (struct sockaddr *) &client, &client_size);
-        if(!t) t=time(0);
-        info->fd = connection;
-        pthread_create(&worker, NULL, (void *) &handle_connection, info);
-        ++counter;
-        printf("%d\n", counter);
-        //if(counter == 1024) break;
-    }
-    close(fd);
-}
-
-void client_update(network net, char *address)
-{
-    int fd = socket_setup(0);
-
-    struct hostent *hp;     /* host information */
-    struct sockaddr_in server;    /* server address */
-
-    /* fill in the server's address and data */
-    bzero((char*)&server, sizeof(server));
-    server.sin_family = AF_INET;
-    server.sin_port = htons(SERVER_PORT);
-
-    /* look up the address of the server given its name */
-    hp = gethostbyname(address);
-    if (!hp) {
-        perror("no such host");
-        fprintf(stderr, "could not obtain address of %s\n", "localhost");
-    }
-
-    /* put the host's address into the server address structure */
-    memcpy((void *)&server.sin_addr, hp->h_addr_list[0], hp->h_length);
-    if (connect(fd, (struct sockaddr *) &server, sizeof(server)) < 0) {
-        error("error connecting");
-    }
-
-    /* send a message to the server */
-    int i;
-    //printf("Sending\n");
-    for(i = 0; i < net.n; ++i){
-        if(net.types[i] == CONVOLUTIONAL){
-            convolutional_layer layer = *(convolutional_layer *) net.layers[i];
-            write_all(fd, (char*) layer.bias_updates, layer.n*sizeof(float));
-            int num = layer.n*layer.c*layer.size*layer.size;
-            write_all(fd, (char*) layer.filter_updates, num*sizeof(float));
-            memset(layer.bias_updates, 0, layer.n*sizeof(float));
-            memset(layer.filter_updates, 0, num*sizeof(float));
-        }
-        if(net.types[i] == CONNECTED){
-            connected_layer layer = *(connected_layer *) net.layers[i];
-            write_all(fd, (char *)layer.bias_updates, layer.outputs*sizeof(float));
-            write_all(fd, (char *)layer.weight_updates, layer.outputs*layer.inputs*sizeof(float));
-            memset(layer.bias_updates, 0, layer.outputs*sizeof(float));
-            memset(layer.weight_updates, 0, layer.inputs*layer.outputs*sizeof(float));
-        }
-    }
-    //printf("Sent\n");
-
-    for(i = 0; i < net.n; ++i){
-        if(net.types[i] == CONVOLUTIONAL){
-            convolutional_layer layer = *(convolutional_layer *) net.layers[i];
-
-            read_all(fd, (char*) layer.biases, layer.n*sizeof(float));
-            int num = layer.n*layer.c*layer.size*layer.size;
-            read_all(fd, (char*) layer.filters, num*sizeof(float));
-
-#ifdef GPU
-            push_convolutional_layer(layer);
-            #endif
-        }
-        if(net.types[i] == CONNECTED){
-            connected_layer layer = *(connected_layer *) net.layers[i];
-
-            read_all(fd, (char *)layer.biases, layer.outputs*sizeof(float));
-            read_all(fd, (char *)layer.weights, layer.outputs*layer.inputs*sizeof(float));
-
-#ifdef GPU
-            push_connected_layer(layer);
-            #endif
-        }
-    }
-    //printf("Updated\n");
-    close(fd);
-}
--- a/src/server.h
+++ b/src/server.h
@ -1,4 +0,0 @@
-#include "network.h"
-
-void client_update(network net, char *address);
-void server_update(network net);
--- a/src/tree.c
+++ b/src/tree.c
@ -50,6 +50,57 @@ void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leave
    }
 }

+/*
+ * Walk the softmax tree from the top-level group (group 0) downwards, at each
+ * level taking the highest-scoring entry of the current group, and return the
+ * index of the deepest node whose accumulated score stays above `thresh`.
+ *
+ * predictions: per-node scores, indexed by node id.
+ * hier:        tree providing group_size/group_offset/child/parent.
+ * thresh:      minimum accumulated score (product of the per-level maxima)
+ *              required to descend one more level.
+ *
+ * Returns a leaf index when every level passes, otherwise the parent of the
+ * group that failed. If the top-level group itself fails there is no parent
+ * to fall back to (top-level nodes may carry a negative parent id), so the
+ * best top-level entry is returned instead; the result is used as an array
+ * index by get_region_boxes and must therefore never be negative.
+ */
+int hierarchy_top_prediction(float *predictions, tree *hier, float thresh)
+{
+    float p = 1;
+    int group = 0;
+    int i;
+    while(1){
+        float max = 0;
+        /* Start inside the current group so max_i is always one of its members,
+           even when no score in the group is positive. */
+        int max_i = hier->group_offset[group];
+
+        for(i = 0; i < hier->group_size[group]; ++i){
+            int index = i + hier->group_offset[group];
+            float val = predictions[index];
+            if(val > max){
+                max_i = index;
+                max = val;
+            }
+        }
+        if(p*max > thresh){
+            p = p*max;
+            group = hier->child[max_i];
+            /* child < 0 marks a node with no sub-group: nothing left to descend into. */
+            if(group < 0) return max_i;
+        } else if(group == 0){
+            /* Top-level group failed the threshold: its parent may be negative, so
+               fall back to the best top-level entry rather than a negative index. */
+            return max_i;
+        } else {
+            return hier->parent[hier->group_offset[group]];
+        }
+    }
+    return 0;
+}
+
 tree *read_tree(char *filename)
 {
    tree t = {0};
@ -67,6 +95,9 @@ tree *read_tree(char *filename)
        t.parent = realloc(t.parent, (n+1)*sizeof(int));
        t.parent[n] = parent;

+        t.child = realloc(t.child, (n+1)*sizeof(int));
+        t.child[n] = -1;
+
        t.name = realloc(t.name, (n+1)*sizeof(char *));
        t.name[n] = id;
        if(parent != last_parent){
@ -80,6 +111,9 @@ tree *read_tree(char *filename)
        }
        t.group = realloc(t.group, (n+1)*sizeof(int));
        t.group[n] = groups;
+        if (parent >= 0) {
+            t.child[parent] = groups;
+        }
        ++n;
        ++group_size;
    }
--- a/src/tree.h
+++ b/src/tree.h
@ -5,6 +5,7 @@ typedef struct{
    int *leaf;
    int n;
    int *parent;
+    int *child;
    int *group;
    char **name;

@ -16,6 +17,7 @@ typedef struct{
 tree *read_tree(char *filename);
 void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves);
 void change_leaves(tree *t, char *leaf_list);
+int hierarchy_top_prediction(float *predictions, tree *hier, float thresh);
 float get_hierarchy_probability(float *x, tree *hier, int c);

 #endif
--- a/src/yolo.c
+++ b/src/yolo.c
@ -351,5 +351,5 @@ void run_yolo(int argc, char **argv)
    else if(0==strcmp(argv[2], "train")) train_yolo(cfg, weights);
    else if(0==strcmp(argv[2], "valid")) validate_yolo(cfg, weights);
    else if(0==strcmp(argv[2], "recall")) validate_yolo_recall(cfg, weights);
-    else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, voc_names, 20, frame_skip, prefix);
+    else if(0==strcmp(argv[2], "demo")) demo(cfg, weights, thresh, cam_index, filename, voc_names, 20, frame_skip, prefix, .5);
 }
--- a/src/yolo_kernels.cu
+++ b/src/yolo_kernels.cu
@ -1,132 +0,0 @@
-#include "cuda_runtime.h"
-#include "curand.h"
-#include "cublas_v2.h"
-
-extern "C" {
-#include "network.h"
-#include "detection_layer.h"
-#include "cost_layer.h"
-#include "utils.h"
-#include "parser.h"
-#include "box.h"
-#include "image.h"
-#include <sys/time.h>
-}
-
-#ifdef OPENCV
-#include "opencv2/highgui/highgui.hpp"
-#include "opencv2/imgproc/imgproc.hpp"
-extern "C" image ipl_to_image(IplImage* src);
-extern "C" void convert_yolo_detections(float *predictions, int classes, int num, int square, int side, int w, int h, float thresh, float **probs, box *boxes, int only_objectness);
-
-extern "C" char *voc_names[];
-extern "C" image voc_labels[];
-
-static float **probs;
-static box *boxes;
-static network net;
-static image in   ;
-static image in_s ;
-static image det  ;
-static image det_s;
-static image disp ;
-static cv::VideoCapture cap;
-static float fps = 0;
-static float demo_thresh = 0;
-
-void *fetch_in_thread(void *ptr)
-{
-    cv::Mat frame_m;
-    cap >> frame_m;
-    IplImage frame = frame_m;
-    in = ipl_to_image(&frame);
-    rgbgr_image(in);
-    in_s = resize_image(in, net.w, net.h);
-    return 0;
-}
-
-void *detect_in_thread(void *ptr)
-{
-    float nms = .4;
-
-    detection_layer l = net.layers[net.n-1];
-    float *X = det_s.data;
-    float *predictions = network_predict(net, X);
-    free_image(det_s);
-    convert_yolo_detections(predictions, l.classes, l.n, l.sqrt, l.side, 1, 1, demo_thresh, probs, boxes, 0);
-    if (nms > 0) do_nms(boxes, probs, l.side*l.side*l.n, l.classes, nms);
-    printf("\033[2J");
-    printf("\033[1;1H");
-    printf("\nFPS:%.0f\n",fps);
-    printf("Objects:\n\n");
-    draw_detections(det, l.side*l.side*l.n, demo_thresh, boxes, probs, voc_names, voc_labels, 20);
-    return 0;
-}
-
-extern "C" void demo_yolo(char *cfgfile, char *weightfile, float thresh, int cam_index)
-{
-    demo_thresh = thresh;
-    printf("YOLO demo\n");
-    net = parse_network_cfg(cfgfile);
-    if(weightfile){
-        load_weights(&net, weightfile);
-    }
-    set_batch_network(&net, 1);
-
-    srand(2222222);
-
-    cv::VideoCapture cam(cam_index);
-    cap = cam;
-    if(!cap.isOpened()) error("Couldn't connect to webcam.\n");
-
-    detection_layer l = net.layers[net.n-1];
-    int j;
-
-    boxes = (box *)calloc(l.side*l.side*l.n, sizeof(box));
-    probs = (float **)calloc(l.side*l.side*l.n, sizeof(float *));
-    for(j = 0; j < l.side*l.side*l.n; ++j) probs[j] = (float *)calloc(l.classes, sizeof(float *));
-
-    pthread_t fetch_thread;
-    pthread_t detect_thread;
-
-    fetch_in_thread(0);
-    det = in;
-    det_s = in_s;
-
-    fetch_in_thread(0);
-    detect_in_thread(0);
-    disp = det;
-    det = in;
-    det_s = in_s;
-
-    cvNamedWindow("YOLO", CV_WINDOW_NORMAL); 
-    cvMoveWindow("YOLO", 0, 0);
-    cvResizeWindow("YOLO", 1352, 1013);
-
-    while(1){
-        struct timeval tval_before, tval_after, tval_result;
-        gettimeofday(&tval_before, NULL);
-        if(pthread_create(&fetch_thread, 0, fetch_in_thread, 0)) error("Thread creation failed");
-        if(pthread_create(&detect_thread, 0, detect_in_thread, 0)) error("Thread creation failed");
-        show_image(disp, "YOLO");
-        free_image(disp);
-        cvWaitKey(1);
-        pthread_join(fetch_thread, 0);
-        pthread_join(detect_thread, 0);
-
-        disp  = det;
-        det   = in;
-        det_s = in_s;
-
-        gettimeofday(&tval_after, NULL);
-        timersub(&tval_after, &tval_before, &tval_result);
-        float curr = 1000000.f/((long int)tval_result.tv_usec);
-        fps = .9*fps + .1*curr;
-    }
-}
-#else
-extern "C" void demo_yolo(char *cfgfile, char *weightfile, float thresh, int cam_index){
-    fprintf(stderr, "YOLO demo needs OpenCV for webcam images.\n");
-}
-#endif
-