diff --git a/Makefile b/Makefile index cdf200c5..22e89a10 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ -GPU=0 -OPENCV=0 +GPU=1 +OPENCV=1 DEBUG=0 ARCH= --gpu-architecture=compute_20 --gpu-code=compute_20 diff --git a/cfg/yolo.cfg b/cfg/yolo.cfg index ab46729c..140de88a 100644 --- a/cfg/yolo.cfg +++ b/cfg/yolo.cfg @@ -1,17 +1,17 @@ [net] batch=64 -subdivisions=64 +subdivisions=4 height=448 width=448 channels=3 -learning_rate=0.001 +learning_rate=0.01 momentum=0.9 decay=0.0005 policy=steps -steps=50, 5000 -scales=10, .1 -max_batches = 8000 +steps=20000 +scales=.1 +max_batches = 35000 [crop] crop_width=448 diff --git a/src/layer.h b/src/layer.h index d13cdbfa..808aba49 100644 --- a/src/layer.h +++ b/src/layer.h @@ -28,6 +28,7 @@ typedef struct { ACTIVATION activation; COST_TYPE cost_type; int batch; + int forced; int inputs; int outputs; int truths; diff --git a/src/parser.c b/src/parser.c index 7ea1b3ff..6daeb137 100644 --- a/src/parser.c +++ b/src/parser.c @@ -187,6 +187,7 @@ region_layer parse_region(list *options, size_params params) layer.sqrt = option_find_int(options, "sqrt", 0); layer.coord_scale = option_find_float(options, "coord_scale", 1); + layer.forced = option_find_int(options, "forced", 0); layer.object_scale = option_find_float(options, "object_scale", 1); layer.noobject_scale = option_find_float(options, "noobject_scale", 1); layer.class_scale = option_find_float(options, "class_scale", 1); diff --git a/src/region_layer.c b/src/region_layer.c index 39af5ee8..4d8c2a45 100644 --- a/src/region_layer.c +++ b/src/region_layer.c @@ -82,9 +82,12 @@ void forward_region_layer(const region_layer l, network_state state) int best_index = -1; float best_iou = 0; - float best_rmse = 4; + float best_rmse = 20; - if (!is_obj) continue; + if (!is_obj){ + //printf("."); + continue; + } int class_index = index + i*l.classes; for(j = 0; j < l.classes; ++j) { @@ -123,18 +126,38 @@ void forward_region_layer(const region_layer l, network_state state) } } } - int p_index = index + locations*l.classes + i*l.n + best_index; - *(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2); - *(l.cost) += l.object_scale * pow(1-l.output[p_index], 2); - avg_obj += l.output[p_index]; - l.delta[p_index+0] = l.object_scale * (1.-l.output[p_index]); - if(l.rescore){ - l.delta[p_index+0] = l.object_scale * (best_iou - l.output[p_index]); + if(l.forced){ + if(truth.w*truth.h < .1){ + best_index = 1; + }else{ + best_index = 0; + } } int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords; int tbox_index = truth_index + 1 + l.classes; + + box out = float_to_box(l.output + box_index); + out.x /= l.side; + out.y /= l.side; + if (l.sqrt) { + out.w = out.w*out.w; + out.h = out.h*out.h; + } + float iou = box_iou(out, truth); + + //printf("%d", best_index); + int p_index = index + locations*l.classes + i*l.n + best_index; + *(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2); + *(l.cost) += l.object_scale * pow(1-l.output[p_index], 2); + avg_obj += l.output[p_index]; + l.delta[p_index] = l.object_scale * (1.-l.output[p_index]); + + if(l.rescore){ + l.delta[p_index] = l.object_scale * (iou - l.output[p_index]); + } + l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]); l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]); l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]); @@ -144,14 +167,15 @@ void forward_region_layer(const region_layer l, network_state state) l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]); } - *(l.cost) += pow(1-best_iou, 2); - avg_iou += best_iou; + *(l.cost) += pow(1-iou, 2); + avg_iou += iou; ++count; } if(l.softmax){ gradient_array(l.output + index + locations*l.classes, locations*l.n*(1+l.coords), LOGISTIC, l.delta + index + locations*l.classes); } + //printf("\n"); } printf("Region Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n", avg_iou/count, avg_cat/count, avg_allcat/(count*l.classes), avg_obj/count, avg_anyobj/(l.batch*locations*l.n), count); } diff --git a/src/swag.c b/src/swag.c index 7058df56..ec58f0d5 100644 --- a/src/swag.c +++ b/src/swag.c @@ -1,4 +1,5 @@ #include "network.h" +#include "region_layer.h" #include "detection_layer.h" #include "cost_layer.h" #include "utils.h" @@ -11,40 +12,37 @@ char *voc_names[] = {"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"}; -void draw_swag(image im, float *box, int side, int objectness, char *label, float thresh) +void draw_swag(image im, float *predictions, int side, int num, char *label, float thresh) { int classes = 20; - int elems = 4+classes+objectness; - int j; - int r, c; + int i,n; - for(r = 0; r < side; ++r){ - for(c = 0; c < side; ++c){ - j = (r*side + c) * elems; - float scale = 1; - if(objectness) scale = 1 - box[j++]; - int class = max_index(box+j, classes); - if(scale * box[j+class] > thresh){ - int width = sqrt(scale*box[j+class])*5 + 1; - printf("%f %s\n", scale * box[j+class], voc_names[class]); + for(i = 0; i < side*side; ++i){ + int row = i / side; + int col = i % side; + for(n = 0; n < num; ++n){ + int p_index = side*side*classes + i*num + n; + int box_index = side*side*(classes + num) + (i*num + n)*4; + int class_index = i*classes; + float scale = predictions[p_index]; + int class = max_index(predictions+class_index, classes); + float prob = scale * predictions[class_index + class]; + if(prob > thresh){ + int width = sqrt(prob)*5 + 1; + printf("%f %s\n", prob, voc_names[class]); float red = get_color(0,class,classes); float green = get_color(1,class,classes); float blue = get_color(2,class,classes); + box b = float_to_box(predictions+box_index); + b.x = (b.x + col)/side; + b.y = (b.y + row)/side; + b.w = b.w*b.w; + b.h = b.h*b.h; - j += classes; - float x = box[j+0]; - float y = box[j+1]; - x = (x+c)/side; - y = (y+r)/side; - float w = box[j+2]; //*maxwidth; - float h = box[j+3]; //*maxheight; - h = h*h; - w = w*w; - - int left = (x-w/2)*im.w; - int right = (x+w/2)*im.w; - int top = (y-h/2)*im.h; - int bot = (y+h/2)*im.h; + int left = (b.x-b.w/2)*im.w; + int right = (b.x+b.w/2)*im.w; + int top = (b.y-b.h/2)*im.h; + int bot = (b.y+b.h/2)*im.h; draw_box_width(im, left, top, right, bot, width, red, green, blue); } } @@ -103,13 +101,13 @@ void train_swag(char *cfgfile, char *weightfile) printf("Loaded: %lf seconds\n", sec(clock()-time)); -/* - image im = float_to_image(net.w, net.h, 3, train.X.vals[113]); - image copy = copy_image(im); - draw_swag(copy, train.y.vals[113], 7, "truth"); - cvWaitKey(0); - free_image(copy); - */ + /* + image im = float_to_image(net.w, net.h, 3, train.X.vals[113]); + image copy = copy_image(im); + draw_swag(copy, train.y.vals[113], 7, "truth"); + cvWaitKey(0); + free_image(copy); + */ time=clock(); float loss = train_network(net, train); @@ -270,7 +268,7 @@ void test_swag(char *cfgfile, char *weightfile, char *filename, float thresh) if(weightfile){ load_weights(&net, weightfile); } - detection_layer layer = get_network_detection_layer(net); + region_layer layer = net.layers[net.n-1]; set_batch_network(&net, 1); srand(2222222); clock_t time; @@ -292,7 +290,8 @@ void test_swag(char *cfgfile, char *weightfile, char *filename, float thresh) time=clock(); float *predictions = network_predict(net, X); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); - draw_swag(im, predictions, 7, layer.objectness, "predictions", thresh); + draw_swag(im, predictions, layer.side, layer.n, "predictions", thresh); + show_image(sized, "resized"); free_image(im); free_image(sized); #ifdef OPENCV diff --git a/src/yolo.c b/src/yolo.c index b2c89d81..4b241f32 100644 --- a/src/yolo.c +++ b/src/yolo.c @@ -65,7 +65,6 @@ void train_yolo(char *cfgfile, char *weightfile) if(weightfile){ load_weights(&net, weightfile); } - detection_layer layer = get_network_detection_layer(net); int imgs = 128; int i = *net.seen/imgs; @@ -74,11 +73,16 @@ void train_yolo(char *cfgfile, char *weightfile) int N = plist->size; paths = (char **)list_to_array(plist); + if(i*imgs > N*80){ + net.layers[net.n-1].objectness = 0; + net.layers[net.n-1].joint = 1; + } if(i*imgs > N*120){ net.layers[net.n-1].rescore = 1; } data train, buffer; + detection_layer layer = get_network_detection_layer(net); int classes = layer.classes; int background = layer.objectness; int side = sqrt(get_detection_layer_locations(layer));