From 989ab8c38a02fa7ea9c25108151736c62e81c972 Mon Sep 17 00:00:00 2001 From: Joseph Redmon Date: Fri, 24 Apr 2015 10:27:50 -0700 Subject: [PATCH] IOU loss function --- src/darknet.c | 1 + src/data.c | 20 ++-- src/detection.c | 60 +++++------ src/detection_layer.c | 242 ++++++++++++++++++++++++++++++++++++++---- src/detection_layer.h | 2 + src/imagenet.c | 2 +- src/network.c | 3 + src/utils.h | 4 + 8 files changed, 272 insertions(+), 62 deletions(-) diff --git a/src/darknet.c b/src/darknet.c index 46a8c821..411efdf7 100644 --- a/src/darknet.c +++ b/src/darknet.c @@ -93,6 +93,7 @@ void visualize(char *cfgfile, char *weightfile) int main(int argc, char **argv) { + //test_box(); //test_convolutional_layer(); if(argc < 2){ fprintf(stderr, "usage: %s \n", argv[0]); diff --git a/src/data.c b/src/data.c index 2b74386f..f1f5b80b 100644 --- a/src/data.c +++ b/src/data.c @@ -65,22 +65,22 @@ matrix load_image_paths(char **paths, int n, int w, int h) return X; } -typedef struct box{ +typedef struct{ int id; float x,y,w,h; float left, right, top, bottom; -} box; +} box_label; -box *read_boxes(char *filename, int *n) +box_label *read_boxes(char *filename, int *n) { - box *boxes = calloc(1, sizeof(box)); + box_label *boxes = calloc(1, sizeof(box_label)); FILE *file = fopen(filename, "r"); if(!file) file_error(filename); float x, y, h, w; int id; int count = 0; while(fscanf(file, "%d %f %f %f %f", &id, &x, &y, &w, &h) == 5){ - boxes = realloc(boxes, (count+1)*sizeof(box)); + boxes = realloc(boxes, (count+1)*sizeof(box_label)); boxes[count].id = id; boxes[count].x = x; boxes[count].y = y; @@ -97,11 +97,11 @@ box *read_boxes(char *filename, int *n) return boxes; } -void randomize_boxes(box *b, int n) +void randomize_boxes(box_label *b, int n) { int i; for(i = 0; i < n; ++i){ - box swap = b[i]; + box_label swap = b[i]; int index = rand_r(&data_seed)%n; b[i] = b[index]; b[index] = swap; @@ -114,7 +114,7 @@ void fill_truth_detection(char *path, float *truth, int classes, int num_boxes, 
labelpath = find_replace(labelpath, ".jpg", ".txt"); labelpath = find_replace(labelpath, ".JPEG", ".txt"); int count = 0; - box *boxes = read_boxes(labelpath, &count); + box_label *boxes = read_boxes(labelpath, &count); randomize_boxes(boxes, count); float x,y,w,h; float left, top, right, bot; @@ -174,10 +174,10 @@ void fill_truth_detection(char *path, float *truth, int classes, int num_boxes, if(background) truth[index++] = 0; truth[index+id] = 1; index += classes; - truth[index++] = y; truth[index++] = x; - truth[index++] = h; + truth[index++] = y; truth[index++] = w; + truth[index++] = h; } free(boxes); } diff --git a/src/detection.c b/src/detection.c index c61c799c..f61da67d 100644 --- a/src/detection.c +++ b/src/detection.c @@ -81,9 +81,9 @@ void train_detection(char *cfgfile, char *weightfile) if (imgnet){ plist = get_paths("/home/pjreddie/data/imagenet/det.train.list"); }else{ - //plist = get_paths("/home/pjreddie/data/voc/trainall.txt"); + plist = get_paths("/home/pjreddie/data/voc/trainall.txt"); //plist = get_paths("/home/pjreddie/data/coco/trainval.txt"); - plist = get_paths("/home/pjreddie/data/voc/all2007-2012.txt"); + //plist = get_paths("/home/pjreddie/data/voc/all2007-2012.txt"); } paths = (char **)list_to_array(plist); pthread_t load_thread = load_data_detection_thread(imgs, paths, plist->size, classes, net.w, net.h, side, side, background, &buffer); @@ -95,12 +95,12 @@ void train_detection(char *cfgfile, char *weightfile) train = buffer; load_thread = load_data_detection_thread(imgs, paths, plist->size, classes, net.w, net.h, side, side, background, &buffer); -/* - image im = float_to_image(net.w, net.h, 3, train.X.vals[114]); - image copy = copy_image(im); - draw_detection(copy, train.y.vals[114], 7); - free_image(copy); - */ + /* + image im = float_to_image(net.w, net.h, 3, train.X.vals[114]); + image copy = copy_image(im); + draw_detection(copy, train.y.vals[114], 7); + free_image(copy); + */ printf("Loaded: %lf seconds\n", sec(clock()-time)); 
time=clock(); @@ -120,30 +120,30 @@ void train_detection(char *cfgfile, char *weightfile) void predict_detections(network net, data d, float threshold, int offset, int classes, int nuisance, int background, int num_boxes, int per_box) { - matrix pred = network_predict_data(net, d); - int j, k, class; - for(j = 0; j < pred.rows; ++j){ - for(k = 0; k < pred.cols; k += per_box){ - float scale = 1.; - int index = k/per_box; - int row = index / num_boxes; - int col = index % num_boxes; - if (nuisance) scale = 1.-pred.vals[j][k]; - for (class = 0; class < classes; ++class){ - int ci = k+classes+background+nuisance; - float y = (pred.vals[j][ci + 0] + row)/num_boxes; - float x = (pred.vals[j][ci + 1] + col)/num_boxes; - float h = pred.vals[j][ci + 2]; //* distance_from_edge(row, num_boxes); - h = h*h; - float w = pred.vals[j][ci + 3]; //* distance_from_edge(col, num_boxes); - w = w*w; - float prob = scale*pred.vals[j][k+class+background+nuisance]; - if(prob < threshold) continue; - printf("%d %d %f %f %f %f %f\n", offset + j, class, prob, y, x, h, w); - } + matrix pred = network_predict_data(net, d); + int j, k, class; + for(j = 0; j < pred.rows; ++j){ + for(k = 0; k < pred.cols; k += per_box){ + float scale = 1.; + int index = k/per_box; + int row = index / num_boxes; + int col = index % num_boxes; + if (nuisance) scale = 1.-pred.vals[j][k]; + for (class = 0; class < classes; ++class){ + int ci = k+classes+background+nuisance; + float y = (pred.vals[j][ci + 0] + row)/num_boxes; + float x = (pred.vals[j][ci + 1] + col)/num_boxes; + float h = pred.vals[j][ci + 2]; //* distance_from_edge(row, num_boxes); + h = h*h; + float w = pred.vals[j][ci + 3]; //* distance_from_edge(col, num_boxes); + w = w*w; + float prob = scale*pred.vals[j][k+class+background+nuisance]; + if(prob < threshold) continue; + printf("%d %d %f %f %f %f %f\n", offset + j, class, prob, y, x, h, w); } } - free_matrix(pred); + } + free_matrix(pred); } void validate_detection(char *cfgfile, char *weightfile) 
diff --git a/src/detection_layer.c b/src/detection_layer.c
index 73b2862b..7eaabb4f 100644
--- a/src/detection_layer.c
+++ b/src/detection_layer.c
@@ -3,7 +3,9 @@
 #include "softmax_layer.h"
 #include "blas.h"
 #include "cuda.h"
+#include "utils.h"
 #include <stdio.h>
+#include <string.h>
 #include <stdlib.h>
 
 int get_detection_layer_locations(detection_layer layer)
@@ -26,6 +28,8 @@ detection_layer *make_detection_layer(int batch, int inputs, int classes, int co
     layer->coords = coords;
     layer->rescore = rescore;
     layer->nuisance = nuisance;
+    layer->cost = calloc(1, sizeof(float));
+    layer->does_cost=1;
     layer->background = background;
     int outputs = get_detection_layer_output_size(*layer);
     layer->output = calloc(batch*outputs, sizeof(float));
@@ -63,6 +67,169 @@ void dark_zone(detection_layer layer, int class, int start, network_state state)
     }
 }
 
+typedef struct{
+    float dx, dy, dw, dh;
+} dbox;
+
+dbox derivative(box a, box b) /* per-axis d(overlap extent)/d(a.x, a.w) and /d(a.y, a.h) */
+{
+    dbox d;
+    d.dx = 0;
+    d.dw = 0;
+    float l1 = a.x - a.w/2;
+    float l2 = b.x - b.w/2;
+    if (l1 > l2){
+        d.dx -= 1;
+        d.dw += .5;
+    }
+    float r1 = a.x + a.w/2;
+    float r2 = b.x + b.w/2;
+    if(r1 < r2){
+        d.dx += 1;
+        d.dw += .5;
+    }
+    if (l1 > r2) { /* a lies entirely to the right of b: only a direction is reported */
+        d.dx = -1;
+        d.dw = 0;
+    }
+    if (r1 < l2){ /* a lies entirely to the left of b */
+        d.dx = 1;
+        d.dw = 0;
+    }
+
+    d.dy = 0;
+    d.dh = 0;
+    float t1 = a.y - a.h/2;
+    float t2 = b.y - b.h/2;
+    if (t1 > t2){
+        d.dy -= 1;
+        d.dh += .5;
+    }
+    float b1 = a.y + a.h/2;
+    float b2 = b.y + b.h/2;
+    if(b1 < b2){
+        d.dy += 1;
+        d.dh += .5;
+    }
+    if (t1 > b2) {
+        d.dy = -1;
+        d.dh = 0;
+    }
+    if (b1 < t2){
+        d.dy = 1;
+        d.dh = 0;
+    }
+    return d;
+}
+
+float overlap(float x1, float w1, float x2, float w2) /* signed 1-D overlap of two centered spans; negative means a gap */
+{
+    float l1 = x1 - w1/2;
+    float l2 = x2 - w2/2;
+    float left = l1 > l2 ? l1 : l2;
+    float r1 = x1 + w1/2;
+    float r2 = x2 + w2/2;
+    float right = r1 < r2 ? r1 : r2;
+    return right - left;
+}
+
+float box_intersection(box a, box b) /* overlap area; 0 when either axis has a negative overlap */
+{
+    float w = overlap(a.x, a.w, b.x, b.w);
+    float h = overlap(a.y, a.h, b.y, b.h);
+    if(w < 0 || h < 0) return 0;
+    float area = w*h;
+    return area;
+}
+
+float box_union(box a, box b) /* a.w*a.h + b.w*b.h minus the intersection */
+{
+    float i = box_intersection(a, b);
+    float u = a.w*a.h + b.w*b.h - i;
+    return u;
+}
+
+float box_iou(box a, box b) /* intersection over union; divides by box_union() without checking for 0 */
+{
+    return box_intersection(a, b)/box_union(a, b);
+}
+
+dbox dintersect(box a, box b) /* gradient of the intersection area with respect to box a */
+{
+    float w = overlap(a.x, a.w, b.x, b.w);
+    float h = overlap(a.y, a.h, b.y, b.h);
+    dbox dover = derivative(a, b);
+    dbox di;
+
+    di.dw = dover.dw*h;
+    di.dx = dover.dx*h;
+    di.dh = dover.dh*w;
+    di.dy = dover.dy*w;
+    if(h < 0 || w < 0){ /* disjoint boxes: keep only the direction from derivative() */
+        di.dx = dover.dx;
+        di.dy = dover.dy;
+    }
+    return di;
+}
+
+dbox dunion(box a, box b) /* gradient of the union area with respect to box a; all zero unless the boxes overlap */
+{
+    dbox du = {0,0,0,0};
+    float w = overlap(a.x, a.w, b.x, b.w);
+    float h = overlap(a.y, a.h, b.y, b.h);
+    if(w > 0 && h > 0){
+        dbox di = dintersect(a, b);
+        du.dw = a.h - di.dw; /* d(a.w*a.h)/d(a.w) is a.h, minus the intersection's share */
+        du.dh = a.w - di.dh; /* d(a.w*a.h)/d(a.h) is a.w, minus the intersection's share */
+        du.dx = -di.dx;
+        du.dy = -di.dy;
+    }
+    return du;
+}
+
+dbox diou(box a, box b) /* update direction for box a that raises its IOU with box b */
+{
+    float u = box_union(a,b);
+    float i = box_intersection(a,b);
+    dbox di = dintersect(a,b);
+    dbox du = dunion(a,b);
+    dbox dd = {0,0,0,0};
+    if(i <= 0) { /* no overlap (intersection is clamped to 0, and u may be 0): pull a straight toward b */
+        dd.dx = b.x - a.x;
+        dd.dy = b.y - a.y;
+        dd.dw = b.w - a.w;
+        dd.dh = b.h - a.h;
+        return dd;
+    }
+    dd.dx = 2*pow((1-(i/u)),1)*(di.dx*u - du.dx*i)/(u*u);
+    dd.dy = 2*pow((1-(i/u)),1)*(di.dy*u - du.dy*i)/(u*u);
+    dd.dw = 2*pow((1-(i/u)),1)*(di.dw*u - du.dw*i)/(u*u);
+    dd.dh = 2*pow((1-(i/u)),1)*(di.dh*u - du.dh*i)/(u*u);
+    return dd;
+}
+
+void test_box() /* manual check: steps box a along diou() toward b and prints the progress */
+{
+    box a = {1, 1, 1, 1};
+    box b = {0, 0, .5, .2};
+    int count = 0;
+    while(count++ < 300){
+        dbox d = diou(a, b);
+        printf("%f %f %f %f\n", a.x, a.y, a.w, a.h);
+        a.x += .1*d.dx;
+        a.w += .1*d.dw;
+        a.y += .1*d.dy;
+        a.h += .1*d.dh;
+        printf("inter: %f\n", box_intersection(a, b));
+        printf("union: %f\n", box_union(a, b));
+        printf("IOU: %f\n", box_iou(a, b));
+        if(d.dx==0 && d.dw==0 && d.dy==0 && d.dh==0) {
+            printf("break!!!\n");
+            break;
+        }
+    }
+}
+
 void forward_detection_layer(const detection_layer layer, network_state state)
 {
     int in_i = 0;
@@ -92,31 +259,63 @@ void forward_detection_layer(const detection_layer layer, network_state state)
             layer.output[out_i++] = mask*state.input[in_i++];
         }
     }
-    /*
-    int count = 0;
-    for(i = 0; i < layer.batch*locations; ++i){
-        for(j = 0; j < layer.classes+layer.background; ++j){
-            printf("%f, ", layer.output[count++]);
-        }
-        printf("\n");
-        for(j = 0; j < layer.coords; ++j){
-            printf("%f, ", layer.output[count++]);
-        }
-        printf("\n");
-    }
-    */
-    /*
-    if(layer.background || 1){
+    if(layer.does_cost && state.truth){ /* truth may be NULL (the GPU wrapper checks it before free), so skip the cost then */
+        *(layer.cost) = 0;
+        int size = get_detection_layer_output_size(layer) * layer.batch;
+        memset(layer.delta, 0, size * sizeof(float));
         for(i = 0; i < layer.batch*locations; ++i){
-            int index = i*(layer.classes+layer.coords+layer.background);
-            for(j= 0; j < layer.classes; ++j){
-                if(state.truth[index+j+layer.background]){
-                    //dark_zone(layer, j, index, state);
-                }
+            int classes = layer.nuisance+layer.classes; /* NOTE(review): layer.background is not counted in this stride - confirm against the truth layout */
+            int offset = i*(classes+layer.coords);
+            for(j = offset; j < offset+classes; ++j){
+                *(layer.cost) += pow(state.truth[j] - layer.output[j], 2);
+                layer.delta[j] = state.truth[j] - layer.output[j];
             }
+            box truth; /* j now indexes the first coordinate: x, y, w, h */
+            truth.x = state.truth[j+0];
+            truth.y = state.truth[j+1];
+            truth.w = state.truth[j+2];
+            truth.h = state.truth[j+3];
+            box out;
+            out.x = layer.output[j+0];
+            out.y = layer.output[j+1];
+            out.w = layer.output[j+2];
+            out.h = layer.output[j+3];
+            if(!(truth.w*truth.h)) continue; /* zero-area truth box: no box cost or delta for this location */
+            float iou = box_iou(truth, out);
+            //printf("iou: %f\n", iou);
+            *(layer.cost) += pow((1-iou), 2);
+            dbox d = diou(out, truth);
+            layer.delta[j+0] = d.dx;
+            layer.delta[j+1] = d.dy;
+            layer.delta[j+2] = d.dw;
+            layer.delta[j+3] = d.dh;
         }
     }
-    */
+    /*
+       int count = 0;
+       for(i = 0; i < layer.batch*locations; ++i){
+       for(j = 0; j < layer.classes+layer.background; ++j){
+       printf("%f, ", layer.output[count++]);
+       }
+       printf("\n");
+       for(j = 0; j < layer.coords; ++j){
+       printf("%f, ", layer.output[count++]);
+       }
+       printf("\n");
+       }
+     */
+    /*
+       if(layer.background || 1){
+       for(i = 0; i < layer.batch*locations; ++i){
+       int index = i*(layer.classes+layer.coords+layer.background);
+       for(j= 0; j < layer.classes; ++j){
+       if(state.truth[index+j+layer.background]){
+//dark_zone(layer, j, index, state);
+}
+}
+}
+}
+     */
 }
 
 void backward_detection_layer(const detection_layer layer, network_state state)
@@ -164,6 +363,7 @@ void forward_detection_layer_gpu(const detection_layer layer, network_state stat
     cpu_state.input = in_cpu;
     forward_detection_layer(layer, cpu_state);
     cuda_push_array(layer.output_gpu, layer.output, layer.batch*outputs);
+    cuda_push_array(layer.delta_gpu, layer.delta, layer.batch*outputs);
     free(cpu_state.input);
     if(cpu_state.truth) free(cpu_state.truth);
 }
diff --git a/src/detection_layer.h b/src/detection_layer.h
index a56cb25c..0aa5f665 100644
--- a/src/detection_layer.h
+++ b/src/detection_layer.h
@@ -11,6 +11,8 @@ typedef struct {
     int background;
     int rescore;
     int nuisance;
+    int does_cost;
+    float *cost;
     float *output;
     float *delta;
     #ifdef GPU
diff --git a/src/imagenet.c b/src/imagenet.c
index 906dbd48..3f88b368 100644
--- a/src/imagenet.c
+++ b/src/imagenet.c
@@ -47,7 +47,7 @@ void train_imagenet(char *cfgfile, char *weightfile)
         printf("%d: %f, %f avg, %lf seconds, %d images\n", i, loss, avg_loss, sec(clock()-time), net.seen);
         free_data(train);
         //if(i%100 == 0 && net.learning_rate > .00001) net.learning_rate *= .97;
-        if(i%100==0){
+        if(i%1000==0){
             char buff[256];
             sprintf(buff, "/home/pjreddie/imagenet_backup/%s_%d.weights",base, i);
             save_weights(net, buff);
diff --git a/src/network.c b/src/network.c
index 55710763..3247a315 100644
--- a/src/network.c
+++ b/src/network.c
@@ -186,6 +186,9 @@ float get_network_cost(network net)
     if(net.types[net.n-1] == COST){
         return ((cost_layer *)net.layers[net.n-1])->output[0];
     }
+    if(net.types[net.n-1] == DETECTION){
+        return ((detection_layer *)net.layers[net.n-1])->cost[0];
+    }
     return 0;
 }
 
diff --git a/src/utils.h b/src/utils.h
index 578abc37..0db16de1 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -36,5 +36,9 @@ float variance_array(float *a, int n);
 float mag_array(float *a, int n);
 float **one_hot_encode(float *a, int n, int k);
 float sec(clock_t clocks);
+
+typedef struct{
+    float x, y, w, h;
+} box;
 #endif
 