From d6fbe86e7a8c1bc389902c90c57ee7e80f5475b9 Mon Sep 17 00:00:00 2001 From: Joseph Redmon Date: Tue, 16 Dec 2014 11:40:05 -0800 Subject: [PATCH] updates? --- src/cnn.c | 71 ++++++++++++++++++++-------------------- src/crop_layer.c | 82 +++++++++++++++++++++++++++++++++++------------ src/crop_layer.cl | 16 +++++++++ src/crop_layer.h | 10 ++++-- src/network.c | 12 +++++++ src/network_gpu.c | 11 ++++++- 6 files changed, 144 insertions(+), 58 deletions(-) create mode 100644 src/crop_layer.cl diff --git a/src/cnn.c b/src/cnn.c index 43676c1b..8c56bda5 100644 --- a/src/cnn.c +++ b/src/cnn.c @@ -429,15 +429,16 @@ void train_imagenet_distributed(char *address) } } -void train_imagenet() +void train_imagenet(char *cfgfile) { float avg_loss = 1; //network net = parse_network_cfg("/home/pjreddie/imagenet_backup/alexnet_1270.cfg"); srand(time(0)); - network net = parse_network_cfg("cfg/net.part"); + network net = parse_network_cfg(cfgfile); + set_learning_network(&net, .000001, .9, .0005); printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); int imgs = 1000/net.batch+1; - int i = 9540; + int i = 20590; char **labels = get_labels("/home/pjreddie/data/imagenet/cls.labels.list"); list *plist = get_paths("/data/imagenet/cls.train.list"); char **paths = (char **)list_to_array(plist); @@ -446,14 +447,14 @@ void train_imagenet() pthread_t load_thread; data train; data buffer; - load_thread = load_data_thread(paths, imgs*net.batch, plist->size, labels, 1000, 224, 224, &buffer); + load_thread = load_data_thread(paths, imgs*net.batch, plist->size, labels, 1000, 256, 256, &buffer); while(1){ i += 1; time=clock(); pthread_join(load_thread, 0); train = buffer; normalize_data_rows(train); - load_thread = load_data_thread(paths, imgs*net.batch, plist->size, labels, 1000, 224, 224, &buffer); + load_thread = load_data_thread(paths, imgs*net.batch, plist->size, labels, 1000, 256, 256, &buffer); printf("Loaded: %lf seconds\n", sec(clock()-time)); time=clock(); #ifdef GPU @@ -490,7 +491,7 @@ void validate_imagenet(char *filename) int num = (i+1)*m/splits - i*m/splits; data val, buffer; - pthread_t load_thread = load_data_thread(paths, num, 0, labels, 1000, 224, 224, &buffer); + pthread_t load_thread = load_data_thread(paths, num, 0, labels, 1000, 256, 256, &buffer); for(i = 1; i <= splits; ++i){ time=clock(); @@ -500,7 +501,7 @@ void validate_imagenet(char *filename) num = (i+1)*m/splits - i*m/splits; char **part = paths+(i*m/splits); - if(i != splits) load_thread = load_data_thread(part, num, 0, labels, 1000, 224, 224, &buffer); + if(i != splits) load_thread = load_data_thread(part, num, 0, labels, 1000, 256, 256, &buffer); printf("Loaded: %d images in %lf seconds\n", val.X.rows, sec(clock()-time)); time=clock(); @@ -514,9 +515,10 @@ void validate_imagenet(char *filename) } } -void test_detection() +void test_detection(char *cfgfile) { - network net = parse_network_cfg("cfg/detnet.test"); + network net = parse_network_cfg(cfgfile); + set_batch_network(&net, 1); srand(2222222); clock_t time; char filename[256]; @@ -618,14 +620,14 @@ void test_cifar10() void train_cifar10() { srand(555555); - network net = parse_network_cfg("cfg/cifar10.cfg"); + network net = parse_network_cfg("cfg/cifar_ramp.part"); data test = load_cifar10_data("data/cifar10/test_batch.bin"); int count = 0; int iters = 10000/net.batch; data train = load_all_cifar10(); while(++count <= 10000){ clock_t start = clock(), end; - float loss = train_network_sgd(net, train, iters); + float loss = train_network_sgd_gpu(net, train, iters); end = clock(); //visualize_network(net); //cvWaitKey(5000); @@ -633,10 +635,10 @@ void train_cifar10() //float test_acc = network_accuracy(net, test); //printf("%d: Loss: %f, Test Acc: %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", count, loss, test_acc,(float)(end-start)/CLOCKS_PER_SEC, net.learning_rate, net.momentum, net.decay); if(count%10 == 0){ - float test_acc = network_accuracy(net, test); + float test_acc = network_accuracy_gpu(net, test); printf("%d: Loss: %f, Test Acc: %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", count, loss, test_acc,(float)(end-start)/CLOCKS_PER_SEC, net.learning_rate, net.momentum, net.decay); char buff[256]; - sprintf(buff, "/home/pjreddie/cifar/cifar10_2_%d.cfg", count); + sprintf(buff, "/home/pjreddie/cifar/cifar10_%d.cfg", count); save_network(net, buff); }else{ printf("%d: Loss: %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", count, loss, (float)(end-start)/CLOCKS_PER_SEC, net.learning_rate, net.momentum, net.decay); @@ -899,31 +901,16 @@ void test_correct_alexnet() printf("%d\n", plist->size); clock_t time; int count = 0; - - srand(222222); - network net = parse_network_cfg("cfg/net.cfg"); - printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); + network net; int imgs = 1000/net.batch+1; imgs = 1; - - while(++count <= 5){ - time=clock(); - data train = load_data(paths, imgs*net.batch, plist->size, labels, 1000, 224,224); - //translate_data_rows(train, -144); - normalize_data_rows(train); - printf("Loaded: %lf seconds\n", sec(clock()-time)); - time=clock(); - float loss = train_network_data_cpu(net, train, imgs); - printf("%d: %f, %lf seconds, %d images\n", count, loss, sec(clock()-time), imgs*net.batch); - free_data(train); - } #ifdef GPU count = 0; srand(222222); net = parse_network_cfg("cfg/net.cfg"); while(++count <= 5){ time=clock(); - data train = load_data(paths, imgs*net.batch, plist->size, labels, 1000, 224, 224); + data train = load_data(paths, imgs*net.batch, plist->size, labels, 1000, 256, 256); //translate_data_rows(train, -144); normalize_data_rows(train); printf("Loaded: %lf seconds\n", sec(clock()-time)); @@ -933,6 +920,21 @@ void test_correct_alexnet() free_data(train); } #endif + count = 0; + srand(222222); + net = parse_network_cfg("cfg/net.cfg"); + printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); + while(++count <= 5){ + time=clock(); + data train = load_data(paths, imgs*net.batch, plist->size, labels, 1000, 256,256); + //translate_data_rows(train, -144); + normalize_data_rows(train); + printf("Loaded: %lf seconds\n", sec(clock()-time)); + time=clock(); + float loss = train_network_data_cpu(net, train, imgs); + printf("%d: %f, %lf seconds, %d images\n", count, loss, sec(clock()-time), imgs*net.batch); + free_data(train); + } } void run_server() @@ -972,22 +974,23 @@ int main(int argc, char *argv[]) #ifdef GPU cl_setup(index); #endif - if(0==strcmp(argv[1], "train")) train_imagenet(); - else if(0==strcmp(argv[1], "detection")) train_detection_net(); + if(0==strcmp(argv[1], "detection")) train_detection_net(); else if(0==strcmp(argv[1], "asirra")) train_asirra(); else if(0==strcmp(argv[1], "nist")) train_nist(); + else if(0==strcmp(argv[1], "cifar")) train_cifar10(); else if(0==strcmp(argv[1], "test_correct")) test_correct_alexnet(); else if(0==strcmp(argv[1], "test")) test_imagenet(); else if(0==strcmp(argv[1], "server")) run_server(); - else if(0==strcmp(argv[1], "detect")) test_detection(); #ifdef GPU else if(0==strcmp(argv[1], "test_gpu")) test_gpu_blas(); #endif else if(argc < 3){ - fprintf(stderr, "usage: %s \n", argv[0]); + fprintf(stderr, "usage: %s \n", argv[0]); return 0; } + else if(0==strcmp(argv[1], "train")) train_imagenet(argv[2]); else if(0==strcmp(argv[1], "client")) train_imagenet_distributed(argv[2]); + else if(0==strcmp(argv[1], "detect")) test_detection(argv[2]); else if(0==strcmp(argv[1], "init")) test_init(argv[2]); else if(0==strcmp(argv[1], "visualize")) test_visualize(argv[2]); else if(0==strcmp(argv[1], "valid")) validate_imagenet(argv[2]); diff --git a/src/crop_layer.c b/src/crop_layer.c index 58e1b55c..2a5007ad 100644 --- a/src/crop_layer.c +++ b/src/crop_layer.c @@ -21,37 +21,77 @@ crop_layer *make_crop_layer(int batch, int h, int w, int c, int crop_height, int layer->crop_width = crop_width; layer->crop_height = crop_height; layer->output = calloc(crop_width*crop_height * c*batch, sizeof(float)); - layer->delta = calloc(crop_width*crop_height * c*batch, sizeof(float)); + #ifdef GPU + layer->output_cl = cl_make_array(layer->output, crop_width*crop_height*c*batch); + #endif return layer; } + void forward_crop_layer(const crop_layer layer, float *input) { - int i,j,c,b; + int i,j,c,b,row,col; + int index; + int count = 0; + int flip = (layer.flip && rand()%2); int dh = rand()%(layer.h - layer.crop_height); int dw = rand()%(layer.w - layer.crop_width); - int count = 0; - if(layer.flip && rand()%2){ - for(b = 0; b < layer.batch; ++b){ - for(c = 0; c < layer.c; ++c){ - for(i = dh; i < dh+layer.crop_height; ++i){ - for(j = dw+layer.crop_width-1; j >= dw; --j){ - int index = j+layer.w*(i+layer.h*(c + layer.c*b)); - layer.output[count++] = input[index]; - } - } - } - } - }else{ - for(b = 0; b < layer.batch; ++b){ - for(c = 0; c < layer.c; ++c){ - for(i = dh; i < dh+layer.crop_height; ++i){ - for(j = dw; j < dw+layer.crop_width; ++j){ - int index = j+layer.w*(i+layer.h*(c + layer.c*b)); - layer.output[count++] = input[index]; + for(b = 0; b < layer.batch; ++b){ + for(c = 0; c < layer.c; ++c){ + for(i = 0; i < layer.crop_height; ++i){ + for(j = 0; j < layer.crop_width; ++j){ + if(flip){ + col = layer.w - dw - j - 1; + }else{ + col = j + dw; } + row = i + dh; + index = col+layer.w*(row+layer.h*(c + layer.c*b)); + layer.output[count++] = input[index]; } } } } } +#ifdef GPU +cl_kernel get_crop_kernel() +{ + static int init = 0; + static cl_kernel kernel; + if(!init){ + kernel = get_kernel("src/crop_layer.cl", "forward", 0); + init = 1; + } + return kernel; +} + +void forward_crop_layer_gpu(crop_layer layer, cl_mem input) +{ + int flip = (layer.flip && rand()%2); + int dh = rand()%(layer.h - layer.crop_height); + int dw = rand()%(layer.w - layer.crop_width); + int size = layer.batch*layer.c*layer.crop_width*layer.crop_height; + + cl_kernel kernel = get_crop_kernel(); + cl_command_queue queue = cl.queue; + + cl_uint i = 0; + cl.error = clSetKernelArg(kernel, i++, sizeof(input), (void*) &input); + cl.error = clSetKernelArg(kernel, i++, sizeof(layer.c), (void*) &layer.c); + cl.error = clSetKernelArg(kernel, i++, sizeof(layer.h), (void*) &layer.h); + cl.error = clSetKernelArg(kernel, i++, sizeof(layer.w), (void*) &layer.w); + cl.error = clSetKernelArg(kernel, i++, sizeof(layer.crop_height), (void*) &layer.crop_height); + cl.error = clSetKernelArg(kernel, i++, sizeof(layer.crop_width), (void*) &layer.crop_width); + cl.error = clSetKernelArg(kernel, i++, sizeof(dh), (void*) &dh); + cl.error = clSetKernelArg(kernel, i++, sizeof(dw), (void*) &dw); + cl.error = clSetKernelArg(kernel, i++, sizeof(flip), (void*) &flip); + cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl); + check_error(cl); + + const size_t global_size[] = {size}; + + cl.error = clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0); + check_error(cl); +} + +#endif diff --git a/src/crop_layer.cl b/src/crop_layer.cl new file mode 100644 index 00000000..a61b733d --- /dev/null +++ b/src/crop_layer.cl @@ -0,0 +1,16 @@ +__kernel void forward(__global float *input, int c, int h, int w, int crop_height, int crop_width, int dh, int dw, int flip, __global float *output) +{ + int id = get_global_id(0); + int count = id; + int j = id % crop_width; + id /= crop_width; + int i = id % crop_height; + id /= crop_height; + int k = id % c; + id /= c; + int b = id; + int col = (flip) ? w - dw - j - 1 : j + dw; + int row = i + dh; + int index = col+w*(row+h*(k + c*b)); + output[count] = input[index]; +} diff --git a/src/crop_layer.h b/src/crop_layer.h index a0cd9392..508487a2 100644 --- a/src/crop_layer.h +++ b/src/crop_layer.h @@ -1,6 +1,7 @@ #ifndef CROP_LAYER_H #define CROP_LAYER_H +#include "opencl.h" #include "image.h" typedef struct { @@ -9,14 +10,19 @@ typedef struct { int crop_width; int crop_height; int flip; - float *delta; float *output; +#ifdef GPU + cl_mem output_cl; +#endif } crop_layer; image get_crop_image(crop_layer layer); crop_layer *make_crop_layer(int batch, int h, int w, int c, int crop_height, int crop_width, int flip); void forward_crop_layer(const crop_layer layer, float *input); -void backward_crop_layer(const crop_layer layer, float *input, float *delta); + +#ifdef GPU +void forward_crop_layer_gpu(crop_layer layer, cl_mem input); +#endif #endif diff --git a/src/network.c b/src/network.c index 64a60321..f451fd90 100644 --- a/src/network.c +++ b/src/network.c @@ -125,6 +125,9 @@ float *get_network_output_layer(network net, int i) } else if(net.types[i] == CONNECTED){ connected_layer layer = *(connected_layer *)net.layers[i]; return layer.output; + } else if(net.types[i] == CROP){ + crop_layer layer = *(crop_layer *)net.layers[i]; + return layer.output; } else if(net.types[i] == NORMALIZATION){ normalization_layer layer = *(normalization_layer *)net.layers[i]; return layer.output; @@ -402,6 +405,9 @@ int get_network_input_size_layer(network net, int i) } else if(net.types[i] == DROPOUT){ dropout_layer layer = *(dropout_layer *) net.layers[i]; return layer.inputs; + } else if(net.types[i] == CROP){ + crop_layer layer = *(crop_layer *) net.layers[i]; + return layer.c*layer.h*layer.w; } else if(net.types[i] == FREEWEIGHT){ freeweight_layer layer = *(freeweight_layer *) net.layers[i]; @@ -411,6 +417,7 @@ int get_network_input_size_layer(network net, int i) softmax_layer layer = *(softmax_layer *)net.layers[i]; return layer.inputs; } + printf("Can't find input size\n"); return 0; } @@ -425,6 +432,10 @@ int get_network_output_size_layer(network net, int i) maxpool_layer layer = *(maxpool_layer *)net.layers[i]; image output = get_maxpool_image(layer); return output.h*output.w*output.c; + } + else if(net.types[i] == CROP){ + crop_layer layer = *(crop_layer *) net.layers[i]; + return layer.c*layer.crop_height*layer.crop_width; } else if(net.types[i] == CONNECTED){ connected_layer layer = *(connected_layer *)net.layers[i]; @@ -442,6 +453,7 @@ int get_network_output_size_layer(network net, int i) softmax_layer layer = *(softmax_layer *)net.layers[i]; return layer.inputs; } + printf("Can't find output size\n"); return 0; } diff --git a/src/network_gpu.c b/src/network_gpu.c index d09aa714..c3f22d36 100644 --- a/src/network_gpu.c +++ b/src/network_gpu.c @@ -55,6 +55,11 @@ void forward_network_gpu(network net, cl_mem input, cl_mem truth, int train) dropout_layer layer = *(dropout_layer *)net.layers[i]; forward_dropout_layer_gpu(layer, input); } + else if(net.types[i] == CROP){ + crop_layer layer = *(crop_layer *)net.layers[i]; + forward_crop_layer_gpu(layer, input); + input = layer.output_cl; + } //printf("%d %f\n", i, sec(clock()-time)); /* else if(net.types[i] == CROP){ @@ -142,6 +147,10 @@ cl_mem get_network_output_cl_layer(network net, int i) maxpool_layer layer = *(maxpool_layer *)net.layers[i]; return layer.output_cl; } + else if(net.types[i] == CROP){ + crop_layer layer = *(crop_layer *)net.layers[i]; + return layer.output_cl; + } else if(net.types[i] == SOFTMAX){ softmax_layer layer = *(softmax_layer *)net.layers[i]; return layer.output_cl; @@ -260,7 +269,7 @@ float *get_network_output_gpu(network net) float *network_predict_gpu(network net, float *input) { - + int size = get_network_input_size(net) * net.batch; cl_mem input_cl = cl_make_array(input, size); forward_network_gpu(net, input_cl, 0, 0);