diff --git a/src/batchnorm_layer.c b/src/batchnorm_layer.c index 44432918..883ab344 100644 --- a/src/batchnorm_layer.c +++ b/src/batchnorm_layer.c @@ -54,8 +54,8 @@ layer make_batchnorm_layer(int batch, int w, int h, int c) layer.x_norm_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs); #ifdef CUDNN cudnnCreateTensorDescriptor(&layer.normTensorDesc); - cudnnCreateTensorDescriptor(&layer.dstTensorDesc); - cudnnSetTensor4dDescriptor(layer.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, layer.batch, layer.out_c, layer.out_h, layer.out_w); + cudnnCreateTensorDescriptor(&layer.normDstTensorDesc); + cudnnSetTensor4dDescriptor(layer.normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, layer.batch, layer.out_c, layer.out_h, layer.out_w); cudnnSetTensor4dDescriptor(layer.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, layer.out_c, 1, 1); #endif #endif @@ -189,9 +189,9 @@ void forward_batchnorm_layer_gpu(layer l, network_state state) CUDNN_BATCHNORM_SPATIAL, &one, &zero, - l.dstTensorDesc, + l.normDstTensorDesc, l.x_gpu, - l.dstTensorDesc, + l.normDstTensorDesc, l.output_gpu, l.normTensorDesc, l.scales_gpu, @@ -242,11 +242,11 @@ void backward_batchnorm_layer_gpu(layer l, network_state state) &zero, &one, &one, - l.dstTensorDesc, + l.normDstTensorDesc, l.x_gpu, - l.dstTensorDesc, + l.normDstTensorDesc, l.delta_gpu, - l.dstTensorDesc, + l.normDstTensorDesc, l.x_norm_gpu, l.normTensorDesc, l.scales_gpu, diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index fb606ae8..cd369292 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -177,6 +177,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) // batch norm cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); + cudnnSetTensor4dDescriptor(l->normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); #if(CUDNN_MAJOR >= 6) cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION, CUDNN_DATA_FLOAT); // cudnn >= 6.0 #else @@ -190,6 +191,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) forward_algo = CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; backward_algo = CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE; backward_filter = CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE; + printf(" CUDNN-slow "); } cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), @@ -216,6 +218,38 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) backward_filter, 0, &l->bf_algo); + + if (data_type == CUDNN_DATA_HALF) + { + // HALF-16 if(data_type == CUDNN_DATA_HALF) + l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + + // FLOAT-32 if(data_type == CUDNN_DATA_FLOAT) + //l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED; + //l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED; + //l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED; + + int fw = 0, bd = 0, bf = 0; + if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) fw = 1; + //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM \n"); + if (l->fw_algo == CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED) fw = 2; + //printf("Tensor Cores - Forward enabled: l->fw_algo = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED \n"); + + if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_1) bd = 1; + //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 \n"); + if (l->bd_algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED) bd = 2; + //printf("Tensor Cores - Backward-data enabled: l->bd_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED \n"); + + if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1) bf = 1; + //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 \n"); + if (l->bf_algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED) bf = 2; + //printf("Tensor Cores - Backward-filter enabled: l->bf_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED \n"); + + if (fw == 2 && bd == 2 && bf == 2) printf("TF "); + else if (fw >= 1 && bd >= 1 && bf >= 1) printf("TH "); + } } #endif #endif @@ -343,7 +377,8 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.x_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); } -#ifdef CUDNN +#ifdef CUDNN + cudnnCreateTensorDescriptor(&l.normDstTensorDesc); cudnnCreateTensorDescriptor(&l.normTensorDesc); cudnnCreateTensorDescriptor(&l.srcTensorDesc); cudnnCreateTensorDescriptor(&l.dstTensorDesc); diff --git a/src/detector.c b/src/detector.c index 46ea1daf..a0372ab7 100644 --- a/src/detector.c +++ b/src/detector.c @@ -91,7 +91,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i args.small_object = net.small_object; args.d = &buffer; args.type = DETECTION_DATA; - args.threads = 64; // 8 + args.threads = 16; // 64 args.angle = net.angle; args.exposure = net.exposure; @@ -99,6 +99,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i args.hue = net.hue; #ifdef OPENCV + args.threads = 7; IplImage* img = NULL; float max_img_loss = 5; int number_of_lines = 100; @@ -108,7 +109,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i #endif //OPENCV pthread_t load_thread = load_data(args); - clock_t time; + double time; int count = 0; //while(i*imgs < N*120){ while(get_current_batch(net) < net.max_batches){ @@ -131,7 +132,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i } net = nets[0]; } - time=clock(); + time=what_time_is_it_now(); pthread_join(load_thread, 0); train = buffer; load_thread = load_data(args); @@ -153,9 +154,9 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i save_image(im, "truth11"); */ - printf("Loaded: %lf seconds\n", sec(clock()-time)); + printf("Loaded: %lf seconds\n", (what_time_is_it_now()-time)); - time=clock(); + time=what_time_is_it_now(); float loss = 0; #ifdef GPU if(ngpus == 1){ @@ -170,7 +171,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i avg_loss = avg_loss*.9 + loss*.1; i = get_current_batch(net); - printf("\n %d: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs); + printf("\n %d: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), (what_time_is_it_now()-time), i*imgs); #ifdef OPENCV if(!dont_show) @@ -291,11 +292,11 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out int *map = 0; if (mapf) map = read_map(mapf); - network net = parse_network_cfg_custom(cfgfile, 1); + network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1 if (weightfile) { load_weights(&net, weightfile); } - set_batch_network(&net, 1); + //set_batch_network(&net, 1); fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); srand(time(0)); @@ -414,11 +415,11 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out void validate_detector_recall(char *datacfg, char *cfgfile, char *weightfile) { - network net = parse_network_cfg_custom(cfgfile, 1); + network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1 if (weightfile) { load_weights(&net, weightfile); } - set_batch_network(&net, 1); + //set_batch_network(&net, 1); fuse_conv_batchnorm(net); srand(time(0)); @@ -522,11 +523,11 @@ void validate_detector_map(char *datacfg, char *cfgfile, char *weightfile, float int *map = 0; if (mapf) map = read_map(mapf); - network net = parse_network_cfg_custom(cfgfile, 1); + network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1 if (weightfile) { load_weights(&net, weightfile); } - set_batch_network(&net, 1); + //set_batch_network(&net, 1); fuse_conv_batchnorm(net); srand(time(0)); @@ -1020,14 +1021,14 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam char **names = get_labels(name_list); image **alphabet = load_alphabet(); - network net = parse_network_cfg_custom(cfgfile, 1); + network net = parse_network_cfg_custom(cfgfile, 1); // set batch=1 if(weightfile){ load_weights(&net, weightfile); } - set_batch_network(&net, 1); + //set_batch_network(&net, 1); fuse_conv_batchnorm(net); srand(2222222); - clock_t time; + double time; char buff[256]; char *input = buff; int j; @@ -1054,10 +1055,10 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam //for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(l.classes, sizeof(float *)); float *X = sized.data; - time=clock(); + time= what_time_is_it_now(); network_predict(net, X); //network_predict_image(&net, im); - printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); + printf("%s: Predicted in %f seconds.\n", input, (what_time_is_it_now()-time)); //get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0); // if (nms) do_nms_sort_v2(boxes, probs, l.w*l.h*l.n, l.classes, nms); //draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes); diff --git a/src/layer.h b/src/layer.h index 5d55e1cb..75c0358a 100644 --- a/src/layer.h +++ b/src/layer.h @@ -281,7 +281,7 @@ struct layer{ #ifdef CUDNN cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc; cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc; - cudnnTensorDescriptor_t normTensorDesc; + cudnnTensorDescriptor_t normTensorDesc, normDstTensorDesc; cudnnFilterDescriptor_t weightDesc; cudnnFilterDescriptor_t dweightDesc; cudnnConvolutionDescriptor_t convDesc; diff --git a/src/utils.c b/src/utils.c index a97d9661..615d8369 100644 --- a/src/utils.c +++ b/src/utils.c @@ -7,13 +7,24 @@ #include #ifdef WIN32 #include "unistd.h" +#include "gettimeofday.h" #else #include +#include #endif #include "utils.h" #pragma warning(disable: 4996) +double what_time_is_it_now() +{ + struct timeval time; + if (gettimeofday(&time, NULL)) { + return 0; + } + return (double)time.tv_sec + (double)time.tv_usec * .000001; +} + int *read_map(char *filename) { int n = 0; diff --git a/src/utils.h b/src/utils.h index d56931c4..8e8e1c75 100644 --- a/src/utils.h +++ b/src/utils.h @@ -25,6 +25,7 @@ #endif #endif +double what_time_is_it_now(); int *read_map(char *filename); void shuffle(void *arr, size_t n, size_t size); void sorta_shuffle(void *arr, size_t n, size_t size, size_t sections);