diff --git a/src/cnn.c b/src/cnn.c
index 10705fd3..be93e8c0 100644
--- a/src/cnn.c
+++ b/src/cnn.c
@@ -210,10 +210,10 @@ void train_imagenet(char *cfgfile)
     //network net = parse_network_cfg("/home/pjreddie/imagenet_backup/alexnet_1270.cfg");
     srand(time(0));
     network net = parse_network_cfg(cfgfile);
-    set_learning_network(&net, net.learning_rate, 0, net.decay);
+    set_learning_network(&net, net.learning_rate*10., net.momentum, net.decay);
     printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
     int imgs = 1024;
-    int i = 0;
+    int i = 6600;
     char **labels = get_labels("/home/pjreddie/data/imagenet/cls.labels.list");
     list *plist = get_paths("/data/imagenet/cls.train.list");
     char **paths = (char **)list_to_array(plist);
@@ -228,9 +228,9 @@ void train_imagenet(char *cfgfile)
         time=clock();
         pthread_join(load_thread, 0);
         train = buffer;
-        //normalize_data_rows(train);
-        translate_data_rows(train, -128);
-        scale_data_rows(train, 1./128);
+        normalize_data_rows(train);
+        //translate_data_rows(train, -128);
+        //scale_data_rows(train, 1./128);
         load_thread = load_data_thread(paths, imgs, plist->size, labels, 1000, 256, 256, &buffer);
         printf("Loaded: %lf seconds\n", sec(clock()-time));
         time=clock();
@@ -539,12 +539,14 @@ void visualize_cat()

 void test_correct_nist()
 {
+    network net = parse_network_cfg("cfg/nist_conv.cfg");
+    test_learn_bias(*(convolutional_layer *)net.layers[0]);
     srand(222222);
-    network net = parse_network_cfg("cfg/nist.cfg");
+    net = parse_network_cfg("cfg/nist_conv.cfg");
     data train = load_categorical_data_csv("data/mnist/mnist_train.csv", 0, 10);
     data test = load_categorical_data_csv("data/mnist/mnist_test.csv",0,10);
-    translate_data_rows(train, -144);
-    translate_data_rows(test, -144);
+    normalize_data_rows(train);
+    normalize_data_rows(test);
     int count = 0;
     int iters = 1000/net.batch;

@@ -555,11 +557,12 @@ void test_correct_nist()
         float test_acc = network_accuracy(net, test);
         printf("%d: Loss: %f, Test Acc: %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", count, loss, test_acc,(float)(end-start)/CLOCKS_PER_SEC, net.learning_rate, net.momentum, net.decay);
     }
+    save_network(net, "cfg/nist_gpu.cfg");

     gpu_index = -1;
     count = 0;
     srand(222222);
-    net = parse_network_cfg("cfg/nist.cfg");
+    net = parse_network_cfg("cfg/nist_conv.cfg");
     while(++count <= 5){
         clock_t start = clock(), end;
         float loss = train_network_sgd(net, train, iters);
@@ -567,6 +570,7 @@ void test_correct_nist()
         float test_acc = network_accuracy(net, test);
         printf("%d: Loss: %f, Test Acc: %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", count, loss, test_acc,(float)(end-start)/CLOCKS_PER_SEC, net.learning_rate, net.momentum, net.decay);
     }
+    save_network(net, "cfg/nist_cpu.cfg");
 }

 void test_correct_alexnet()
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index fc5cb0e5..4e8c44bd 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -305,6 +305,27 @@ void learn_bias_convolutional_layer_ongpu(convolutional_layer layer)
     check_error(cl);
 }

+void test_learn_bias(convolutional_layer l)
+{
+    int i;
+    int size = convolutional_out_height(l) * convolutional_out_width(l);
+    for(i = 0; i < size*l.batch*l.n; ++i){
+        l.delta[i] = rand_uniform();
+    }
+    for(i = 0; i < l.n; ++i){
+        l.bias_updates[i] = rand_uniform();
+    }
+    cl_write_array(l.delta_cl, l.delta, size*l.batch*l.n);
+    cl_write_array(l.bias_updates_cl, l.bias_updates, l.n);
+    float *gpu = calloc(l.n, sizeof(float));
+    cl_read_array(l.bias_updates_cl, gpu, l.n);
+    for(i = 0; i < l.n; ++i) printf("%.9g %.9g\n", l.bias_updates[i], gpu[i]);
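+    // Run the bias-gradient reduction on the GPU and on the CPU reference path,
+    // then read back and compare; both printed columns should agree elementwise.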
+    learn_bias_convolutional_layer_ongpu(l);
+    learn_bias_convolutional_layer(l);
+    cl_read_array(l.bias_updates_cl, gpu, l.n);
+    for(i = 0; i < l.n; ++i) printf("%.9g %.9g\n", l.bias_updates[i], gpu[i]);
+}
+
 cl_kernel get_convolutional_bias_kernel()
 {
     static int init = 0;
diff --git a/src/convolutional_layer.cl b/src/convolutional_layer.cl
index 3b091cf0..903471be 100644
--- a/src/convolutional_layer.cl
+++ b/src/convolutional_layer.cl
@@ -19,7 +19,7 @@ __kernel void learn_bias(int batch, int n, int size, __global float *delta, __gl
     for(b = 0; b < batch; ++b){
         for(i = 0; i < size; i += BLOCK){
             int index = p + i + size*(filter + n*b);
-            sum += (index < size) ? delta[index] : 0;
+            sum += (p+i < size) ? delta[index] : 0;
         }
     }
     part[p] = sum;
diff --git a/src/gemm.c b/src/gemm.c
index d1782b1a..83949914 100644
--- a/src/gemm.c
+++ b/src/gemm.c
@@ -162,6 +162,26 @@ cl_kernel get_gemm_nn_kernel()
     return gemm_kernel;
 }

+#define TILE 64
+#define TILE_K 16
+#define WPT 8
+#define THREADS (TILE*TILE)/(WPT*WPT)
+
+cl_kernel get_gemm_nn_fast_kernel()
+{
+    static int init = 0;
+    static cl_kernel gemm_kernel;
+    if(!init){
+        gemm_kernel = get_kernel("src/gemm_fast.cl", "gemm_nn_fast", "-D TILE=" STR(TILE)
+                " -cl-nv-verbose "
+                " -D TILE_K=" STR(TILE_K)
+                " -D WPT=" STR(WPT)
+                " -D THREADS=" STR(THREADS));
+        init = 1;
+    }
+    return gemm_kernel;
+}
+
 void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
         cl_mem A_gpu, int lda,
         cl_mem B_gpu, int ldb,
@@ -171,6 +191,45 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
     gemm_ongpu_offset(TA, TB, M, N, K, ALPHA, A_gpu, 0, lda, B_gpu, 0, ldb, BETA, C_gpu, 0, ldc);
 }

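+// Assumed reading, from the macro names and the launch math below: gemm_nn_fast is a
+// register-tiled kernel, one work-group of THREADS work-items per TILE x TILE tile of C,
+// so each work-item accumulates a WPT*WPT block of outputs.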
+void gemm_ongpu_fast(int TA, int TB, int M, int N, int K, float ALPHA,
+        cl_mem A_gpu, int lda,
+        cl_mem B_gpu, int ldb,
+        float BETA,
+        cl_mem C_gpu, int ldc)
+{
+    int a_off = 0;
+    int b_off = 0;
+    int c_off = 0;
+    //printf("gpu: %d %d %d %d %d\n",TA, TB, M, N, K);
+    cl_kernel gemm_kernel = get_gemm_nn_fast_kernel();
+    cl_command_queue queue = cl.queue;
+
+    cl_uint i = 0;
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(a_off), (void*) &a_off);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(b_off), (void*) &b_off);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(c_off), (void*) &c_off);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc);
+    check_error(cl);
+
+    const size_t global_size[] = {THREADS*((N-1)/TILE + 1), (M-1)/TILE + 1};
+    const size_t local_size[] = {THREADS, 1};
+
+    cl.error = clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0);
+    check_error(cl);
+}
+
 void gemm_ongpu_offset(int TA, int TB, int M, int N, int K, float ALPHA,
         cl_mem A_gpu, int a_off, int lda,
         cl_mem B_gpu, int b_off, int ldb,
@@ -214,7 +273,7 @@ void gemm_ongpu_offset(int TA, int TB, int M, int N, int K, float ALPHA,

     cl.error = clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0);
     check_error(cl);
-    #endif
+#endif
 }

 void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
@@ -244,7 +303,9 @@ void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
             size, C, &cl.error);
     check_error(cl);

-    gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
+    // TODO
+    //gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
+    gemm_ongpu_fast(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);

     clEnqueueReadBuffer(queue, C_gpu, CL_TRUE, 0, size, C, 0, 0, 0);
     check_error(cl);
@@ -303,7 +364,7 @@ void time_ongpu(int TA, int TB, int m, int k, int n)
     for(i = 0; i