From 14303717dcddae43cdc55beb0685dae86f566fd8 Mon Sep 17 00:00:00 2001
From: Joseph Redmon
Date: Sat, 25 Oct 2014 11:57:26 -0700
Subject: [PATCH] Fast, needs to be faster

---
 Makefile                  |   4 +-
 src/axpy.c                |   8 +-
 src/cnn.c                 |  82 +++++++++++++++++--
 src/connected_layer.c     |   6 ++
 src/connected_layer.h     |   1 +
 src/convolutional_layer.c |   8 +-
 src/convolutional_layer.h |   1 +
 src/data.c                |  26 +++++-
 src/data.h                |   2 +
 src/gemm.c                | 146 ++++++++++++++++++++++++++++++++--
 src/gemm.cl               |   8 +-
 src/gemm_new.cl           | 162 ++++++++++++++++++++++++++++++++++++++
 src/image.c               |  20 ++++-
 src/image.h               |   1 +
 src/mini_blas.h           |   8 +-
 src/network.c             |  23 +++++-
 src/network.h             |   2 +
 src/opencl.c              |   6 +-
 src/parser.c              |  44 ++--------
 19 files changed, 484 insertions(+), 74 deletions(-)
 create mode 100644 src/gemm_new.cl

diff --git a/Makefile b/Makefile
index b5ad1eb0..29dccbbe 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 CC=gcc
 GPU=1
-COMMON=-Wall -Wfatal-errors `pkg-config --cflags opencv` -I/usr/local/cuda/include/
+COMMON=-Wall -Wfatal-errors `pkg-config --cflags opencv` -I/usr/local/cuda/include/ -I/usr/local/clblas/include/
 ifeq ($(GPU), 1)
 COMMON+=-DGPU
 else
@@ -15,7 +15,7 @@ endif
 else
 OPTS+= -march=native
 ifeq ($(GPU), 1)
-LDFLAGS= -lOpenCL
+LDFLAGS= -lOpenCL -lclBLAS
 endif
 endif
 CFLAGS= $(COMMON) $(OPTS)
diff --git a/src/axpy.c b/src/axpy.c
index c4ec1ebc..10ffca45 100644
--- a/src/axpy.c
+++ b/src/axpy.c
@@ -1,24 +1,24 @@
 #include "mini_blas.h"
 
-inline void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
+void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY)
 {
     int i;
     for(i = 0; i < N; ++i) Y[i*INCY] += ALPHA*X[i*INCX];
 }
 
-inline void scal_cpu(int N, float ALPHA, float *X, int INCX)
+void scal_cpu(int N, float ALPHA, float *X, int INCX)
 {
     int i;
     for(i = 0; i < N; ++i) X[i*INCX] *= ALPHA;
 }
 
-inline void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
+void copy_cpu(int N, float *X, int INCX, float *Y, int INCY)
 {
     int i;
     for(i = 0; i < N; ++i) Y[i*INCY] = X[i*INCX];
 }
 
-inline float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
+float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
 {
     int i;
     float dot = 0;
diff --git a/src/cnn.c b/src/cnn.c
index 7e90a809..a31e59cd 100644
--- a/src/cnn.c
+++ b/src/cnn.c
@@ -286,14 +286,16 @@ void train_assira()
     srand(2222222);
     int i = 0;
     char *labels[] = {"cat","dog"};
+    clock_t time;
     while(1){
         i += 1000;
+        time=clock();
         data train = load_data_image_pathfile_random("data/assira/train.list", imgs*net.batch, labels, 2, 256, 256);
         normalize_data_rows(train);
-        clock_t start = clock(), end;
-        float loss = train_network_sgd_gpu(net, train, imgs);
-        end = clock();
-        printf("%d: %f, Time: %lf seconds\n", i, loss, (float)(end-start)/CLOCKS_PER_SEC );
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+        time=clock();
+        float loss = train_network_sgd(net, train, imgs);
+        printf("%d: %f, Time: %lf seconds\n", i, loss, sec(clock()-time));
         free_data(train);
         if(i%10000==0){
             char buff[256];
@@ -304,9 +306,69 @@ void train_assira()
     }
 }
 
+void train_imagenet()
+{
+    network net = parse_network_cfg("cfg/imagenet_backup_710.cfg");
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = 1000/net.batch+1;
+    //imgs=1;
+    srand(888888);
+    int i = 0;
+    char **labels = get_labels("/home/pjreddie/data/imagenet/cls.labels.list");
+    list *plist = get_paths("/home/pjreddie/data/imagenet/cls.cropped.list");
+    char **paths = (char **)list_to_array(plist);
+    clock_t time;
+    while(1){
+        i += 1;
+        time=clock();
+        data train = load_data_random(imgs*net.batch, paths, plist->size, labels, 1000, 256, 256);
+        normalize_data_rows(train);
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+        time=clock();
+        #ifdef GPU
+        float loss = train_network_sgd_gpu(net, train, imgs);
+        printf("%d: %f, %lf seconds, %d images\n", i, loss, sec(clock()-time), i*imgs*net.batch);
+        #endif
+        free_data(train);
+        if(i%10==0){
+            char buff[256];
+            sprintf(buff, "/home/pjreddie/imagenet_backup/imagenet_backup_%d.cfg", i);
+            save_network(net, buff);
+        }
+    }
+}
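The timing printouts above lean on a sec() helper from utils.c that is not part of this patch; presumably it just converts a clock() delta to seconds. A minimal sketch under that assumption:

    #include <stdio.h>
    #include <time.h>

    /* Assumed shape of utils.c's sec(): turn a clock() difference into
       seconds. The call sites above (sec(clock()-time)) only make sense
       with something equivalent to this. */
    float sec(clock_t clocks)
    {
        return (float)clocks/CLOCKS_PER_SEC;
    }

    int main()
    {
        clock_t time = clock();
        volatile double x = 0;
        int i;
        for(i = 0; i < 10000000; ++i) x += i;   /* stand-in for load_data_random() */
        printf("Loaded: %lf seconds\n", (double)sec(clock()-time));
        return 0;
    }

Note that clock() measures processor time rather than wall-clock time, which is fine for this single-threaded loader.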
+
+void test_imagenet()
+{
+    network net = parse_network_cfg("cfg/imagenet_test.cfg");
+    //imgs=1;
+    srand(2222222);
+    int i = 0;
+    char **names = get_labels("cfg/shortnames.txt");
+    clock_t time;
+    char filename[256];
+    int indexes[10];
+    while(1){
+        gets(filename);
+        image im = load_image_color(filename, 256, 256);
+        normalize_image(im);
+        printf("%d %d %d\n", im.h, im.w, im.c);
+        float *X = im.data;
+        time=clock();
+        float *predictions = network_predict(net, X);
+        top_predictions(net, 10, indexes);
+        printf("%s: Predicted in %f seconds.\n", filename, sec(clock()-time));
+        for(i = 0; i < 10; ++i){
+            int index = indexes[i];
+            printf("%s: %f\n", names[index], predictions[index]);
+        }
+        free_image(im);
+    }
+}
+
 void test_visualize()
 {
-    network net = parse_network_cfg("cfg/voc_imagenet.cfg");
+    network net = parse_network_cfg("cfg/assira_backup_740000.cfg");
     srand(2222222);
     visualize_network(net);
     cvWaitKey(0);
@@ -322,7 +384,7 @@ void test_full()
     for(i = 0; i < total; ++i){
         visualize_network(net);
         cvWaitKey(100);
-        data test = load_data_image_pathfile_part("images/assira/test.list", i, total, labels, 2, 256, 256);
+        data test = load_data_image_pathfile_part("data/assira/test.list", i, total, labels, 2, 256, 256);
         image im = float_to_image(256, 256, 3,test.X.vals[0]);
         show_image(im, "input");
         cvWaitKey(100);
@@ -437,7 +499,7 @@ void train_nist()
     int iters = 10000/net.batch;
     while(++count <= 2000){
         clock_t start = clock(), end;
-        float loss = train_network_sgd_gpu(net, train, iters);
+        float loss = train_network_sgd(net, train, iters);
         end = clock();
         float test_acc = network_accuracy(net, test);
         //float test_acc = 0;
@@ -895,10 +957,14 @@ void test_distribution()
 
 int main(int argc, char *argv[])
 {
+    test_gpu_blas();
     //test_blas();
-    train_assira();
+    //train_assira();
+    //test_visualize();
     //test_distribution();
     //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+    //train_imagenet();
+    //test_imagenet();
 
     //test_blas();
     //test_visualize();
diff --git a/src/connected_layer.c b/src/connected_layer.c
index b41ae91f..dba0b2ac 100644
--- a/src/connected_layer.c
+++ b/src/connected_layer.c
@@ -114,6 +114,12 @@ void pull_connected_layer(connected_layer layer)
     cl_read_array(layer.biases_cl, layer.biases, layer.outputs);
 }
 
+void push_connected_layer(connected_layer layer)
+{
+    cl_write_array(layer.weights_cl, layer.weights, layer.inputs*layer.outputs);
+    cl_write_array(layer.biases_cl, layer.biases, layer.outputs);
+}
+
 void update_connected_layer_gpu(connected_layer layer)
 {
     axpy_ongpu(layer.outputs, layer.learning_rate, layer.bias_updates_cl, 1, layer.biases_cl, 1);
diff --git a/src/connected_layer.h b/src/connected_layer.h
index 19bcfa2a..1e5b4a70 100644
--- a/src/connected_layer.h
+++ b/src/connected_layer.h
@@ -48,6 +48,7 @@ void update_connected_layer(connected_layer layer);
 void forward_connected_layer_gpu(connected_layer layer, cl_mem input);
 void backward_connected_layer_gpu(connected_layer layer, cl_mem input, cl_mem delta);
 void update_connected_layer_gpu(connected_layer layer);
+void push_connected_layer(connected_layer layer);
 
 #endif
 #endif
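test_imagenet depends on the top_predictions() routine added further down in network.c. Its selection strategy is easy to study in isolation: n passes over the k outputs, each pass capped by the previous pass's maximum. A standalone sketch (the top_k name and test values are illustrative, not from the patch):

    #include <stdio.h>
    #include <float.h>

    /* Repeated-max selection, as in top_predictions: pass i finds the
       largest value strictly below pass i-1's maximum. O(n*k) time,
       no sorting, no extra memory. */
    void top_k(float *out, int k, int n, int *index)
    {
        int i, j;
        float thresh = FLT_MAX;
        for(i = 0; i < n; ++i){
            float max = -FLT_MAX;
            int max_i = -1;
            for(j = 0; j < k; ++j){
                if(out[j] > max && out[j] < thresh){
                    max = out[j];
                    max_i = j;
                }
            }
            index[i] = max_i;
            thresh = max;
        }
    }

    int main()
    {
        float probs[] = {0.1f, 0.4f, 0.05f, 0.3f, 0.15f};
        int top[3];
        top_k(probs, 5, 3, top);
        printf("%d %d %d\n", top[0], top[1], top[2]);   /* prints: 1 3 4 */
        return 0;
    }

One quirk worth knowing: the threshold comparison is strict, so an output exactly equal to an already-selected value is skipped and duplicated scores leave max_i at -1 for later passes. With float network outputs that is rarely an issue.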
diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c
index 0ed5a995..1587ae8d 100644
--- a/src/convolutional_layer.c
+++ b/src/convolutional_layer.c
@@ -212,7 +212,7 @@ void update_convolutional_layer(convolutional_layer layer)
 {
     int size = layer.size*layer.size*layer.c*layer.n;
     axpy_cpu(layer.n, layer.learning_rate, layer.bias_updates, 1, layer.biases, 1);
-    scal_cpu(layer.n,layer.momentum, layer.bias_updates, 1);
+    scal_cpu(layer.n, layer.momentum, layer.bias_updates, 1);
 
     scal_cpu(size, 1.-layer.learning_rate*layer.decay, layer.filters, 1);
     axpy_cpu(size, layer.learning_rate, layer.filter_updates, 1, layer.filters, 1);
@@ -434,6 +434,12 @@ void pull_convolutional_layer(convolutional_layer layer)
     cl_read_array(layer.biases_cl, layer.biases, layer.n);
 }
 
+void push_convolutional_layer(convolutional_layer layer)
+{
+    cl_write_array(layer.filters_cl, layer.filters, layer.c*layer.n*layer.size*layer.size);
+    cl_write_array(layer.biases_cl, layer.biases, layer.n);
+}
+
 void update_convolutional_layer_gpu(convolutional_layer layer)
 {
     int size = layer.size*layer.size*layer.c*layer.n;
diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h
index 465d3091..970a9b11 100644
--- a/src/convolutional_layer.h
+++ b/src/convolutional_layer.h
@@ -49,6 +49,7 @@ typedef struct {
 void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in);
 void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl);
 void update_convolutional_layer_gpu(convolutional_layer layer);
+void push_convolutional_layer(convolutional_layer layer);
 #endif
 
 convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, float learning_rate, float momentum, float decay);
diff --git a/src/data.c b/src/data.c
index aa8fecf2..734fffac 100644
--- a/src/data.c
+++ b/src/data.c
@@ -41,9 +41,11 @@ data load_data_image_paths(char **paths, int n, char **labels, int k, int h, int
     d.y = make_matrix(n, k);
 
     for(i = 0; i < n; ++i){
-        image im = load_image(paths[i], h, w);
+        image im = load_image_color(paths[i], h, w);
         d.X.vals[i] = im.data;
         d.X.cols = im.h*im.w*im.c;
+    }
+    for(i = 0; i < n; ++i){
         fill_truth(paths[i], labels, k, d.y.vals[i]);
     }
     return d;
@@ -60,6 +62,14 @@ data load_data_image_pathfile(char *filename, char **labels, int k, int h, int w
     return d;
 }
 
+char **get_labels(char *filename)
+{
+    list *plist = get_paths(filename);
+    char **labels = (char **)list_to_array(plist);
+    free_list(plist);
+    return labels;
+}
+
 void free_data(data d)
 {
     if(!d.shallow){
@@ -84,6 +94,20 @@ data load_data_image_pathfile_part(char *filename, int part, int total, char **l
     return d;
 }
 
+data load_data_random(int n, char **paths, int m, char **labels, int k, int h, int w)
+{
+    char **random_paths = calloc(n, sizeof(char*));
+    int i;
+    for(i = 0; i < n; ++i){
+        int index = rand()%m;
+        random_paths[i] = paths[index];
+        if(i == 0) printf("%s\n", paths[index]);
+    }
+    data d = load_data_image_paths(random_paths, n, labels, k, h, w);
+    free(random_paths);
+    return d;
+}
+
 data load_data_image_pathfile_random(char *filename, int n, char **labels, int k, int h, int w)
 {
     int i;
diff --git a/src/data.h b/src/data.h
index bd677e86..eefef8be 100644
--- a/src/data.h
+++ b/src/data.h
@@ -12,6 +12,7 @@ typedef struct{
 
 void free_data(data d);
 
+data load_data_random(int n, char **paths, int m, char **labels, int k, int h, int w);
 data load_data_image_pathfile(char *filename, char **labels, int k, int h, int w);
 data load_data_image_pathfile_part(char *filename, int part, int total,
                                    char **labels, int k, int h, int w);
@@ -20,6 +21,7 @@ data load_data_image_pathfile_random(char *filename, int n, char **labels,
 data load_cifar10_data(char *filename);
 data load_all_cifar10();
 list *get_paths(char *filename);
+char **get_labels(char *filename);
 void get_batch(data d, int n, float *X, float *y);
 data load_categorical_data_csv(char *filename, int target, int k);
 void normalize_data_rows(data d);
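load_data_random above draws every path independently with rand()%m, i.e. sampling with replacement: a batch is a bootstrap sample of the training list rather than a slice of a shuffled epoch. The core step, isolated (names and paths here are illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    /* Sample n items with replacement from an m-item list, as
       load_data_random does with image paths. rand()%m has a slight
       bias toward small indices unless m divides RAND_MAX+1, which is
       harmless for picking training examples. */
    void sample_paths(char **paths, int m, char **out, int n)
    {
        int i;
        for(i = 0; i < n; ++i){
            out[i] = paths[rand()%m];
        }
    }

    int main()
    {
        char *paths[] = {"a.jpg", "b.jpg", "c.jpg"};
        char *batch[5];
        int i;
        srand(888888);
        sample_paths(paths, 3, batch, 5);
        for(i = 0; i < 5; ++i) printf("%s\n", batch[i]);
        return 0;
    }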
diff --git a/src/gemm.c b/src/gemm.c
index fa78daf9..2e53b314 100644
--- a/src/gemm.c
+++ b/src/gemm.c
@@ -1,5 +1,5 @@
 #include "mini_blas.h"
-#include <stdlib.h>
+#include "utils.h"
 
 void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
         float *A, int lda,
@@ -104,6 +104,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
 
 #include "opencl.h"
 #include <math.h>
+#include <clBLAS.h>
 
 #define STR_HELPER(x) #x
 #define STR(x) STR_HELPER(x)
@@ -111,7 +112,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
 #ifdef __APPLE__
 #define BLOCK 1
 #else
-#define BLOCK 8
+#define BLOCK 16
 #endif
 
 cl_kernel get_gemm_kernel()
@@ -125,6 +126,44 @@ cl_kernel get_gemm_kernel()
     return gemm_kernel;
 }
 
+cl_kernel get_gemm_nt_kernel()
+{
+    static int init = 0;
+    static cl_kernel gemm_kernel;
+    if(!init){
+        gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_nt", "-D BLOCK=" STR(BLOCK) );
+        init = 1;
+    }
+    return gemm_kernel;
+}
+
+cl_kernel get_gemm_tn_kernel()
+{
+    static int init = 0;
+    static cl_kernel gemm_kernel;
+    if(!init){
+        gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_tn", "-D BLOCK=" STR(BLOCK) );
+        init = 1;
+    }
+    return gemm_kernel;
+}
+
+cl_kernel get_gemm_nn_kernel()
+{
+    static int init = 0;
+    static cl_kernel gemm_kernel;
+    if(!init){
+        gemm_kernel = get_kernel("src/gemm_new.cl", "gemm_nn", "-D BLOCK=" STR(BLOCK) );
+        init = 1;
+    }
+    return gemm_kernel;
+}
+
+void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA,
+        cl_mem A_gpu, int lda,
+        cl_mem B_gpu, int ldb,
+        float BETA,
+        cl_mem C_gpu, int ldc);
 void gemm_ongpu_old(int TA, int TB, int M, int N, int K, float ALPHA,
         cl_mem A_gpu, int lda,
         cl_mem B_gpu, int ldb,
@@ -137,10 +176,51 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
         float BETA,
         cl_mem C_gpu, int ldc)
 {
+    /*
     cl_setup();
-    //cl.error = clblasSgemm(clblasRowMajor, TA?clblasTrans:clblasNoTrans, TB?clblasTrans:clblasNoTrans,M, N, K,ALPHA, A_gpu, 0, lda,B_gpu, 0, ldb,BETA, C_gpu, 0, ldc,1, &queue, 0, NULL, &event);
-    //check_error(cl);
-    gemm_ongpu_old(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
+    cl_command_queue queue = cl.queue;
+    cl_event event;
+    cl.error = clblasSgemm(clblasRowMajor, TA?clblasTrans:clblasNoTrans, TB?clblasTrans:clblasNoTrans,M, N, K,ALPHA, A_gpu, 0, lda,B_gpu, 0, ldb,BETA, C_gpu, 0, ldc,1, &queue, 0, NULL, &event);
+    */
+
+    gemm_ongpu_new(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);
+}
+
+void gemm_ongpu_new(int TA, int TB, int M, int N, int K, float ALPHA,
+        cl_mem A_gpu, int lda,
+        cl_mem B_gpu, int ldb,
+        float BETA,
+        cl_mem C_gpu, int ldc)
+{
+    //printf("gpu: %d %d %d %d %d\n",TA, TB, M, N, K);
+    cl_setup();
+    cl_kernel gemm_kernel = get_gemm_kernel();
+    if(!TA && !TB) gemm_kernel = get_gemm_nn_kernel();
+    if(!TA && TB) gemm_kernel = get_gemm_nt_kernel();
+    if(TA && !TB) gemm_kernel = get_gemm_tn_kernel();
+    cl_command_queue queue = cl.queue;
+
+    cl_uint i = 0;
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu);
+    cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc);
+    check_error(cl);
+
+    const size_t global_size[] = {ceil((float)N/BLOCK)*BLOCK, ceil((float)M/BLOCK)*BLOCK};
+    const size_t local_size[] = {BLOCK, BLOCK};
+
+    clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0);
+    check_error(cl);
 }
 
 void gemm_ongpu_old(int TA, int TB, int M, int N, int K, float ALPHA,
@@ -170,7 +250,7 @@ void gemm_ongpu_old(int TA, int TB, int M, int N, int K, float ALPHA,
     cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc);
     check_error(cl);
 
-    const size_t global_size[] = {ceil((float)M/BLOCK)*BLOCK, ceil((float)N/BLOCK)*BLOCK};
+    const size_t global_size[] = {ceil((float)N/BLOCK)*BLOCK, ceil((float)M/BLOCK)*BLOCK};
     const size_t local_size[] = {BLOCK, BLOCK};
 
     clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0);
@@ -235,7 +315,7 @@ void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
     float *c = random_matrix(m,n);
     int i;
     clock_t start = clock(), end;
-    for(i = 0; i<10; ++i){
+    for(i = 0; i<32; ++i){
         gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n);
     }
     end = clock();
@@ -245,6 +325,39 @@ void time_gpu_random_matrix(int TA, int TB, int m, int k, int n)
     free(c);
 }
 
+void time_ongpu(int TA, int TB, int m, int k, int n)
+{
+    int iter = 100;
+    float *a = random_matrix(m,k);
+    float *b = random_matrix(k,n);
+
+    int lda = (!TA)?k:m;
+    int ldb = (!TB)?n:k;
+
+    float *c = random_matrix(m,n);
+
+    cl_mem a_cl = cl_make_array(a, m*k);
+    cl_mem b_cl = cl_make_array(b, k*n);
+    cl_mem c_cl = cl_make_array(c, m*n);
+
+    int i;
+    clock_t start = clock(), end;
+    for(i = 0; i<iter; ++i){
+        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
+    }
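The remainder of time_ongpu, the gemm.cl tweaks, and the 162 new lines of gemm_new.cl kernels did not survive in this copy of the patch, so a reference point: whatever tiling the gemm_nn/gemm_nt/gemm_tn kernels use, each must compute C = ALPHA*op(A)*op(B) + BETA*C over row-major buffers, and the host pads the launch so BLOCK x BLOCK work-groups exactly tile the output, with columns on dimension 0 and rows on dimension 1 (the same swap applied to gemm_ongpu_old above). A plain-C sketch of both facts (gemm_nn_ref is mine, not the kernel):

    #include <math.h>
    #include <stdio.h>

    #define BLOCK 16    /* the non-Apple BLOCK from gemm.c */

    /* Reference semantics of the NN case: row-major
       C[MxN] = ALPHA*A[MxK]*B[KxN] + BETA*C[MxN]. */
    void gemm_nn_ref(int M, int N, int K, float ALPHA,
            float *A, int lda, float *B, int ldb,
            float BETA, float *C, int ldc)
    {
        int i, j, k;
        for(i = 0; i < M; ++i){
            for(j = 0; j < N; ++j){
                float sum = BETA*C[i*ldc + j];
                for(k = 0; k < K; ++k){
                    sum += ALPHA*A[i*lda + k]*B[k*ldb + j];
                }
                C[i*ldc + j] = sum;
            }
        }
    }

    int main()
    {
        /* Launch geometry from gemm_ongpu_new: dimension 0 spans N,
           dimension 1 spans M, each rounded up to a BLOCK multiple so
           the local size divides the global size. */
        int M = 100, N = 70;
        size_t global_size[] = {(size_t)(ceil((float)N/BLOCK)*BLOCK),
                                (size_t)(ceil((float)M/BLOCK)*BLOCK)};
        printf("global = {%zu, %zu}, local = {%d, %d}\n",
                global_size[0], global_size[1], BLOCK, BLOCK);

        /* Sanity check: C = 1*A*B + 0*C. */
        float A[] = {1,2,3, 4,5,6};        /* 2x3 */
        float B[] = {7,8, 9,10, 11,12};    /* 3x2 */
        float C[4] = {0};
        gemm_nn_ref(2, 2, 3, 1, A, 3, B, 2, 0, C, 2);
        printf("%g %g / %g %g\n", C[0], C[1], C[2], C[3]);  /* 58 64 / 139 154 */
        return 0;
    }

Padding the global size also means the kernels must bounds-check against M and N, since up to BLOCK-1 extra work-items per dimension fall outside the output matrix.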
diff --git a/src/image.c b/src/image.c
--- a/src/image.c
+++ b/src/image.c
+image load_image_color(char *filename, int h, int w)
+{
+    IplImage* src = cvLoadImage(filename, 1);
+    if(src && (src->height != h || src->width != w)){
+        printf("Resized!\n");
+        IplImage *resized = resizeImage(src, h, w, 1);
+        cvReleaseImage(&src);
+        src = resized;
+    }
+    image out = ipl_to_image(src);
+    cvReleaseImage(&src);
+    return out;
+}
+
 image load_image(char *filename, int h, int w)
 {
     IplImage* src = 0;
diff --git a/src/image.h b/src/image.h
index fe257425..9f7fc8e4 100644
--- a/src/image.h
+++ b/src/image.h
@@ -45,6 +45,7 @@ image make_random_kernel(int size, int c, float scale);
 image float_to_image(int h, int w, int c, float *data);
 image copy_image(image p);
 image load_image(char *filename, int h, int w);
+image load_image_color(char *filename, int h, int w);
 image ipl_to_image(IplImage* src);
 
 float get_pixel(image m, int x, int y, int c);
diff --git a/src/mini_blas.h b/src/mini_blas.h
index a155c351..923afc7c 100644
--- a/src/mini_blas.h
+++ b/src/mini_blas.h
@@ -55,8 +55,8 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
         float *B, int ldb,
         float BETA,
         float *C, int ldc);
-inline void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
-inline void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
-inline void scal_cpu(int N, float ALPHA, float *X, int INCX);
-inline float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
+void axpy_cpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
+void copy_cpu(int N, float *X, int INCX, float *Y, int INCY);
+void scal_cpu(int N, float ALPHA, float *X, int INCX);
+float dot_cpu(int N, float *X, int INCX, float *Y, int INCY);
 void test_gpu_blas();
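Dropping inline from these prototypes is more than style. In C99, an inline definition that is never declared extern does not have to provide an external symbol, so cross-file calls to axpy_cpu and friends can fail at link time depending on the compiler's inline model and optimization level (gcc's legacy gnu89 mode happens to emit the symbol, which is why this ever worked). As plain external functions they behave predictably:

    #include <stdio.h>
    #include "mini_blas.h"   /* the patched declarations above */

    int main()
    {
        float x[] = {1, 2, 3};
        float y[] = {4, 5, 6};
        axpy_cpu(3, 2.f, x, 1, y, 1);              /* y = 2*x + y = {6, 9, 12} */
        printf("%f\n", dot_cpu(3, x, 1, y, 1));    /* 1*6 + 2*9 + 3*12 = 60 */
        return 0;
    }

Built as gcc example.c src/axpy.c, this links cleanly with or without optimization, which was presumably the point of the change.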
diff --git a/src/network.c b/src/network.c
index 6696769d..5ea449c9 100644
--- a/src/network.c
+++ b/src/network.c
@@ -621,7 +621,7 @@ void visualize_network(network net)
     image *prev = 0;
     int i;
     char buff[256];
-    show_image(get_network_image_layer(net, 0), "Crop");
+    //show_image(get_network_image_layer(net, 0), "Crop");
     for(i = 0; i < net.n; ++i){
         sprintf(buff, "Layer %d", i);
         if(net.types[i] == CONVOLUTIONAL){
@@ -635,6 +635,27 @@ void visualize_network(network net)
     }
 }
 
+void top_predictions(network net, int n, int *index)
+{
+    int i,j;
+    int k = get_network_output_size(net);
+    float *out = get_network_output(net);
+    float thresh = FLT_MAX;
+    for(i = 0; i < n; ++i){
+        float max = -FLT_MAX;
+        int max_i = -1;
+        for(j = 0; j < k; ++j){
+            float val = out[j];
+            if(val > max && val < thresh){
+                max = val;
+                max_i = j;
+            }
+        }
+        index[i] = max_i;
+        thresh = max;
+    }
+}
+
 float *network_predict(network net, float *input)
 {
     forward_network(net, input, 0, 0);
diff --git a/src/network.h b/src/network.h
index 22e277c9..c95f6fa2 100644
--- a/src/network.h
+++ b/src/network.h
@@ -52,8 +52,10 @@ float train_network_sgd(network net, data d, int n);
 float train_network_batch(network net, data d, int n);
 void train_network(network net, data d);
 matrix network_predict_data(network net, data test);
+float *network_predict(network net, float *input);
 float network_accuracy(network net, data d);
 float network_accuracy_multi(network net, data d, int n);
+void top_predictions(network net, int n, int *index);
 float *get_network_output(network net);
 float *get_network_output_layer(network net, int i);
 float *get_network_delta_layer(network net, int i);
diff --git a/src/opencl.c b/src/opencl.c
index a2e73668..604a2e3f 100644
--- a/src/opencl.c
+++ b/src/opencl.c
@@ -4,7 +4,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
-//#include <clBLAS.h>
+#include <clBLAS.h>
 
 #include "opencl.h"
 #include "utils.h"
@@ -81,7 +81,7 @@ cl_info cl_init()
     }
     int index = getpid()%num_devices;
-    index = 0;
+    index = 1;
     printf("%d rand, %d devices, %d index\n", getpid(), num_devices, index);
     info.device = devices[index];
     fprintf(stderr, "Found %d device(s)\n", num_devices);
@@ -99,7 +99,7 @@ cl_info cl_init()
         info.queues[i] = clCreateCommandQueue(info.context, info.device, 0, &info.error);
         check_error(info);
     }
-    //info.error = clblasSetup();
+    info.error = clblasSetup();
     check_error(info);
     info.initialized = 1;
     return info;
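With clblasSetup() now running in cl_init and -lclBLAS in the Makefile, the clBLAS path that gemm_ongpu keeps commented out becomes viable. For reference, a minimal wrapper around the same clblasSgemm call as in that comment (the queue and cl_mem buffers are assumed to come from the cl_info machinery in src/opencl.c; error handling elided):

    #include <clBLAS.h>

    /* Row-major SGEMM on already-resident device buffers. clblasSetup()
       must have run once before this (cl_init now does it). */
    void sgemm_clblas(cl_command_queue queue, int TA, int TB,
            int M, int N, int K, float ALPHA,
            cl_mem A_gpu, int lda, cl_mem B_gpu, int ldb,
            float BETA, cl_mem C_gpu, int ldc)
    {
        cl_event event;
        clblasSgemm(clblasRowMajor,
                TA ? clblasTrans : clblasNoTrans,
                TB ? clblasTrans : clblasNoTrans,
                M, N, K,
                ALPHA, A_gpu, 0, lda,
                B_gpu, 0, ldb,
                BETA, C_gpu, 0, ldc,
                1, &queue, 0, NULL, &event);
        clWaitForEvents(1, &event);   /* block so timings measure the multiply */
    }

The matching clblasTeardown() belongs at shutdown; nothing in this patch calls it, which is harmless for a process that runs until killed.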
diff --git a/src/parser.c b/src/parser.c
index 9bd2eb76..79d4a3a9 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -67,7 +67,6 @@ void parse_data(char *data, float *a, int n)
 
 convolutional_layer *parse_convolutional(list *options, network *net, int count)
 {
-    int i;
     int h,w,c;
     float learning_rate, momentum, decay;
     int n = option_find_int(options, "filters",1);
@@ -98,34 +97,19 @@ convolutional_layer *parse_convolutional(list *options, network *net, int count)
         if(h == 0) error("Layer before convolutional layer must output image.");
     }
     convolutional_layer *layer = make_convolutional_layer(net->batch,h,w,c,n,size,stride,pad,activation,learning_rate,momentum,decay);
-    char *data = option_find_str(options, "data", 0);
-    if(data){
-        char *curr = data;
-        char *next = data;
-        for(i = 0; i < n; ++i){
-            while(*++next !='\0' && *next != ',');
-            *next = '\0';
-            sscanf(curr, "%g", &layer->biases[i]);
-            curr = next+1;
-        }
-        for(i = 0; i < c*n*size*size; ++i){
-            while(*++next !='\0' && *next != ',');
-            *next = '\0';
-            sscanf(curr, "%g", &layer->filters[i]);
-            curr = next+1;
-        }
-    }
     char *weights = option_find_str(options, "weights", 0);
     char *biases = option_find_str(options, "biases", 0);
-    parse_data(biases, layer->biases, n);
     parse_data(weights, layer->filters, c*n*size*size);
+    parse_data(biases, layer->biases, n);
+    #ifdef GPU
+    push_convolutional_layer(*layer);
+    #endif
     option_unused(options);
     return layer;
 }
 
 connected_layer *parse_connected(list *options, network *net, int count)
 {
-    int i;
     int input;
     float learning_rate, momentum, decay;
     int output = option_find_int(options, "output",1);
@@ -147,27 +131,13 @@ connected_layer *parse_connected(list *options, network *net, int count)
         input = get_network_output_size_layer(*net, count-1);
     }
     connected_layer *layer = make_connected_layer(net->batch, input, output, activation,learning_rate,momentum,decay);
-    char *data = option_find_str(options, "data", 0);
-    if(data){
-        char *curr = data;
-        char *next = data;
-        for(i = 0; i < output; ++i){
-            while(*++next !='\0' && *next != ',');
-            *next = '\0';
-            sscanf(curr, "%g", &layer->biases[i]);
-            curr = next+1;
-        }
-        for(i = 0; i < input*output; ++i){
-            while(*++next !='\0' && *next != ',');
-            *next = '\0';
-            sscanf(curr, "%g", &layer->weights[i]);
-            curr = next+1;
-        }
-    }
     char *weights = option_find_str(options, "weights", 0);
     char *biases = option_find_str(options, "biases", 0);
     parse_data(biases, layer->biases, output);
     parse_data(weights, layer->weights, input*output);
+    #ifdef GPU
+    push_connected_layer(*layer);
+    #endif
     option_unused(options);
     return layer;
 }
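The hand-rolled comma-splitting loops deleted above are now funneled through parse_data(), whose body is outside this patch. A sketch of what such a helper has to do with the weights=/biases= option strings (this standalone version is illustrative, not the actual parser.c code):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Fill a[0..n-1] from a comma-separated float string. Must accept
       data == NULL, because option_find_str() returns 0 when the key
       is absent and the callers above don't check. */
    void parse_csv_floats(char *data, float *a, int n)
    {
        int i;
        char *curr = data;
        if(!data) return;
        for(i = 0; i < n && curr; ++i){
            a[i] = atof(curr);      /* reads up to the comma */
            curr = strchr(curr, ',');
            if(curr) ++curr;        /* step past the comma */
        }
    }

    int main()
    {
        float w[4] = {0};
        char buf[] = "0.5,-1.25,3e-2,7";
        parse_csv_floats(buf, w, 4);
        printf("%g %g %g %g\n", w[0], w[1], w[2], w[3]);
        return 0;
    }

The #ifdef GPU push_*_layer() calls right after parsing are the other half of the change: freshly loaded weights are copied to the device immediately, so the GPU never trains or predicts from stale buffers.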