From cc06817efa24f20811ef6b32143c6700a91c5f2a Mon Sep 17 00:00:00 2001 From: Joseph Redmon Date: Fri, 11 Apr 2014 01:00:27 -0700 Subject: [PATCH] Attempt at visualizing ImageNet Features --- Makefile | 10 ++- src/convolutional_layer.c | 69 ++++++++--------- src/convolutional_layer.h | 2 +- src/cpu_gemm.c | 86 +++++++++++++++++++++ src/gemm.cl | 72 ++++++++++++++++++ src/gpu_gemm.c | 153 ++++++++++++++++++++++++++++++++++++++ src/image.c | 94 +++++++++++++++++++++++ src/image.h | 5 ++ src/mini_blas.c | 148 ++++++++++++++++++++++++++---------- src/mini_blas.h | 13 ++++ src/network.c | 4 +- src/opencl.c | 77 +++++++++++++++++++ src/opencl.h | 21 ++++++ src/tests.c | 75 ++++++++++++++++--- 14 files changed, 737 insertions(+), 92 deletions(-) create mode 100644 src/cpu_gemm.c create mode 100644 src/gemm.cl create mode 100644 src/gpu_gemm.c create mode 100644 src/opencl.c create mode 100644 src/opencl.h diff --git a/Makefile b/Makefile index a02d7ef7..07cf79fb 100644 --- a/Makefile +++ b/Makefile @@ -2,17 +2,19 @@ CC=gcc COMMON=-Wall `pkg-config --cflags opencv` UNAME = $(shell uname) ifeq ($(UNAME), Darwin) -COMMON += -isystem /usr/local/Cellar/opencv/2.4.6.1/include/opencv -isystem /usr/local/Cellar/opencv/2.4.6.1/include +COMMON+= -isystem /usr/local/Cellar/opencv/2.4.6.1/include/opencv -isystem /usr/local/Cellar/opencv/2.4.6.1/include +LDFLAGS= -framework OpenCL else -COMMON += -march=native -flto +COMMON+= -march=native -flto +LDFLAGS= -lOpenCL endif CFLAGS= $(COMMON) -Ofast #CFLAGS= $(COMMON) -O0 -g -LDFLAGS=`pkg-config --libs opencv` -lm +LDFLAGS+=`pkg-config --libs opencv` -lm VPATH=./src/ EXEC=cnn -OBJ=network.o image.o tests.o connected_layer.o maxpool_layer.o activations.o list.o option_list.o parser.o utils.o data.o matrix.o softmax_layer.o mini_blas.o convolutional_layer.o +OBJ=network.o image.o tests.o connected_layer.o maxpool_layer.o activations.o list.o option_list.o parser.o utils.o data.o matrix.o softmax_layer.o mini_blas.o convolutional_layer.o opencl.o gpu_gemm.o cpu_gemm.o all: $(EXEC) diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index f7c9c102..40d58584 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -285,52 +285,47 @@ image get_convolutional_filter(convolutional_layer layer, int i) return float_to_image(h,w,c,layer.filters+i*h*w*c); } -void visualize_convolutional_layer(convolutional_layer layer, char *window) +image *weighted_sum_filters(convolutional_layer layer, image *prev_filters) { - int color = 1; - int border = 1; - int h,w,c; - int size = layer.size; - h = size; - w = (size + border) * layer.n - border; - c = layer.c; - if(c != 3 || !color){ - h = (h+border)*c - border; - c = 1; + image *filters = calloc(layer.n, sizeof(image)); + int i,j,k,c; + if(!prev_filters){ + for(i = 0; i < layer.n; ++i){ + filters[i] = copy_image(get_convolutional_filter(layer, i)); + } } - - image filters = make_image(h,w,c); - int i,j; - for(i = 0; i < layer.n; ++i){ - int w_offset = i*(size+border); - image k = get_convolutional_filter(layer, i); - //printf("%f ** ", layer.biases[i]); - //print_image(k); - image copy = copy_image(k); - normalize_image(copy); - for(j = 0; j < k.c; ++j){ - //set_pixel(copy,0,0,j,layer.biases[i]); - } - if(c == 3 && color){ - embed_image(copy, filters, 0, w_offset); - } - else{ - for(j = 0; j < k.c; ++j){ - int h_offset = j*(size+border); - image layer = get_image_layer(k, j); - embed_image(layer, filters, h_offset, w_offset); - free_image(layer); + else{ + image base = prev_filters[0]; + for(i = 0; i < layer.n; ++i){ + image filter = get_convolutional_filter(layer, i); + filters[i] = make_image(base.h, base.w, base.c); + for(j = 0; j < layer.size; ++j){ + for(k = 0; k < layer.size; ++k){ + for(c = 0; c < layer.c; ++c){ + float weight = get_pixel(filter, j, k, c); + image prev_filter = copy_image(prev_filters[c]); + scale_image(prev_filter, weight); + add_into_image(prev_filter, filters[i], 0,0); + free_image(prev_filter); + } + } } } - free_image(copy); } + return filters; +} + +image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_filters) +{ + image *single_filters = weighted_sum_filters(layer, 0); + show_images(single_filters, layer.n, window); + image delta = get_convolutional_delta(layer); image dc = collapse_image_layers(delta, 1); char buff[256]; sprintf(buff, "%s: Delta", window); - show_image(dc, buff); + //show_image(dc, buff); free_image(dc); - show_image(filters, window); - free_image(filters); + return single_filters; } diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 4e69dcfd..7404defd 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -30,7 +30,7 @@ void resize_convolutional_layer(convolutional_layer *layer, int h, int w, int c) void forward_convolutional_layer(const convolutional_layer layer, float *in); void learn_convolutional_layer(convolutional_layer layer); void update_convolutional_layer(convolutional_layer layer, float step, float momentum, float decay); -void visualize_convolutional_layer(convolutional_layer layer, char *window); +image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_filters); void backward_convolutional_layer(convolutional_layer layer, float *delta); diff --git a/src/cpu_gemm.c b/src/cpu_gemm.c new file mode 100644 index 00000000..437b39a4 --- /dev/null +++ b/src/cpu_gemm.c @@ -0,0 +1,86 @@ +#include "mini_blas.h" + +void cpu_gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc) +{ + int i,j,k; + for(i = 0; i < M; ++i){ + for(k = 0; k < K; ++k){ + register float A_PART = ALPHA*A[i*lda+k]; + for(j = 0; j < N; ++j){ + C[i*ldc+j] += A_PART*B[k*ldb+j]; + } + } + } +} + +void cpu_gemm_nt(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc) +{ + int i,j,k; + for(i = 0; i < M; ++i){ + for(j = 0; j < N; ++j){ + register float sum = 0; + for(k = 0; k < K; ++k){ + sum += ALPHA*A[i*lda+k]*B[k+j*ldb]; + } + C[i*ldc+j] += sum; + } + } +} + +void cpu_gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc) +{ + int i,j,k; + for(i = 0; i < M; ++i){ + for(k = 0; k < K; ++k){ + register float A_PART = ALPHA*A[k*lda+i]; + for(j = 0; j < N; ++j){ + C[i*ldc+j] += A_PART*B[k*ldb+j]; + } + } + } +} +void cpu_gemm_tt(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc) +{ + int i,j,k; + for(i = 0; i < M; ++i){ + for(j = 0; j < N; ++j){ + for(k = 0; k < K; ++k){ + C[i*ldc+j] += ALPHA*A[i+k*lda]*B[k+j*ldb]; + } + } + } +} + + +void cpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc) +{ + // Assume beta = 1 LULZ + if(!TA && !TB) + cpu_gemm_nn( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); + else if(TA && !TB) + cpu_gemm_tn( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); + else if(!TA && TB) + cpu_gemm_nt( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); + else + cpu_gemm_tt( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); +} diff --git a/src/gemm.cl b/src/gemm.cl new file mode 100644 index 00000000..7c868f41 --- /dev/null +++ b/src/gemm.cl @@ -0,0 +1,72 @@ + + +__kernel void gemm(int TA, int TB, int M, int N, int K, float ALPHA, + __global float *A, int lda, + __global float *B, int ldb, + float BETA, + __global float *C, int ldc) +{ + __local float Asub[BLOCK][BLOCK]; + __local float Bsub[BLOCK][BLOCK]; + + float val = 0; + + int row_block = get_group_id(0); + int col_block = get_group_id(1); + + int sub_row = get_local_id(0); + int sub_col = get_local_id(1); + + int row = row_block*BLOCK + sub_row; + int col = col_block*BLOCK + sub_col; + + int i,j; + for(i = 0; i < K; i += BLOCK){ + int arow = row_block*BLOCK + sub_row; + int acol = i + sub_col; + + int brow = i + sub_row; + int bcol = col_block*BLOCK + sub_col; + + Asub[sub_row][sub_col] = TA ? A[arow + acol*lda] : A[arow*lda + acol]; + Bsub[sub_row][sub_col] = TB ? B[brow + bcol*ldb] : B[brow*ldb + bcol]; + + barrier(CLK_LOCAL_MEM_FENCE); + + for(j = 0; j < BLOCK && i+j +#include +#include +#include +#include + +#include "opencl.h" +#include "mini_blas.h" + +#define STR_HELPER(x) #x +#define STR(x) STR_HELPER(x) + +#define BLOCK 8 + +cl_kernel get_gemm_kernel() +{ + static int init = 0; + static cl_kernel gemm_kernel; + if(!init){ + gemm_kernel = get_kernel("src/gemm.cl", "gemm", "-D BLOCK=" STR(BLOCK) ); + init = 1; + } + return gemm_kernel; +} + +void gpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc) +{ + cl_setup(); + cl_kernel gemm_kernel = get_gemm_kernel(); + cl_context context = cl.context; + cl_command_queue queue = cl.queue; + + size_t size = sizeof(float)*(TA ? lda*K:lda*M); + cl_mem A_gpu = clCreateBuffer(context, + CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, + size, A, &cl.error); + check_error(cl); + + size = sizeof(float)*(TB ? ldb*N:ldb*K); + cl_mem B_gpu = clCreateBuffer(context, + CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, + size, B, &cl.error); + check_error(cl); + + size = sizeof(float)*(ldc*M); + cl_mem C_gpu = clCreateBuffer(context, + CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, + size, C, &cl.error); + check_error(cl); + + cl_uint i = 0; + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc); + check_error(cl); + + const size_t global_size[] = {ceil((float)M/BLOCK)*BLOCK, ceil((float)N/BLOCK)*BLOCK}; + const size_t local_size[] = {BLOCK, BLOCK}; + //printf("%zd %zd %zd %zd\n", global_size[0], global_size[1], local_size[0], local_size[1]); + + clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, local_size, 0, 0, 0); + check_error(cl); + clEnqueueReadBuffer(queue, C_gpu, CL_TRUE, 0, size, C, 0, 0, 0); + check_error(cl); + + clReleaseMemObject(A_gpu); + clReleaseMemObject(B_gpu); + clReleaseMemObject(C_gpu); + +} + +/* +cl_kernel get_gemm_kernel_slow() +{ + static int init = 0; + static cl_kernel gemm_kernel; + if(!init){ + gemm_kernel = get_kernel("src/gemm.cl", "gemm_slow"); + init = 1; + } + return gemm_kernel; +} + +void gpu_gemm_slow(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc) +{ + cl_setup(); + cl_kernel gemm_kernel = get_gemm_kernel_slow(); + cl_context context = cl.context; + cl_command_queue queue = cl.queue; + + size_t size = sizeof(float)*(TA ? lda*K:lda*M); + cl_mem A_gpu = clCreateBuffer(context, + CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, + size, A, &cl.error); + check_error(cl); + + size = sizeof(float)*(TB ? ldb*N:ldb*K); + cl_mem B_gpu = clCreateBuffer(context, + CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, + size, B, &cl.error); + check_error(cl); + + size = sizeof(float)*(ldc*M); + cl_mem C_gpu = clCreateBuffer(context, + CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, + size, C, &cl.error); + check_error(cl); + + cl_uint i = 0; + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TA), (void*) &TA); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(TB), (void*) &TB); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(M), (void*) &M); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(N), (void*) &N); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(K), (void*) &K); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ALPHA), (void*) &ALPHA); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(A_gpu), (void*) &A_gpu); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(lda), (void*) &lda); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(B_gpu), (void*) &B_gpu); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldb), (void*) &ldb); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(BETA), (void*) &BETA); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(C_gpu), (void*) &C_gpu); + cl.error = clSetKernelArg(gemm_kernel, i++, sizeof(ldc), (void*) &ldc); + check_error(cl); + + const size_t global_size[] = {M, N}; + + clEnqueueNDRangeKernel(queue, gemm_kernel, 2, 0, global_size, 0, 0, 0, 0); + clEnqueueReadBuffer(queue, C_gpu, CL_TRUE, 0, size, C, 0, 0, 0); + + clReleaseMemObject(A_gpu); + clReleaseMemObject(B_gpu); + clReleaseMemObject(C_gpu); + +} +*/ diff --git a/src/image.c b/src/image.c index 24e32922..5c138d33 100644 --- a/src/image.c +++ b/src/image.c @@ -113,6 +113,7 @@ image copy_image(image p) return copy; } + void show_image(image p, char *name) { int i,j,k; @@ -152,6 +153,30 @@ void show_image(image p, char *name) cvReleaseImage(&disp); } +void save_image(image p, char *name) +{ + int i,j,k; + image copy = copy_image(p); + normalize_image(copy); + + char buff[256]; + //sprintf(buff, "%s (%d)", name, windows); + sprintf(buff, "%s.png", name); + + IplImage *disp = cvCreateImage(cvSize(p.w,p.h), IPL_DEPTH_8U, p.c); + int step = disp->widthStep; + for(i = 0; i < p.h; ++i){ + for(j = 0; j < p.w; ++j){ + for(k= 0; k < p.c; ++k){ + disp->imageData[i*step + j*p.c + k] = (unsigned char)(get_pixel(copy,i,j,k)*255); + } + } + } + free_image(copy); + cvSaveImage(buff, disp,0); + cvReleaseImage(&disp); +} + void show_image_layers(image p, char *name) { int i; @@ -227,6 +252,18 @@ image make_random_image(int h, int w, int c) return out; } +void add_into_image(image src, image dest, int h, int w) +{ + int i,j,k; + for(k = 0; k < src.c; ++k){ + for(i = 0; i < src.h; ++i){ + for(j = 0; j < src.w; ++j){ + add_pixel(dest, h+i, w+j, k, get_pixel(src, i, j, k)); + } + } + } +} + void add_scalar_image(image m, float s) { int i; @@ -404,6 +441,20 @@ image get_image_layer(image m, int l) } return out; } +image get_sub_image(image m, int h, int w, int dh, int dw) +{ + image out = make_image(dh, dw, m.c); + int i,j,k; + for(k = 0; k < out.c; ++k){ + for(i = 0; i < dh; ++i){ + for(j = 0; j < dw; ++j){ + float val = get_pixel(m, h+i, w+j, k); + set_pixel(out, i, j, k, val); + } + } + } + return out; +} float get_pixel(image m, int x, int y, int c) { @@ -595,6 +646,49 @@ void print_image(image m) printf("\n"); } +image collapse_images(image *ims, int n) +{ + int color = 1; + int border = 1; + int h,w,c; + int size = ims[0].h; + h = size; + w = (size + border) * n - border; + c = ims[0].c; + if(c != 3 || !color){ + h = (h+border)*c - border; + c = 1; + } + + image filters = make_image(h,w,c); + int i,j; + for(i = 0; i < n; ++i){ + int w_offset = i*(size+border); + image copy = copy_image(ims[i]); + normalize_image(copy); + if(c == 3 && color){ + embed_image(copy, filters, 0, w_offset); + } + else{ + for(j = 0; j < copy.c; ++j){ + int h_offset = j*(size+border); + image layer = get_image_layer(copy, j); + embed_image(layer, filters, h_offset, w_offset); + free_image(layer); + } + } + free_image(copy); + } + return filters; +} + +void show_images(image *ims, int n, char *window) +{ + image m = collapse_images(ims, n); + show_image(m, window); + free_image(m); +} + void free_image(image m) { free(m.data); diff --git a/src/image.h b/src/image.h index 9f7d74d4..9d064c36 100644 --- a/src/image.h +++ b/src/image.h @@ -21,9 +21,13 @@ void rotate_image(image m); void subtract_image(image a, image b); float avg_image_layer(image m, int l); void embed_image(image source, image dest, int h, int w); +void add_into_image(image src, image dest, int h, int w); image collapse_image_layers(image source, int border); +image get_sub_image(image m, int h, int w, int dh, int dw); void show_image(image p, char *name); +void save_image(image p, char *name); +void show_images(image *ims, int n, char *window); void show_image_layers(image p, char *name); void show_image_collapsed(image p, char *name); void print_image(image m); @@ -39,6 +43,7 @@ image ipl_to_image(IplImage* src); float get_pixel(image m, int x, int y, int c); float get_pixel_extend(image m, int x, int y, int c); +void add_pixel(image m, int x, int y, int c, float val); void set_pixel(image m, int x, int y, int c, float val); image get_image_layer(image m, int l); diff --git a/src/mini_blas.c b/src/mini_blas.c index 262798bc..bac3e226 100644 --- a/src/mini_blas.c +++ b/src/mini_blas.c @@ -3,6 +3,8 @@ #include #include #include +#include +#include "mini_blas.h" void pm(int M, int N, float *A) { @@ -17,42 +19,12 @@ void pm(int M, int N, float *A) } void gemm(int TA, int TB, int M, int N, int K, float ALPHA, - float *A, int lda, - float *B, int ldb, - float BETA, - float *C, int ldc) + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc) { - // Assume beta = 1 LULZ - int i,j,k; - if(TB && !TA){ - for(i = 0; i < M; ++i){ - for(j = 0; j < N; ++j){ - register float sum = 0; - for(k = 0; k < K; ++k){ - sum += ALPHA*A[i*lda+k]*B[k+j*ldb]; - } - C[i*ldc+j] += sum; - } - } - }else if(TA && !TB){ - for(i = 0; i < M; ++i){ - for(k = 0; k < K; ++k){ - register float A_PART = ALPHA*A[k*lda+i]; - for(j = 0; j < N; ++j){ - C[i*ldc+j] += A_PART*B[k*ldb+j]; - } - } - } - }else{ - for(i = 0; i < M; ++i){ - for(k = 0; k < K; ++k){ - register float A_PART = ALPHA*A[i*lda+k]; - for(j = 0; j < N; ++j){ - C[i*ldc+j] += A_PART*B[k*ldb+j]; - } - } - } - } + gpu_gemm( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); } void im2row(float *image, int h, int w, int c, int size, int stride, float *matrix) @@ -150,16 +122,26 @@ float *random_matrix(int rows, int cols) void time_random_matrix(int TA, int TB, int m, int k, int n) { - float *a = random_matrix(m,k); - float *b = random_matrix(k,n); + float *a; + if(!TA) a = random_matrix(m,k); + else a = random_matrix(k,m); + int lda = (!TA)?k:m; + float *b; + if(!TB) b = random_matrix(k,n); + else b = random_matrix(n,k); + int ldb = (!TB)?n:k; + float *c = random_matrix(m,n); int i; clock_t start = clock(), end; for(i = 0; i<1000; ++i){ - gemm(TA,TB,m,n,k,1,a,k,b,n,1,c,n); + cpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); } end = clock(); printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC); + free(a); + free(b); + free(c); } void test_blas() @@ -167,9 +149,97 @@ void test_blas() time_random_matrix(0,0,100,100,100); time_random_matrix(1,0,100,100,100); time_random_matrix(0,1,100,100,100); + time_random_matrix(1,1,100,100,100); - time_random_matrix(0,1,1000,100,100); + time_random_matrix(0,0,1000,100,100); time_random_matrix(1,0,1000,100,100); + time_random_matrix(0,1,1000,100,100); + time_random_matrix(1,1,1000,100,100); + } +void time_gpu_random_matrix(int TA, int TB, int m, int k, int n) +{ + float *a; + if(!TA) a = random_matrix(m,k); + else a = random_matrix(k,m); + int lda = (!TA)?k:m; + float *b; + if(!TB) b = random_matrix(k,n); + else b = random_matrix(n,k); + int ldb = (!TB)?n:k; + + float *c = random_matrix(m,n); + int i; + clock_t start = clock(), end; + for(i = 0; i<1000; ++i){ + gpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); + } + end = clock(); + printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC); + free(a); + free(b); + free(c); +} + +void test_gpu_accuracy(int TA, int TB, int m, int k, int n) +{ + srand(0); + float *a; + if(!TA) a = random_matrix(m,k); + else a = random_matrix(k,m); + int lda = (!TA)?k:m; + float *b; + if(!TB) b = random_matrix(k,n); + else b = random_matrix(n,k); + int ldb = (!TB)?n:k; + + float *c = random_matrix(m,n); + float *c_gpu = random_matrix(m,n); + memset(c, 0, m*n*sizeof(float)); + memset(c_gpu, 0, m*n*sizeof(float)); + int i; + //pm(m,k,b); + gpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n); + //pm(m, n, c_gpu); + cpu_gemm(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); + //pm(m, n, c); + double sse = 0; + for(i = 0; i < m*n; ++i) { + //printf("%f %f\n", c[i], c_gpu[i]); + sse += pow(c[i]-c_gpu[i], 2); + } + printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g MSE\n",m,k,k,n, TA, TB, sse/(m*n)); + free(a); + free(b); + free(c); +} + +void test_gpu_blas() +{ + test_gpu_accuracy(0,0,17,10,10); + test_gpu_accuracy(1,0,17,10,10); + test_gpu_accuracy(0,1,17,10,10); + test_gpu_accuracy(1,1,17,10,10); + + test_gpu_accuracy(0,0,1000,10,100); + test_gpu_accuracy(1,0,1000,10,100); + test_gpu_accuracy(0,1,1000,10,100); + test_gpu_accuracy(1,1,1000,10,100); + + time_gpu_random_matrix(0,0,1000,1000,100); + time_random_matrix(0,0,1000,1000,100); + + time_gpu_random_matrix(0,1,1000,1000,100); + time_random_matrix(0,1,1000,1000,100); + + time_gpu_random_matrix(1,0,1000,1000,100); + time_random_matrix(1,0,1000,1000,100); + + time_gpu_random_matrix(1,1,1000,1000,100); + time_random_matrix(1,1,1000,1000,100); + +} + + diff --git a/src/mini_blas.h b/src/mini_blas.h index ff82a60c..56e4fa72 100644 --- a/src/mini_blas.h +++ b/src/mini_blas.h @@ -4,6 +4,7 @@ void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float *B, int ldb, float BETA, float *C, int ldc); +float *random_matrix(int rows, int cols); void im2row(float *image, int h, int w, int c, int size, int stride, float *matrix); void im2col(float *image, int h, int w, int c, int size, int stride, float *matrix); void im2col_cpu(float* data_im, const int channels, @@ -13,3 +14,15 @@ void col2im_cpu(float* data_col, const int channels, const int height, const int width, const int ksize, const int stride, float* data_im); void test_blas(); + +void gpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc); +void cpu_gemm(int TA, int TB, int M, int N, int K, float ALPHA, + float *A, int lda, + float *B, int ldb, + float BETA, + float *C, int ldc); +void test_gpu_blas(); diff --git a/src/network.c b/src/network.c index e2c44b05..edae3c7b 100644 --- a/src/network.c +++ b/src/network.c @@ -428,13 +428,14 @@ image get_network_image(network net) void visualize_network(network net) { + image *prev = 0; int i; char buff[256]; for(i = 0; i < net.n; ++i){ sprintf(buff, "Layer %d", i); if(net.types[i] == CONVOLUTIONAL){ convolutional_layer layer = *(convolutional_layer *)net.layers[i]; - visualize_convolutional_layer(layer, buff); + prev = visualize_convolutional_layer(layer, buff, prev); } } } @@ -506,3 +507,4 @@ float network_accuracy(network net, data d) return acc; } + diff --git a/src/opencl.c b/src/opencl.c new file mode 100644 index 00000000..193fba32 --- /dev/null +++ b/src/opencl.c @@ -0,0 +1,77 @@ +#include "opencl.h" +#include +#include +#include + +cl_info cl = {0}; + +void check_error(cl_info info) +{ + if (info.error != CL_SUCCESS) { + printf("\n Error number %d", info.error); + } +} + +cl_info cl_init() +{ + cl_info info; + info.initialized = 0; + cl_uint platforms, devices; + // Fetch the Platform and Device IDs; we only want one. + info.error=clGetPlatformIDs(1, &info.platform, &platforms); + check_error(info); + info.error=clGetDeviceIDs(info.platform, CL_DEVICE_TYPE_ALL, 1, &info.device, &devices); + check_error(info); + + cl_context_properties properties[]={ + CL_CONTEXT_PLATFORM, (cl_context_properties)info.platform, + 0}; + // Note that nVidia's OpenCL requires the platform property + info.context=clCreateContext(properties, 1, &info.device, 0, 0, &info.error); + check_error(info); + info.queue = clCreateCommandQueue(info.context, info.device, 0, &info.error); + check_error(info); + info.initialized = 1; + return info; +} + +cl_program cl_fprog(char *filename, char *options, cl_info info) +{ + size_t srcsize; + char src[8192]; + memset(src, 0, 8192); + FILE *fil=fopen(filename,"r"); + srcsize=fread(src, sizeof src, 1, fil); + fclose(fil); + const char *srcptr[]={src}; + // Submit the source code of the example kernel to OpenCL + cl_program prog=clCreateProgramWithSource(info.context,1, srcptr, &srcsize, &info.error); + check_error(info); + char build_c[4096]; + // and compile it (after this we could extract the compiled version) + info.error=clBuildProgram(prog, 0, 0, options, 0, 0); + if ( info.error != CL_SUCCESS ) { + fprintf(stderr, "Error Building Program: %d\n", info.error); + clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 4096, build_c, 0); + fprintf(stderr, "Build Log for %s program:\n%s\n", filename, build_c); + } + return prog; +} + +void cl_setup() +{ + if(!cl.initialized){ + cl = cl_init(); + } +} + +cl_kernel get_kernel(char *filename, char *kernelname, char *options) +{ + cl_setup(); + cl_program prog = cl_fprog(filename, options, cl); + cl_kernel kernel=clCreateKernel(prog, kernelname, &cl.error); + check_error(cl); + return kernel; +} + + diff --git a/src/opencl.h b/src/opencl.h new file mode 100644 index 00000000..59efbae0 --- /dev/null +++ b/src/opencl.h @@ -0,0 +1,21 @@ +#ifdef __APPLE__ +#include +#else +#include +#endif + +typedef struct { + int initialized; + cl_int error; + cl_platform_id platform; + cl_device_id device; + cl_context context; + cl_command_queue queue; +}cl_info; + +extern cl_info cl; + +void cl_setup(); +void check_error(cl_info info); +cl_kernel get_kernel(char *filename, char *kernelname, char *options); + diff --git a/src/tests.c b/src/tests.c index 91217d42..5d9136de 100644 --- a/src/tests.c +++ b/src/tests.c @@ -220,6 +220,14 @@ void train_full() //lr *= .99; } } + +void test_visualize() +{ + network net = parse_network_cfg("cfg/imagenet.cfg"); + srand(2222222); + visualize_network(net); + cvWaitKey(0); +} void test_full() { network net = parse_network_cfg("cfg/backup_1300.cfg"); @@ -265,7 +273,7 @@ void test_cifar10() scale_data_rows(train, 1./255); train_network_sgd(net, train, batch, lr, momentum, decay); //printf("%5f %5f\n",(double)count*batch/train.X.rows, loss); - + float test_acc = network_accuracy(net, test); printf("%5f %5f\n",(double)count*batch/train.X.rows/5, 1-test_acc); free_data(train); @@ -316,15 +324,15 @@ void test_nist() //printf("Time: %lf seconds\n", (float)(end-start)/CLOCKS_PER_SEC); //start=end; /* - if(count%5 == 0){ - float train_acc = network_accuracy(net, train); - fprintf(stderr, "\nTRAIN: %f\n", train_acc); - float test_acc = network_accuracy(net, test); - fprintf(stderr, "TEST: %f\n\n", test_acc); - printf("%d, %f, %f\n", count, train_acc, test_acc); - //lr *= .5; + if(count%5 == 0){ + float train_acc = network_accuracy(net, train); + fprintf(stderr, "\nTRAIN: %f\n", train_acc); + float test_acc = network_accuracy(net, test); + fprintf(stderr, "TEST: %f\n\n", test_acc); + printf("%d, %f, %f\n", count, train_acc, test_acc); + //lr *= .5; } - */ + */ } } @@ -516,6 +524,48 @@ void features_VOC_image_size(char *image_path, int h, int w) cvReleaseImage(&src); } +void visualize_imagenet_features(char *filename) +{ + int i,j,k; + network net = parse_network_cfg("cfg/voc_imagenet.cfg"); + list *plist = get_paths(filename); + node *n = plist->front; + int h = voc_size(1), w = voc_size(1); + int num = get_network_image(net).c; + image *vizs = calloc(num, sizeof(image)); + for(i = 0; i < num; ++i) vizs[i] = make_image(h, w, 3); + while(n){ + char *image_path = (char *)n->val; + image im = load_image(image_path, 0, 0); + printf("Processing %dx%d image\n", im.h, im.w); + resize_network(net, im.h, im.w, im.c); + forward_network(net, im.data); + image out = get_network_image(net); + + int dh = (im.h - h)/h; + int dw = (im.w - w)/w; + for(i = 0; i < out.h; ++i){ + for(j = 0; j < out.w; ++j){ + image sub = get_sub_image(im, dh*i, dw*j, h, w); + for(k = 0; k < out.c; ++k){ + float val = get_pixel(out, i, j, k); + //printf("%f, ", val); + image sub_c = copy_image(sub); + scale_image(sub_c, val); + add_into_image(sub_c, vizs[k], 0, 0); + free_image(sub_c); + } + free_image(sub); + } + } + //printf("\n"); + show_images(vizs, 10, "IMAGENET Visualization"); + cvWaitKey(1000); + n = n->next; + } + cvWaitKey(0); +} + void features_VOC_image(char *image_file, char *image_dir, char *out_dir) { int i,j; @@ -627,6 +677,9 @@ int main(int argc, char *argv[]) //test_distribution(); //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW); + //test_blas(); + //test_visualize(); + //test_gpu_blas(); //test_blas(); //test_convolve_matrix(); // test_im2row(); @@ -638,7 +691,9 @@ int main(int argc, char *argv[]) //test_full(); //train_VOC(); //features_VOC_image(argv[1], argv[2], argv[3]); - features_VOC_image_size(argv[1], atoi(argv[2]), atoi(argv[3])); + //features_VOC_image_size(argv[1], atoi(argv[2]), atoi(argv[3])); + //visualize_imagenet_features("data/assira/train.list"); + visualize_imagenet_features("data/VOC2011.list"); fprintf(stderr, "Success!\n"); //test_random_preprocess(); //test_random_classify();