diff --git a/src/axpy.c b/src/axpy.c index 10ffca45..eddfdc6e 100644 --- a/src/axpy.c +++ b/src/axpy.c @@ -64,6 +64,11 @@ cl_kernel get_scal_kernel() void axpy_ongpu(int N, float ALPHA, cl_mem X, int INCX, cl_mem Y, int INCY) +{ + axpy_ongpu_offset(N,ALPHA,X,0,INCX,Y,0,INCY); +} + +void axpy_ongpu_offset(int N, float ALPHA, cl_mem X, int OFFX, int INCX, cl_mem Y, int OFFY, int INCY) { cl_setup(); cl_kernel kernel = get_axpy_kernel(); @@ -73,8 +78,10 @@ void axpy_ongpu(int N, float ALPHA, cl_mem X, int INCX, cl_mem Y, int INCY) cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N); cl.error = clSetKernelArg(kernel, i++, sizeof(ALPHA), (void*) &ALPHA); cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X); + cl.error = clSetKernelArg(kernel, i++, sizeof(OFFX), (void*) &OFFX); cl.error = clSetKernelArg(kernel, i++, sizeof(INCX), (void*) &INCX); cl.error = clSetKernelArg(kernel, i++, sizeof(Y), (void*) &Y); + cl.error = clSetKernelArg(kernel, i++, sizeof(OFFY), (void*) &OFFY); cl.error = clSetKernelArg(kernel, i++, sizeof(INCY), (void*) &INCY); check_error(cl); @@ -85,6 +92,10 @@ void axpy_ongpu(int N, float ALPHA, cl_mem X, int INCX, cl_mem Y, int INCY) } void copy_ongpu(int N, cl_mem X, int INCX, cl_mem Y, int INCY) +{ + copy_ongpu_offset(N,X,0,INCX,Y,0,INCY); +} +void copy_ongpu_offset(int N, cl_mem X, int OFFX, int INCX, cl_mem Y, int OFFY, int INCY) { cl_setup(); cl_kernel kernel = get_copy_kernel(); @@ -93,8 +104,10 @@ void copy_ongpu(int N, cl_mem X, int INCX, cl_mem Y, int INCY) cl_uint i = 0; cl.error = clSetKernelArg(kernel, i++, sizeof(N), (void*) &N); cl.error = clSetKernelArg(kernel, i++, sizeof(X), (void*) &X); + cl.error = clSetKernelArg(kernel, i++, sizeof(OFFX), (void*) &OFFX); cl.error = clSetKernelArg(kernel, i++, sizeof(INCX), (void*) &INCX); cl.error = clSetKernelArg(kernel, i++, sizeof(Y), (void*) &Y); + cl.error = clSetKernelArg(kernel, i++, sizeof(OFFY), (void*) &OFFY); cl.error = clSetKernelArg(kernel, i++, sizeof(INCY), (void*) &INCY); check_error(cl); diff --git a/src/axpy.cl b/src/axpy.cl index 394d8976..901a8266 100644 --- a/src/axpy.cl +++ b/src/axpy.cl @@ -1,7 +1,7 @@ -__kernel void axpy(int N, float ALPHA, __global float *X, int INCX, __global float *Y, int INCY) +__kernel void axpy(int N, float ALPHA, __global float *X, int OFFX, int INCX, __global float *Y, int OFFY, int INCY) { int i = get_global_id(0); - Y[i*INCY] += ALPHA*X[i*INCX]; + Y[OFFY+i*INCY] += ALPHA*X[OFFX+i*INCX]; } __kernel void scal(int N, float ALPHA, __global float *X, int INCX) @@ -10,9 +10,9 @@ __kernel void scal(int N, float ALPHA, __global float *X, int INCX) X[i*INCX] *= ALPHA; } -__kernel void copy(int N, __global float *X, int INCX, __global float *Y, int INCY) +__kernel void copy(int N, __global float *X, int OFFX, int INCX, __global float *Y, int OFFY, int INCY) { int i = get_global_id(0); - Y[i*INCY] = X[i*INCX]; + Y[i*INCY + OFFY] = X[i*INCX + OFFX]; } diff --git a/src/cnn.c b/src/cnn.c index 9e9e62b4..de37bc31 100644 --- a/src/cnn.c +++ b/src/cnn.c @@ -308,10 +308,10 @@ void train_assira() void train_imagenet() { - network net = parse_network_cfg("/home/pjreddie/imagenet_backup/imagenet_backup_slower_larger_870.cfg"); + network net = parse_network_cfg("cfg/imagenet_backup_slowest_2340.cfg"); printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); int imgs = 1000/net.batch+1; - srand(986987); + srand(6472345); int i = 0; char **labels = get_labels("/home/pjreddie/data/imagenet/cls.labels.list"); list *plist = get_paths("/data/imagenet/cls.train.list"); @@ -332,7 +332,7 @@ void train_imagenet() free_data(train); if(i%10==0){ char buff[256]; - sprintf(buff, "/home/pjreddie/imagenet_backup/imagenet_backup_larger_%d.cfg", i); + sprintf(buff, "/home/pjreddie/imagenet_backup/imagenet_small_%d.cfg", i); save_network(net, buff); } } @@ -397,7 +397,7 @@ void test_imagenet() void test_visualize() { - network net = parse_network_cfg("cfg/imagenet_test.cfg"); + network net = parse_network_cfg("cfg/imagenet.cfg"); visualize_network(net); cvWaitKey(0); } @@ -991,7 +991,7 @@ void test_gpu_net() translate_data_rows(train, -144); translate_data_rows(test, -144); int count = 0; - int iters = 10000/net.batch; + int iters = 1000/net.batch; while(++count <= 5){ clock_t start = clock(), end; float loss = train_network_sgd(net, train, iters); @@ -999,6 +999,7 @@ void test_gpu_net() float test_acc = network_accuracy(net, test); printf("%d: Loss: %f, Test Acc: %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", count, loss, test_acc,(float)(end-start)/CLOCKS_PER_SEC, net.learning_rate, net.momentum, net.decay); } + #ifdef GPU count = 0; srand(222222); net = parse_network_cfg("cfg/nist.cfg"); @@ -1009,6 +1010,7 @@ void test_gpu_net() float test_acc = network_accuracy(net, test); printf("%d: Loss: %f, Test Acc: %f, Time: %lf seconds, LR: %f, Momentum: %f, Decay: %f\n", count, loss, test_acc,(float)(end-start)/CLOCKS_PER_SEC, net.learning_rate, net.momentum, net.decay); } + #endif } @@ -1020,13 +1022,12 @@ int main(int argc, char *argv[]) } if(0==strcmp(argv[1], "train")) train_imagenet(); else if(0==strcmp(argv[1], "train_small")) train_imagenet_small(); + else if(0==strcmp(argv[1], "test_correct")) test_gpu_net(); + else if(0==strcmp(argv[1], "test")) test_imagenet(); + else if(0==strcmp(argv[1], "visualize")) test_visualize(); + #ifdef GPU else if(0==strcmp(argv[1], "test_gpu")) test_gpu_blas(); - else if(0==strcmp(argv[1], "test")) test_gpu_net(); - //test_gpu_blas(); - //train_imagenet_small(); - //test_imagenet(); - //train_nist(); - //test_visualize(); + #endif fprintf(stderr, "Success!\n"); return 0; } diff --git a/src/connected_layer.c b/src/connected_layer.c index dba0b2ac..ac4c4179 100644 --- a/src/connected_layer.c +++ b/src/connected_layer.c @@ -135,9 +135,7 @@ void forward_connected_layer_gpu(connected_layer layer, cl_mem input) { int i; for(i = 0; i < layer.batch; ++i){ - cl_mem sub = cl_sub_array(layer.output_cl, i*layer.outputs, layer.outputs); - copy_ongpu(layer.outputs, layer.biases_cl, 1, sub, 1); - clReleaseMemObject(sub); + copy_ongpu_offset(layer.outputs, layer.biases_cl, 0, 1, layer.output_cl, i*layer.outputs, 1); } int m = layer.batch; int k = layer.inputs; @@ -154,9 +152,7 @@ void backward_connected_layer_gpu(connected_layer layer, cl_mem input, cl_mem de int i; gradient_array_ongpu(layer.output_cl, layer.outputs*layer.batch, layer.activation, layer.delta_cl); for(i = 0; i < layer.batch; ++i){ - cl_mem sub = cl_sub_array(layer.delta_cl, i*layer.outputs, layer.outputs); - axpy_ongpu(layer.outputs, 1, sub, 1, layer.bias_updates_cl, 1); - clReleaseMemObject(sub); + axpy_ongpu_offset(layer.outputs, 1, layer.delta_cl, i*layer.outputs, 1, layer.bias_updates_cl, 0, 1); } int m = layer.inputs; int k = layer.batch; diff --git a/src/im2col.c b/src/im2col.c index b743e342..bfaa54cd 100644 --- a/src/im2col.c +++ b/src/im2col.c @@ -51,12 +51,23 @@ void im2col_cpu(float* data_im, int batch, #include "opencl.h" #include -cl_kernel get_im2col_kernel() +cl_kernel get_im2col_pad_kernel() { static int init = 0; static cl_kernel im2col_kernel; if(!init){ - im2col_kernel = get_kernel("src/im2col.cl", "im2col", 0); + im2col_kernel = get_kernel("src/im2col.cl", "im2col_pad", 0); + init = 1; + } + return im2col_kernel; +} + +cl_kernel get_im2col_nopad_kernel() +{ + static int init = 0; + static cl_kernel im2col_kernel; + if(!init){ + im2col_kernel = get_kernel("src/im2col.cl", "im2col_nopad", 0); init = 1; } return im2col_kernel; @@ -68,32 +79,34 @@ void im2col_ongpu(cl_mem data_im, int batch, int ksize, int stride, int pad, cl_mem data_col) { cl_setup(); - cl_kernel im2col_kernel = get_im2col_kernel(); - cl_command_queue queue = cl.queue; - - cl_uint i = 0; - cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(data_im), (void*) &data_im); - cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(batch), (void*) &batch); - cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(channels), (void*) &channels); - cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(height), (void*) &height); - cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(width), (void*) &width); - cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(ksize), (void*) &ksize); - cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(stride), (void*) &stride); - cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(pad), (void*) &pad); - cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(data_col), (void*) &data_col); - check_error(cl); int height_col = (height - ksize) / stride + 1; int width_col = (width - ksize) / stride + 1; int channels_col = channels * ksize * ksize; + cl_kernel kernel = get_im2col_nopad_kernel(); + if (pad){ height_col = 1 + (height-1) / stride; width_col = 1 + (width-1) / stride; + kernel = get_im2col_pad_kernel(); } + cl_command_queue queue = cl.queue; + + cl_uint i = 0; + cl.error = clSetKernelArg(kernel, i++, sizeof(data_im), (void*) &data_im); + cl.error = clSetKernelArg(kernel, i++, sizeof(batch), (void*) &batch); + cl.error = clSetKernelArg(kernel, i++, sizeof(channels), (void*) &channels); + cl.error = clSetKernelArg(kernel, i++, sizeof(height), (void*) &height); + cl.error = clSetKernelArg(kernel, i++, sizeof(width), (void*) &width); + cl.error = clSetKernelArg(kernel, i++, sizeof(ksize), (void*) &ksize); + cl.error = clSetKernelArg(kernel, i++, sizeof(stride), (void*) &stride); + cl.error = clSetKernelArg(kernel, i++, sizeof(data_col), (void*) &data_col); + check_error(cl); + size_t global_size = batch*channels_col*height_col*width_col; - clEnqueueNDRangeKernel(queue, im2col_kernel, 1, 0, + clEnqueueNDRangeKernel(queue, kernel, 1, 0, &global_size, 0, 0, 0, 0); check_error(cl); } diff --git a/src/im2col.cl b/src/im2col.cl index 8169e1aa..e00e8f55 100644 --- a/src/im2col.cl +++ b/src/im2col.cl @@ -1,28 +1,17 @@ -float im2col_get_pixel(__global float *im, int height, int width, int channels, - int batch, int row, int col, int channel, int pad) -{ - row -= pad; - col -= pad; - if (row < 0 || col < 0 || row >= height || col >= width) return 0; - int index = col + width*(row + height*(channel+batch*channels)); - return im[index]; -} - -__kernel void im2col(__global float *data_im, int batch, +__kernel void im2col_pad(__global float *im, int batch, int channels, int height, int width, - int ksize, int stride, int pad, __global float *data_col) + int ksize, int stride, __global float *data_col) { int c,h,w,b; - int height_col = (height - ksize) / stride + 1; - int width_col = (width - ksize) / stride + 1; + int height_col = 1 + (height-1) / stride; + int width_col = 1 + (width-1) / stride; int channels_col = channels * ksize * ksize; - if (pad){ - height_col = 1 + (height-1) / stride; - width_col = 1 + (width-1) / stride; - pad = ksize/2; - } + + int pad = ksize/2; + int id = get_global_id(0); + int col_index = id; w = id % width_col; id /= width_col; h = id % height_col; @@ -35,9 +24,45 @@ __kernel void im2col(__global float *data_im, int batch, int col_size = height_col*width_col*channels_col; int w_offset = c % ksize; int h_offset = (c / ksize) % ksize; - int c_im = c / ksize / ksize; + int im_channel = c / ksize / ksize; + int im_row = h_offset + h * stride - pad; + int im_col = w_offset + w * stride - pad; + + int im_index = im_col + width*(im_row + height*(im_channel+batch*channels)); + float val = (im_row < 0 || im_col < 0 || im_row >= height || im_col >= width) ? 0 : im[im_index]; + + data_col[col_index] = val; +} + +__kernel void im2col_nopad(__global float *im, int batch, + int channels, int height, int width, + int ksize, int stride, __global float *data_col) +{ + int c,h,w,b; + int height_col = (height - ksize) / stride + 1; + int width_col = (width - ksize) / stride + 1; + int channels_col = channels * ksize * ksize; + + int id = get_global_id(0); + int col_index = id; + w = id % width_col; + id /= width_col; + h = id % height_col; + id /= height_col; + c = id % channels_col; + id /= channels_col; + b = id % batch; + id /= batch; + + int col_size = height_col*width_col*channels_col; + int w_offset = c % ksize; + int h_offset = (c / ksize) % ksize; + int im_channel = c / ksize / ksize; int im_row = h_offset + h * stride; int im_col = w_offset + w * stride; - int col_index = (c * height_col + h) * width_col + w + b*col_size; - data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, b, im_row, im_col, c_im, pad); + + int im_index = im_col + width*(im_row + height*(im_channel+batch*channels)); + float val = (im_row < 0 || im_col < 0 || im_row >= height || im_col >= width) ? 0 : im[im_index]; + + data_col[col_index] = val; } diff --git a/src/image.c b/src/image.c index da8b54a4..bf34e095 100644 --- a/src/image.c +++ b/src/image.c @@ -738,7 +738,7 @@ image collapse_images_horz(image *ims, int n) void show_images(image *ims, int n, char *window) { image m = collapse_images_vert(ims, n); - //save_image(m, window); + save_image(m, window); show_image(m, window); free_image(m); } diff --git a/src/mini_blas.h b/src/mini_blas.h index 5d5e715e..07b7cc60 100644 --- a/src/mini_blas.h +++ b/src/mini_blas.h @@ -11,7 +11,9 @@ void time_random_matrix(int TA, int TB, int m, int k, int n); #ifdef GPU void axpy_ongpu(int N, float ALPHA, cl_mem X, int INCX, cl_mem Y, int INCY); +void axpy_ongpu_offset(int N, float ALPHA, cl_mem X, int OFFX, int INCX, cl_mem Y, int OFFY, int INCY); void copy_ongpu(int N, cl_mem X, int INCX, cl_mem Y, int INCY); +void copy_ongpu_offset(int N, cl_mem X, int OFFX, int INCX, cl_mem Y, int OFFY, int INCY); void scal_ongpu(int N, float ALPHA, cl_mem X, int INCX); void im2col_ongpu(cl_mem data_im, int batch, int channels, int height, int width, diff --git a/src/network.c b/src/network.c index 69942e86..0a72a196 100644 --- a/src/network.c +++ b/src/network.c @@ -38,7 +38,7 @@ void forward_network_gpu(network net, cl_mem input, cl_mem truth, int train) //printf("start\n"); int i; for(i = 0; i < net.n; ++i){ - //clock_t time = clock(); + clock_t time = clock(); if(net.types[i] == CONVOLUTIONAL){ convolutional_layer layer = *(convolutional_layer *)net.layers[i]; forward_convolutional_layer_gpu(layer, input); @@ -63,7 +63,7 @@ void forward_network_gpu(network net, cl_mem input, cl_mem truth, int train) forward_softmax_layer_gpu(layer, input); input = layer.output_cl; } - //printf("%d %f\n", i, sec(clock()-time)); + printf("%d %f\n", i, sec(clock()-time)); /* else if(net.types[i] == CROP){ crop_layer layer = *(crop_layer *)net.layers[i]; @@ -85,6 +85,7 @@ void backward_network_gpu(network net, cl_mem input) cl_mem prev_input; cl_mem prev_delta; for(i = net.n-1; i >= 0; --i){ + clock_t time = clock(); if(i == 0){ prev_input = input; prev_delta = 0; @@ -112,6 +113,7 @@ void backward_network_gpu(network net, cl_mem input) softmax_layer layer = *(softmax_layer *)net.layers[i]; backward_softmax_layer_gpu(layer, prev_delta); } + printf("back: %d %f\n", i, sec(clock()-time)); } }