Trying some stuff w/ dropout

This commit is contained in:
Joseph Redmon 2014-08-27 19:11:46 -07:00
parent 176d65b765
commit 76ee68f96d
18 changed files with 550 additions and 200 deletions

View File

@@ -41,6 +41,12 @@ float relu_activate(float x){return x*(x>0);}
float ramp_activate(float x){return x*(x>0)+.1*x;}
float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);}
float linear_gradient(float x){return 1;}
float sigmoid_gradient(float x){return (1-x)*x;}
float relu_gradient(float x){return (x>0);}
float ramp_gradient(float x){return (x>0)+.1;}
float tanh_gradient(float x){return 1-x*x;}
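/* Note: these gradients take the *activated output* y, not the raw input x:
 * if y = sigmoid(x) then dy/dx = y*(1-y), and if y = tanh(x) then
 * dy/dx = 1 - y*y, so a layer only has to keep its outputs around. */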
float activate(float x, ACTIVATION a)
{
switch(a){
@@ -66,19 +72,19 @@ void activate_array(float *x, const int n, const ACTIVATION a)
}
}
float gradient(float x, ACTIVATION a){
float gradient(float x, ACTIVATION a)
{
switch(a){
case LINEAR:
return 1;
return linear_gradient(x);
case SIGMOID:
return (1.-x)*x;
return sigmoid_gradient(x);
case RELU:
return (x>0);
return relu_gradient(x);
case RAMP:
return (x>0) + .1;
return ramp_gradient(x);
case TANH:
return 1-x*x;
return tanh_gradient(x);
}
return 0;
}
@@ -107,7 +113,6 @@ cl_kernel get_activation_kernel()
return kernel;
}
void activate_array_ongpu(cl_mem x, int n, ACTIVATION a)
{
cl_setup();
@@ -125,4 +130,34 @@ void activate_array_ongpu(cl_mem x, int n, ACTIVATION a)
clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0);
check_error(cl);
}
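/* Each get_*_kernel() below compiles its .cl source once and caches the
 * resulting cl_kernel in a function-local static; this lazy initialization
 * is not thread-safe. */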
cl_kernel get_gradient_kernel()
{
static int init = 0;
static cl_kernel kernel;
if(!init){
kernel = get_kernel("src/activations.cl", "gradient_array", 0);
init = 1;
}
return kernel;
}
void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta)
{
cl_setup();
cl_kernel kernel = get_gradient_kernel();
cl_command_queue queue = cl.queue;
cl_uint i = 0;
cl.error = clSetKernelArg(kernel, i++, sizeof(x), (void*) &x);
cl.error = clSetKernelArg(kernel, i++, sizeof(n), (void*) &n);
cl.error = clSetKernelArg(kernel, i++, sizeof(a), (void*) &a);
cl.error = clSetKernelArg(kernel, i++, sizeof(delta), (void*) &delta);
check_error(cl);
size_t gsize = n;
clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0);
check_error(cl);
}
#endif

View File

@@ -8,6 +8,12 @@ float relu_activate(float x){return x*(x>0);}
float ramp_activate(float x){return x*(x>0)+.1*x;}
float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);}
float linear_gradient(float x){return 1;}
float sigmoid_gradient(float x){return (1-x)*x;}
float relu_gradient(float x){return (x>0);}
float ramp_gradient(float x){return (x>0)+.1;}
float tanh_gradient(float x){return 1-x*x;}
float activate(float x, ACTIVATION a)
{
switch(a){
@@ -25,9 +31,32 @@ float activate(float x, ACTIVATION a)
return 0;
}
__kernel void activate_array(__global float *x,
const int n, const ACTIVATION a)
float gradient(float x, ACTIVATION a)
{
switch(a){
case LINEAR:
return linear_gradient(x);
case SIGMOID:
return sigmoid_gradient(x);
case RELU:
return relu_gradient(x);
case RAMP:
return ramp_gradient(x);
case TANH:
return tanh_gradient(x);
}
return 0;
}
__kernel void activate_array(__global float *x, int n, ACTIVATION a)
{
int i = get_global_id(0);
x[i] = activate(x[i], a);
}
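// In-place chain rule: delta arrives holding dL/dy and leaves holding
// dL/dx = dL/dy * f'(x), with f' evaluated from the stored output.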
__kernel void gradient_array(__global float *x, int n, ACTIVATION a, __global float *delta)
{
int i = get_global_id(0);
delta[i] *= gradient(x[i], a);
}

View File

@@ -14,7 +14,9 @@ float gradient(float x, ACTIVATION a);
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
void activate_array(float *x, const int n, const ACTIVATION a);
#ifdef GPU
cl_kernel get_activation_kernel();
void activate_array_ongpu(cl_mem x, int n, ACTIVATION a);
void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta);
#endif
#endif

View File

@@ -32,6 +32,51 @@ void test_convolve()
show_image_layers(edge, "Test Convolve");
}
#ifdef GPU
void test_convolutional_layer()
{
int i;
image dog = load_image("data/dog.jpg",256,256);
network net = parse_network_cfg("cfg/convolutional.cfg");
// data test = load_cifar10_data("data/cifar10/test_batch.bin");
// float *X = calloc(net.batch*test.X.cols, sizeof(float));
// float *y = calloc(net.batch*test.y.cols, sizeof(float));
int in_size = get_network_input_size(net)*net.batch;
int size = get_network_output_size(net)*net.batch;
float *X = calloc(in_size, sizeof(float));
for(i = 0; i < in_size; ++i){
X[i] = dog.data[i%get_network_input_size(net)];
}
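// The same image fills every slot of the batch so the CPU and GPU forward
// passes see identical input.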
// get_batch(test, net.batch, X, y);
clock_t start, end;
cl_mem input_cl = cl_make_array(X, in_size);
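// Run one untimed GPU pass first so lazy kernel compilation and buffer
// setup don't pollute the timing below.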
forward_network_gpu(net, input_cl, 1);
start = clock();
forward_network_gpu(net, input_cl, 1);
end = clock();
float gpu_sec = (float)(end-start)/CLOCKS_PER_SEC;
float *gpu_out = calloc(size, sizeof(float));
memcpy(gpu_out, get_network_output(net), size*sizeof(float));
start = clock();
forward_network(net, X, 1);
end = clock();
float cpu_sec = (float)(end-start)/CLOCKS_PER_SEC;
float *cpu_out = calloc(size, sizeof(float));
memcpy(cpu_out, get_network_output(net), size*sizeof(float));
float sum = 0;
for(i = 0; i < size; ++i) {
//printf("%f, %f\n", gpu_out[i], cpu_out[i]);
sum += pow(gpu_out[i] - cpu_out[i], 2);
}
printf("gpu: %f sec, cpu: %f sec, diff: %f, size: %d\n", gpu_sec, cpu_sec, sum, size);
}
#endif
void test_convolve_matrix()
{
image dog = load_image("dog.jpg",300,400);
@@ -325,7 +370,7 @@ void test_nist()
void train_nist()
{
srand(222222);
network net = parse_network_cfg("cfg/nist_final.cfg");
network net = parse_network_cfg("cfg/nist.cfg");
data train = load_categorical_data_csv("data/mnist/mnist_train.csv", 0, 10);
data test = load_categorical_data_csv("data/mnist/mnist_test.csv",0,10);
translate_data_rows(train, -144);
@@ -349,7 +394,7 @@ void train_nist()
mean_array(get_network_output_layer(net,3), 100),
mean_array(get_network_output_layer(net,4), 100));
*/
save_network(net, "cfg/nist_final2.cfg");
//save_network(net, "cfg/nist_final2.cfg");
//printf("%5d Training Loss: %lf, Params: %f %f %f, ",count*1000, loss, lr, momentum, decay);
//end = clock();
@@ -798,7 +843,7 @@ int main(int argc, char *argv[])
{
//train_full();
//test_distribution();
feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
//feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
//test_blas();
//test_visualize();
@@ -809,7 +854,9 @@
//test_split();
//test_ensemble();
//test_nist_single();
test_nist();
//test_nist();
train_nist();
//test_convolutional_layer();
//test_cifar10();
//train_cifar10();
//test_vince();

View File

@@ -1,21 +1,21 @@
#include <stdio.h>
#include <math.h>
inline void col2im_add_pixel(float *im, int height, int width, int channels,
int row, int col, int channel, int pad, float val)
int b, int row, int col, int channel, int pad, float val)
{
row -= pad;
col -= pad;
if (row < 0 || col < 0 ||
row >= height || col >= width) return;
im[col + width*(row + channel*height)] += val;
im[col + width*(row + height*(channel+b*channels))] += val;
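// Batched NCHW layout: offset = ((b*channels + channel)*height + row)*width + col.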
}
//This one might be from Caffe too, can't remember.
void col2im_cpu(float* data_col,
const int channels, const int height, const int width,
const int ksize, const int stride, int pad, float* data_im)
void col2im_cpu(float* data_col, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_im)
{
int c,h,w;
int b,c,h,w;
int height_col = (height - ksize) / stride + 1;
int width_col = (width - ksize) / stride + 1;
if (pad){
@@ -24,20 +24,67 @@ void col2im_cpu(
pad = ksize/2;
}
int channels_col = channels * ksize * ksize;
for (c = 0; c < channels_col; ++c) {
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
for (h = 0; h < height_col; ++h) {
for (w = 0; w < width_col; ++w) {
int im_row = h_offset + h * stride;
int im_col = w_offset + w * stride;
double val = data_col[(c * height_col + h) * width_col + w];
col2im_add_pixel(data_im, height, width, channels,
im_row, im_col, c_im, pad, val);
int col_size = height_col*width_col*channels_col;
for(b = 0; b < batch; ++b){
for (c = 0; c < channels_col; ++c) {
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
for (h = 0; h < height_col; ++h) {
for (w = 0; w < width_col; ++w) {
int im_row = h_offset + h * stride;
int im_col = w_offset + w * stride;
int col_index = (c * height_col + h) * width_col + w + b*col_size;
double val = data_col[col_index];
col2im_add_pixel(data_im, height, width, channels,
b, im_row, im_col, c_im, pad, val);
}
}
}
}
}
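/* col2im is the adjoint of im2col: im2col copies each image pixel into every
 * column entry whose window covers it, so col2im must accumulate (+=) each
 * column entry back into its source pixel. The convolutional backward pass
 * uses it to push the column-space gradient back into image space. */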
#ifdef GPU
#include "opencl.h"
cl_kernel get_col2im_kernel()
{
static int init = 0;
static cl_kernel im2col_kernel;
if(!init){
im2col_kernel = get_kernel("src/col2im.cl", "col2im", 0);
init = 1;
}
return im2col_kernel;
}
void col2im_ongpu(cl_mem data_col, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, cl_mem data_im)
{
cl_setup();
cl_kernel kernel = get_col2im_kernel();
cl_command_queue queue = cl.queue;
cl_uint i = 0;
cl.error = clSetKernelArg(kernel, i++, sizeof(data_col), (void*) &data_col);
cl.error = clSetKernelArg(kernel, i++, sizeof(batch), (void*) &batch);
cl.error = clSetKernelArg(kernel, i++, sizeof(channels), (void*) &channels);
cl.error = clSetKernelArg(kernel, i++, sizeof(height), (void*) &height);
cl.error = clSetKernelArg(kernel, i++, sizeof(width), (void*) &width);
cl.error = clSetKernelArg(kernel, i++, sizeof(ksize), (void*) &ksize);
cl.error = clSetKernelArg(kernel, i++, sizeof(stride), (void*) &stride);
cl.error = clSetKernelArg(kernel, i++, sizeof(pad), (void*) &pad);
cl.error = clSetKernelArg(kernel, i++, sizeof(data_im), (void*) &data_im);
check_error(cl);
size_t global_size = channels*height*width*batch; // one work-item per image pixel
clEnqueueNDRangeKernel(queue, kernel, 1, 0,
&global_size, 0, 0, 0, 0);
check_error(cl);
}
#endif

View File

@@ -0,0 +1,41 @@
/* Reconstructed gather-style col2im (the committed kernel body was left
 * unfinished): one work-item per image pixel gathers every column entry
 * that im2col copied out of that pixel and accumulates the sum back into
 * data_im, matching the += semantics of col2im_cpu. */
__kernel void col2im(__global float *data_col, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, __global float *data_im)
{
int id = get_global_id(0);
int index = id;
int w = id%width;
id /= width;
int h = id%height;
id /= height;
int c = id%channels;
id /= channels;
int b = id%batch;
int height_col = (height - ksize) / stride + 1;
int width_col = (width - ksize) / stride + 1;
if (pad){
height_col = 1 + (height-1) / stride;
width_col = 1 + (width-1) / stride;
pad = ksize/2;
}
int col_size = height_col*width_col*channels*ksize*ksize;
// Work in padded coordinates and find the range of output positions
// whose ksize x ksize window covers this pixel.
int h_pad = h + pad;
int w_pad = w + pad;
int h_start = (h_pad < ksize) ? 0 : (h_pad - ksize)/stride + 1;
int h_end = min(h_pad/stride + 1, height_col);
int w_start = (w_pad < ksize) ? 0 : (w_pad - ksize)/stride + 1;
int w_end = min(w_pad/stride + 1, width_col);
float sum = 0;
int h_col, w_col;
for(h_col = h_start; h_col < h_end; ++h_col){
for(w_col = w_start; w_col < w_end; ++w_col){
// Row of the column matrix that reads pixel (h,w) with kernel offset
// (h_pad - h_col*stride, w_pad - w_col*stride).
int c_col = c*ksize*ksize + (h_pad - h_col*stride)*ksize + (w_pad - w_col*stride);
sum += data_col[(c_col*height_col + h_col)*width_col + w_col + b*col_size];
}
}
data_im[index] += sum;
}

View File

@@ -147,15 +147,9 @@ void forward_convolutional_layer(const convolutional_layer layer, float *in)
for(i = 0; i < layer.batch; ++i){
gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
c += n*m;
in += layer.h*layer.w*layer.c;
b += k*n;
c += n*m;
}
/*
int i;
for(i = 0; i < m*n; ++i) printf("%f, ", layer.output[i]);
printf("\n");
*/
activate_array(layer.output, m*n*layer.batch, layer.activation);
}
@@ -205,10 +199,10 @@ void backward_convolutional_layer(convolutional_layer layer, float *delta)
for(i = 0; i < layer.batch; ++i){
gemm(1,0,m,n,k,1,a,m,b,n,0,c,n);
col2im_cpu(c, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta);
c += k*n;
delta += layer.h*layer.w*layer.c;
b += k*n;
c += m*n;
}
col2im_cpu(layer.col_image, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta);
}
}
@@ -278,22 +272,140 @@ image *visualize_convolutional_layer(convolutional_layer layer, char *window, im
}
#ifdef GPU
cl_kernel get_convolutional_learn_bias_kernel()
{
static int init = 0;
static cl_kernel kernel;
if(!init){
kernel = get_kernel("src/convolutional_layer.cl", "learn_bias", 0);
init = 1;
}
return kernel;
}
void learn_bias_convolutional_layer_ongpu(convolutional_layer layer)
{
int size = convolutional_out_height(layer) * convolutional_out_width(layer);
cl_setup();
cl_kernel kernel = get_convolutional_learn_bias_kernel();
cl_command_queue queue = cl.queue;
cl_uint i = 0;
cl.error = clSetKernelArg(kernel, i++, sizeof(layer.batch), (void*) &layer.batch);
cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n);
cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size);
cl.error = clSetKernelArg(kernel, i++, sizeof(layer.delta_cl), (void*) &layer.delta_cl);
cl.error = clSetKernelArg(kernel, i++, sizeof(layer.bias_updates_cl), (void*) &layer.bias_updates_cl);
check_error(cl);
const size_t global_size[] = {layer.n};
clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0);
check_error(cl);
}
cl_kernel get_convolutional_bias_kernel()
{
static int init = 0;
static cl_kernel kernel;
if(!init){
kernel = get_kernel("src/convolutional_layer.cl", "bias", 0);
init = 1;
}
return kernel;
}
void bias_output_gpu(const convolutional_layer layer)
{
int out_h = convolutional_out_height(layer);
int out_w = convolutional_out_width(layer);
int size = out_h*out_w;
cl_setup();
cl_kernel kernel = get_convolutional_bias_kernel();
cl_command_queue queue = cl.queue;
cl_uint i = 0;
cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n);
cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size);
cl.error = clSetKernelArg(kernel, i++, sizeof(layer.biases_cl), (void*) &layer.biases_cl);
cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl);
check_error(cl);
const size_t global_size[] = {layer.batch, layer.n*size};
clEnqueueNDRangeKernel(queue, kernel, 2, 0, global_size, 0, 0, 0, 0);
check_error(cl);
}
void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in)
{
int i;
int m = layer.n;
int k = layer.size*layer.size*layer.c;
int n = convolutional_out_height(layer)*
convolutional_out_width(layer)*
layer.batch;
convolutional_out_width(layer);
cl_write_array(layer.filters_cl, layer.filters, m*k);
cl_mem a = layer.filters_cl;
cl_mem b = layer.col_image_cl;
cl_mem c = layer.output_cl;
im2col_ongpu(in, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, b);
gemm_ongpu(0,0,m,n,k,1,a,k,b,n,0,c,n);
activate_array_ongpu(layer.output_cl, m*n, layer.activation);
cl_read_array(layer.output_cl, layer.output, m*n);
//cl_write_array(layer.filters_cl, layer.filters, m*k);
//cl_write_array(layer.biases_cl, layer.biases, m);
bias_output_gpu(layer);
im2col_ongpu(in, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_cl);
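// im2col + GEMM convolution: the filters form an m x k matrix and each
// image's column matrix is k x n, so every batch item is one m x n GEMM.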
for(i = 0; i < layer.batch; ++i){
cl_mem a = layer.filters_cl;
cl_mem b = cl_sub_array(layer.col_image_cl, i*k*n, k*n);
cl_mem c = cl_sub_array(layer.output_cl, i*m*n, m*n);
gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c,n);
clReleaseMemObject(b);
clReleaseMemObject(c);
}
activate_array_ongpu(layer.output_cl, m*n*layer.batch, layer.activation);
cl_read_array(layer.output_cl, layer.output, m*n*layer.batch);
}
void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl)
{
int i;
int m = layer.n;
int n = layer.size*layer.size*layer.c;
int k = convolutional_out_height(layer)*
convolutional_out_width(layer);
gradient_array_ongpu(layer.output_cl, m*k*layer.batch, layer.activation, layer.delta_cl);
learn_bias_convolutional_layer_ongpu(layer);
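// Weight gradient: dL/dW = dL/dY * columns^T, accumulated over the batch
// by the BETA=1 GEMMs below.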
for(i = 0; i < layer.batch; ++i){
cl_mem a = cl_sub_array(layer.delta_cl,i*m*k, m*k);
cl_mem b = cl_sub_array(layer.col_image_cl,i*k*n, k*n);
cl_mem c = layer.filter_updates_cl;
gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
clReleaseMemObject(a);
clReleaseMemObject(b);
}
cl_read_array(layer.filter_updates_cl, layer.filter_updates, m*n);
cl_read_array(layer.bias_updates_cl, layer.bias_updates, m);
if(delta_cl){
m = layer.size*layer.size*layer.c;
k = layer.n;
n = convolutional_out_height(layer)*
convolutional_out_width(layer);
for(i = 0; i < layer.batch; ++i){
cl_mem a = layer.filters_cl;
cl_mem b = cl_sub_array(layer.delta_cl, i*k*n, k*n);
cl_mem c = cl_sub_array(layer.col_image_cl, i*m*n, m*n);
gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
clReleaseMemObject(b);
clReleaseMemObject(c);
}
col2im_ongpu(layer.col_image_cl, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta_cl);
}
}
#endif

View File

@@ -0,0 +1,25 @@
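// Broadcast each filter's bias across its whole output map, one output
// element per work-item; global dimension 0 walks the batch.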
__kernel void bias(int n, int size, __global float *biases, __global float *output)
{
int batch = get_global_id(0);
int id = get_global_id(1);
int filter = id/size;
int position = id%size;
output[batch*n*size + id] = biases[filter];
}
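// Bias gradient: one work-item per filter sums delta over the batch and all
// spatial positions, i.e. db[f] = sum over b,i of delta[b][f][i].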
__kernel void learn_bias(int batch, int n, int size, __global float *delta, __global float *bias_updates)
{
int i,b;
int filter = get_global_id(0);
float sum = 0;
for(b = 0; b < batch; ++b){
for(i = 0; i < size; ++i){
int index = i + size*(filter + n*b);
sum += delta[index];
}
}
bias_updates[filter] += sum;
}

View File

@@ -50,6 +50,7 @@ typedef struct {
#ifdef GPU
void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in);
void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl);
#endif
convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, float learning_rate, float momentum, float decay);

View File

@@ -148,6 +148,16 @@ data load_cifar10_data(char *filename)
return d;
}
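// Sample a minibatch of n rows, with replacement, into the caller's X and y buffers.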
void get_batch(data d, int n, float *X, float *y)
{
int j;
for(j = 0; j < n; ++j){
int index = rand()%d.X.rows;
memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float));
memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
}
}
data load_all_cifar10()
{
data d;
@@ -158,7 +168,7 @@ data load_all_cifar10()
d.X = X;
d.y = y;
for(b = 0; b < 5; ++b){
char buff[256];
sprintf(buff, "data/cifar10/data_batch_%d.bin", b+1);
@@ -176,8 +186,8 @@ data load_all_cifar10()
fclose(fp);
}
//normalize_data_rows(d);
translate_data_rows(d, -144);
scale_data_rows(d, 1./128);
translate_data_rows(d, -144);
scale_data_rows(d, 1./128);
return d;
}

View File

@@ -20,6 +20,7 @@ data load_data_image_pathfile_random(char *filename, int n, char **labels,
data load_cifar10_data(char *filename);
data load_all_cifar10();
list *get_paths(char *filename);
void get_batch(data d, int n, float *X, float *y);
data load_categorical_data_csv(char *filename, int target, int k);
void normalize_data_rows(data d);
void scale_data_rows(data d, float s);

View File

@@ -6,11 +6,7 @@ void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
float BETA,
float *C, int ldc)
{
#ifdef GPU
gemm_gpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
#else
gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
#endif
}
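/* gemm() now always runs on the CPU; GPU callers go through gemm_ongpu() on
 * persistent cl_mem buffers instead, presumably so they control exactly when
 * host<->device copies happen. */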
void gemm_nn(int M, int N, int K, float ALPHA,
@@ -83,6 +79,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
float BETA,
float *C, int ldc)
{
//printf("cpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc);
int i, j;
for(i = 0; i < M; ++i){
for(j = 0; j < N; ++j){
@@ -107,7 +104,11 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
#define STR_HELPER(x) #x
#define STR(x) STR_HELPER(x)
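/* Tile width for the blocked GEMM kernel; BLOCK 1 on OS X, presumably to fit
 * that platform's smaller maximum work-group size. */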
#ifdef __APPLE__
#define BLOCK 1
#else
#define BLOCK 8
#endif
cl_kernel get_gemm_kernel()
{
@@ -126,6 +127,7 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
float BETA,
cl_mem C_gpu, int ldc)
{
//printf("gpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc);
cl_setup();
cl_kernel gemm_kernel = get_gemm_kernel();
cl_command_queue queue = cl.queue;
@@ -256,6 +258,8 @@ void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
void test_gpu_blas()
{
test_gpu_accuracy(0,0,10,576,75);
test_gpu_accuracy(0,0,17,10,10);
test_gpu_accuracy(1,0,17,10,10);
test_gpu_accuracy(0,1,17,10,10);
@@ -266,6 +270,7 @@ void test_gpu_blas()
test_gpu_accuracy(0,1,1000,10,100);
test_gpu_accuracy(1,1,1000,10,100);
/*
time_gpu_random_matrix(0,0,1000,1000,100);
time_random_matrix(0,0,1000,1000,100);
@@ -277,6 +282,7 @@ void test_gpu_blas()
time_gpu_random_matrix(1,1,1000,1000,100);
time_random_matrix(1,1,1000,1000,100);
*/
}
#endif

View File

@@ -1,22 +1,21 @@
#include "mini_blas.h"
#include <stdio.h>
inline float im2col_get_pixel(float *im, int height, int width, int channels,
int row, int col, int channel, int pad)
int b, int row, int col, int channel, int pad)
{
row -= pad;
col -= pad;
if (row < 0 || col < 0 ||
row >= height || col >= width) return 0;
return im[col + width*(row + channel*height)];
return im[col + width*(row + height*(channel+b*channels))];
}
//From Berkeley Vision's Caffe!
//https://github.com/BVLC/caffe/blob/master/LICENSE
void im2col_cpu_batch(float* data_im,
const int batch, const int channels, const int height, const int width,
const int ksize, const int stride, int pad, float* data_col)
void im2col_cpu(float* data_im, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col)
{
int c,h,w,b;
int height_col = (height - ksize) / stride + 1;
@@ -27,44 +26,6 @@ void im2col_cpu_batch(float* data_im,
pad = ksize/2;
}
int channels_col = channels * ksize * ksize;
int im_size = height*width*channels;
//int col_size = height_col*width_col*channels_col;
for (b = 0; b < batch; ++b) {
for (c = 0; c < channels_col; ++c) {
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
for (h = 0; h < height_col; ++h) {
for (w = 0; w < width_col; ++w) {
int im_row = h_offset + h * stride;
int im_col = w_offset + w * stride;
int col_index = (c * height_col + h) * width_col + w + (batch-1) * c * height_col*width_col;
data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
im_row, im_col, c_im, pad);
}
}
}
data_im += im_size;
data_col+= channels_col;
}
}
//From Berkeley Vision's Caffe!
//https://github.com/BVLC/caffe/blob/master/LICENSE
void im2col_cpu(float* data_im, const int batch,
const int channels, const int height, const int width,
const int ksize, const int stride, int pad, float* data_col)
{
int c,h,w,b;
int height_col = (height - ksize) / stride + 1;
int width_col = (width - ksize) / stride + 1;
if (pad){
height_col = 1 + (height-1) / stride;
width_col = 1 + (width-1) / stride;
pad = ksize/2;
}
int channels_col = channels * ksize * ksize;
int im_size = height*width*channels;
int col_size = height_col*width_col*channels_col;
for (b = 0; b < batch; ++b) {
for (c = 0; c < channels_col; ++c) {
@@ -75,14 +36,12 @@ void im2col_cpu(float* data_im, const int batch,
for (w = 0; w < width_col; ++w) {
int im_row = h_offset + h * stride;
int im_col = w_offset + w * stride;
int col_index = (c * height_col + h) * width_col + w;
int col_index = (c * height_col + h) * width_col + w + b*col_size;
data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
im_row, im_col, c_im, pad);
b, im_row, im_col, c_im, pad);
}
}
}
data_im += im_size;
data_col += col_size;
}
}
@@ -104,9 +63,9 @@ cl_kernel get_im2col_kernel()
}
void im2col_ongpu(cl_mem data_im, const int batch,
const int channels, const int height, const int width,
const int ksize, const int stride, cl_mem data_col)
void im2col_ongpu(cl_mem data_im, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, cl_mem data_col)
{
cl_setup();
cl_kernel im2col_kernel = get_im2col_kernel();
@@ -120,29 +79,30 @@ void im2col_ongpu(cl_mem data_im, const int batch,
cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(width), (void*) &width);
cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(ksize), (void*) &ksize);
cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(stride), (void*) &stride);
cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(pad), (void*) &pad);
cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(data_col), (void*) &data_col);
check_error(cl);
int height_col = (height - ksize) / stride + 1;
int width_col = (width - ksize) / stride + 1;
int channels_col = channels * ksize * ksize;
if (pad){
height_col = 1 + (height-1) / stride;
width_col = 1 + (width-1) / stride;
}
size_t global_size[2];
size_t local_size[2];
global_size[0] = batch;
global_size[1] = channels_col;
local_size[0] = height_col;
local_size[1] = width_col;
global_size[0] = batch*channels_col;
global_size[1] = height_col*width_col;
clEnqueueNDRangeKernel(queue, im2col_kernel, 2, 0,
global_size, local_size, 0, 0, 0);
global_size, 0, 0, 0, 0);
check_error(cl);
}
void im2col_gpu(float *data_im,
const int batch, const int channels, const int height, const int width,
const int ksize, const int stride,
float *data_col)
void im2col_gpu(float *data_im, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, float *data_col)
{
cl_setup();
cl_context context = cl.context;
@@ -165,7 +125,7 @@ void im2col_gpu(float *data_im,
check_error(cl);
im2col_ongpu(im_gpu, batch, channels, height, width,
ksize, stride, col_gpu);
ksize, stride, pad, col_gpu);
clEnqueueReadBuffer(queue, col_gpu, CL_TRUE, 0, size, data_col, 0, 0, 0);
check_error(cl);

View File

@@ -1,26 +1,43 @@
__kernel void im2col(__global float *data_im, const int im_offset,
const int channels, const int height, const int width,
const int ksize, const int stride, __global float *data_col, const int col_offset)
float im2col_get_pixel(__global float *im, int height, int width, int channels,
int batch, int row, int col, int channel, int pad)
{
int b = get_global_id(0);
int c = get_global_id(1);
row -= pad;
col -= pad;
int h = get_local_id(0);
int w = get_local_id(1);
if (row < 0 || col < 0 || row >= height || col >= width) return 0;
int index = col + width*(row + height*(channel+batch*channels));
return im[index];
}
__kernel void im2col(__global float *data_im, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, __global float *data_col)
{
int c,h,w,b;
int height_col = (height - ksize) / stride + 1;
int width_col = (width - ksize) / stride + 1;
if (pad){
height_col = 1 + (height-1) / stride;
width_col = 1 + (width-1) / stride;
pad = ksize/2;
}
int gid1 = get_global_id(0);
b = gid1%batch;
c = gid1/batch;
int gid2 = get_global_id(1);
h = gid2%height_col;
w = gid2/height_col;
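// Global dimension 0 enumerates (batch, column-channel) pairs; dimension 1
// enumerates the (row, col) positions of the output map.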
int channels_col = channels * ksize * ksize;
int im_offset = height*width*channels*b;
int col_offset = height_col*width_col*channels_col*b;
int col_size = height_col*width_col*channels_col;
int w_offset = c % ksize;
int h_offset = (c / ksize) % ksize;
int c_im = c / ksize / ksize;
data_col[(c * height_col + h) * width_col + w + col_offset] =
data_im[(c_im * height + h * stride + h_offset) * width
+ w * stride + w_offset + im_offset];
int im_row = h_offset + h * stride;
int im_col = w_offset + w * stride;
int col_index = (c * height_col + h) * width_col + w + b*col_size;
data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, b, im_row, im_col, c_im, pad);
}

View File

@@ -10,13 +10,17 @@ float *random_matrix(int rows, int cols);
void time_random_matrix(int TA, int TB, int m, int k, int n);
#ifdef GPU
void im2col_ongpu(cl_mem data_im, const int batch,
const int channels, const int height, const int width,
const int ksize, const int stride, cl_mem data_col);
void im2col_ongpu(cl_mem data_im, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, cl_mem data_col);
void im2col_gpu(float *data_im,
const int batch, const int channels, const int height, const int width,
const int ksize, const int stride, float *data_col);
void col2im_ongpu(cl_mem data_col, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, cl_mem data_im);
void im2col_gpu(float *data_im, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, float *data_col);
void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
cl_mem A_gpu, int lda,
@@ -25,13 +29,14 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
cl_mem C_gpu, int ldc);
#endif
void im2col_cpu(float* data_im, const int batch,
const int channels, const int height, const int width,
const int ksize, const int stride, int pad, float* data_col);
void im2col_cpu(float* data_im, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_col);
void col2im_cpu(float* data_col, int batch,
int channels, int height, int width,
int ksize, int stride, int pad, float* data_im);
void col2im_cpu(float* data_col,
const int channels, const int height, const int width,
const int ksize, const int stride, int pad, float* data_im);
void test_blas();
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,

View File

@@ -28,25 +28,16 @@ network make_network(int n, int batch)
}
#ifdef GPU
void forward_network(network net, float *input, int train)
void forward_network_gpu(network net, cl_mem input_cl, int train)
{
cl_setup();
size_t size = get_network_input_size(net);
if(!net.input_cl){
net.input_cl = clCreateBuffer(cl.context,
CL_MEM_READ_WRITE, size*sizeof(float), 0, &cl.error);
check_error(cl);
}
cl_write_array(net.input_cl, input, size);
cl_mem input_cl = net.input_cl;
int i;
for(i = 0; i < net.n; ++i){
if(net.types[i] == CONVOLUTIONAL){
convolutional_layer layer = *(convolutional_layer *)net.layers[i];
forward_convolutional_layer_gpu(layer, input_cl);
input_cl = layer.output_cl;
input = layer.output;
}
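// The layer's device-side output buffer is handed to the next layer as its
// input, so activations are not re-uploaded between layers.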
/*
else if(net.types[i] == CONNECTED){
connected_layer layer = *(connected_layer *)net.layers[i];
forward_connected_layer(layer, input, train);
@@ -72,10 +63,11 @@ void forward_network(network net, float *input, int train)
forward_normalization_layer(layer, input);
input = layer.output;
}
*/
}
}
#else
#endif
void forward_network(network net, float *input, int train)
{
@@ -118,7 +110,6 @@ void forward_network(network net, float *input, int train)
}
}
}
#endif
void update_network(network net)
{
@@ -275,45 +266,13 @@ float train_network_sgd(network net, data d, int n)
float *X = calloc(batch*d.X.cols, sizeof(float));
float *y = calloc(batch*d.y.cols, sizeof(float));
int i,j;
int i;
float sum = 0;
int index = 0;
for(i = 0; i < n; ++i){
for(j = 0; j < batch; ++j){
index = rand()%d.X.rows;
memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float));
memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
}
get_batch(d, batch, X, y);
float err = train_network_datum(net, X, y);
sum += err;
//train_network_datum(net, X, y);
/*
float *y = d.y.vals[index];
int class = get_predicted_class_network(net);
correct += (y[class]?1:0);
*/
/*
for(j = 0; j < d.y.cols*batch; ++j){
printf("%6.3f ", y[j]);
}
printf("\n");
for(j = 0; j < d.y.cols*batch; ++j){
printf("%6.3f ", get_network_output(net)[j]);
}
printf("\n");
printf("\n");
*/
//printf("%d %f %f\n", i,net.output[0], d.y.vals[index][0]);
//if((i+1)%10 == 0){
// printf("%d: %f\n", (i+1), (float)correct/(i+1));
//}
}
//printf("Accuracy: %f\n",(float) correct/n);
//show_image(float_to_image(32,32,3,X), "Orig");
free(X);
free(y);
return (float)sum/(n*batch);

View File

@@ -33,6 +33,10 @@ typedef struct {
#endif
} network;
#ifdef GPU
void forward_network_gpu(network net, cl_mem input, int train);
#endif
network make_network(int n, int batch);
void forward_network(network net, float *input, int train);
float backward_network(network net, float *input, float *truth);

View File

@@ -11,6 +11,7 @@ cl_info cl = {0};
void check_error(cl_info info)
{
clFinish(cl.queue);
if (info.error != CL_SUCCESS) {
printf("\n Error number %d", info.error);
exit(1);
@@ -27,13 +28,60 @@ cl_info cl_init()
// Fetch the Platform and Device IDs; we only want one.
cl_device_id devices[MAX_DEVICES];
info.error=clGetPlatformIDs(1, &info.platform, &num_platforms);
printf("=== %d OpenCL platform(s) found: ===\n", num_platforms);
char buffer[10240];
clGetPlatformInfo(info.platform, CL_PLATFORM_PROFILE, 10240, buffer, NULL);
printf(" PROFILE = %s\n", buffer);
clGetPlatformInfo(info.platform, CL_PLATFORM_VERSION, 10240, buffer, NULL);
printf(" VERSION = %s\n", buffer);
clGetPlatformInfo(info.platform, CL_PLATFORM_NAME, 10240, buffer, NULL);
printf(" NAME = %s\n", buffer);
clGetPlatformInfo(info.platform, CL_PLATFORM_VENDOR, 10240, buffer, NULL);
printf(" VENDOR = %s\n", buffer);
clGetPlatformInfo(info.platform, CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
printf(" EXTENSIONS = %s\n", buffer);
check_error(info);
info.error=clGetDeviceIDs(info.platform, CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &num_devices);
if(num_devices > MAX_DEVICES) num_devices = MAX_DEVICES;
printf("=== %d OpenCL device(s) found on platform:\n", num_devices);
int i;
for (i=0; i<num_devices; i++)
{
char buffer[10240];
cl_uint buf_uint;
cl_ulong buf_ulong;
printf(" -- %d --\n", i);
clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL);
printf(" DEVICE_NAME = %s\n", buffer);
clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL);
printf(" DEVICE_VENDOR = %s\n", buffer);
clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL);
printf(" DEVICE_VERSION = %s\n", buffer);
clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL);
printf(" DRIVER_VERSION = %s\n", buffer);
clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL);
printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL);
printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL);
printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(buf_ulong), &buf_ulong, NULL);
printf(" DEVICE_MAX_WORK_GROUP_SIZE = %llu\n", (unsigned long long)buf_ulong);
cl_uint items;
clGetDeviceInfo( devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint),
&items, NULL);
printf(" DEVICE_MAX_WORK_ITEM_DIMENSIONS = %u\n", (unsigned int)items);
size_t workitem_size[10];
clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), workitem_size, NULL);
printf(" DEVICE_MAX_WORK_ITEM_SIZES = %u / %u / %u \n", (unsigned int)workitem_size[0], (unsigned int)workitem_size[1], (unsigned int)workitem_size[2]);
}
int index = getpid()%num_devices;
printf("%d rand, %d devices, %d index\n", getpid(), num_devices, index);
//info.device = devices[index];
info.device = devices[1];
info.device = devices[0];
fprintf(stderr, "Found %d device(s)\n", num_devices);
check_error(info);
@@ -52,8 +100,8 @@ cl_info cl_init()
cl_program cl_fprog(char *filename, char *options, cl_info info)
{
size_t srcsize;
char src[8192];
memset(src, 0, 8192);
char src[64*1024];
memset(src, 0, 64*1024);
FILE *fil=fopen(filename,"r");
srcsize=fread(src, 1, sizeof src, fil); // element size 1 so fread returns the byte count
fclose(fil);
@@ -61,12 +109,12 @@ cl_program cl_fprog(char *filename, char *options, cl_info info)
// Submit the source code of the example kernel to OpenCL
cl_program prog=clCreateProgramWithSource(info.context,1, srcptr, &srcsize, &info.error);
check_error(info);
char build_c[4096];
char build_c[1024*64];
// and compile it (after this we could extract the compiled version)
info.error=clBuildProgram(prog, 0, 0, options, 0, 0);
if ( info.error != CL_SUCCESS ) {
fprintf(stderr, "Error Building Program: %d\n", info.error);
clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 4096, build_c, 0);
clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 1024*64, build_c, 0);
fprintf(stderr, "Build Log for %s program:\n%s\n", filename, build_c);
}
check_error(info);
@@ -115,7 +163,8 @@ cl_mem cl_sub_array(cl_mem src, int offset, int size)
cl_buffer_region r;
r.origin = offset*sizeof(float);
r.size = size*sizeof(float);
cl_mem sub = clCreateSubBuffer(src, CL_MEM_USE_HOST_PTR, CL_BUFFER_CREATE_TYPE_REGION, &r, 0);
cl_mem sub = clCreateSubBuffer(src, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &r, &cl.error);
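// CL_MEM_USE_HOST_PTR is not a legal flag for clCreateSubBuffer; a sub-buffer
// shares the parent's storage, so CL_MEM_READ_WRITE suffices.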
check_error(cl);
return sub;
}