Trying some stuff w/ dropout

Joseph Redmon 2014-08-27 19:11:46 -07:00
parent 176d65b765
commit 76ee68f96d
18 changed files with 550 additions and 200 deletions

View File

@@ -41,6 +41,12 @@ float relu_activate(float x){return x*(x>0);}
 float ramp_activate(float x){return x*(x>0)+.1*x;}
 float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);}
 
+float linear_gradient(float x){return 1;}
+float sigmoid_gradient(float x){return (1-x)*x;}
+float relu_gradient(float x){return (x>0);}
+float ramp_gradient(float x){return (x>0)+.1;}
+float tanh_gradient(float x){return 1-x*x;}
+
 float activate(float x, ACTIVATION a)
 {
     switch(a){
@@ -66,19 +72,19 @@ void activate_array(float *x, const int n, const ACTIVATION a)
     }
 }
 
-float gradient(float x, ACTIVATION a){
+float gradient(float x, ACTIVATION a)
+{
     switch(a){
         case LINEAR:
-            return 1;
+            return linear_gradient(x);
         case SIGMOID:
-            return (1.-x)*x;
+            return sigmoid_gradient(x);
         case RELU:
-            return (x>0);
+            return relu_gradient(x);
         case RAMP:
-            return (x>0) + .1;
+            return ramp_gradient(x);
         case TANH:
-            return 1-x*x;
+            return tanh_gradient(x);
     }
     return 0;
 }
@@ -107,7 +113,6 @@ cl_kernel get_activation_kernel()
     return kernel;
 }
-
 void activate_array_ongpu(cl_mem x, int n, ACTIVATION a)
 {
     cl_setup();
@@ -125,4 +130,34 @@ void activate_array_ongpu(cl_mem x, int n, ACTIVATION a)
     clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0);
     check_error(cl);
 }
+
+cl_kernel get_gradient_kernel()
+{
+    static int init = 0;
+    static cl_kernel kernel;
+    if(!init){
+        kernel = get_kernel("src/activations.cl", "gradient_array", 0);
+        init = 1;
+    }
+    return kernel;
+}
+
+void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta)
+{
+    cl_setup();
+    cl_kernel kernel = get_gradient_kernel();
+    cl_command_queue queue = cl.queue;
+
+    cl_uint i = 0;
+    cl.error = clSetKernelArg(kernel, i++, sizeof(x), (void*) &x);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(n), (void*) &n);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(a), (void*) &a);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(delta), (void*) &delta);
+    check_error(cl);
+
+    size_t gsize = n;
+    clEnqueueNDRangeKernel(queue, kernel, 1, 0, &gsize, 0, 0, 0, 0);
+    check_error(cl);
+}
 #endif
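
Note on the hunk above: the new *_gradient helpers are written in terms of the activation's output y rather than its raw input, e.g. sigmoid'(z) = y*(1-y) and tanh'(z) = 1-y*y, which is why the backward pass can feed gradient() the stored layer output directly. A minimal standalone check (illustrative, not part of the commit):

    #include <math.h>
    #include <stdio.h>

    int main()
    {
        float z = 0.3f;
        float y = 1.f/(1.f + expf(-z));    /* sigmoid output */
        float analytic = (1 - y)*y;        /* same formula as sigmoid_gradient(y) */
        float eps = 1e-3f;
        float numeric = (1.f/(1.f + expf(-(z + eps)))
                       - 1.f/(1.f + expf(-(z - eps))))/(2*eps);
        printf("analytic %f, numeric %f\n", analytic, numeric);
        return 0;
    }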

View File

@@ -8,6 +8,12 @@ float relu_activate(float x){return x*(x>0);}
 float ramp_activate(float x){return x*(x>0)+.1*x;}
 float tanh_activate(float x){return (exp(2*x)-1)/(exp(2*x)+1);}
 
+float linear_gradient(float x){return 1;}
+float sigmoid_gradient(float x){return (1-x)*x;}
+float relu_gradient(float x){return (x>0);}
+float ramp_gradient(float x){return (x>0)+.1;}
+float tanh_gradient(float x){return 1-x*x;}
+
 float activate(float x, ACTIVATION a)
 {
     switch(a){
@@ -25,9 +31,32 @@ float activate(float x, ACTIVATION a)
     return 0;
 }
 
-__kernel void activate_array(__global float *x,
-                        const int n, const ACTIVATION a)
+float gradient(float x, ACTIVATION a)
+{
+    switch(a){
+        case LINEAR:
+            return linear_gradient(x);
+        case SIGMOID:
+            return sigmoid_gradient(x);
+        case RELU:
+            return relu_gradient(x);
+        case RAMP:
+            return ramp_gradient(x);
+        case TANH:
+            return tanh_gradient(x);
+    }
+    return 0;
+}
+
+__kernel void activate_array(__global float *x, int n, ACTIVATION a)
 {
     int i = get_global_id(0);
     x[i] = activate(x[i], a);
 }
+
+__kernel void gradient_array(__global float *x, int n, ACTIVATION a, __global float *delta)
+{
+    int i = get_global_id(0);
+    delta[i] *= gradient(x[i], a);
+}

View File

@@ -14,7 +14,9 @@ float gradient(float x, ACTIVATION a);
 void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta);
 void activate_array(float *x, const int n, const ACTIVATION a);
 #ifdef GPU
+cl_kernel get_activation_kernel();
 void activate_array_ongpu(cl_mem x, int n, ACTIVATION a);
+void gradient_array_ongpu(cl_mem x, int n, ACTIVATION a, cl_mem delta);
 #endif
 #endif

View File

@@ -32,6 +32,51 @@ void test_convolve()
     show_image_layers(edge, "Test Convolve");
 }
 
+#ifdef GPU
+void test_convolutional_layer()
+{
+    int i;
+    image dog = load_image("data/dog.jpg",256,256);
+    network net = parse_network_cfg("cfg/convolutional.cfg");
+    //data test = load_cifar10_data("data/cifar10/test_batch.bin");
+    //float *X = calloc(net.batch*test.X.cols, sizeof(float));
+    //float *y = calloc(net.batch*test.y.cols, sizeof(float));
+    int in_size = get_network_input_size(net)*net.batch;
+    int size = get_network_output_size(net)*net.batch;
+    float *X = calloc(in_size, sizeof(float));
+    for(i = 0; i < in_size; ++i){
+        X[i] = dog.data[i%get_network_input_size(net)];
+    }
+    //get_batch(test, net.batch, X, y);
+    clock_t start, end;
+    cl_mem input_cl = cl_make_array(X, in_size);
+
+    forward_network_gpu(net, input_cl, 1);
+    start = clock();
+    forward_network_gpu(net, input_cl, 1);
+    end = clock();
+    float gpu_sec = (float)(end-start)/CLOCKS_PER_SEC;
+    float *gpu_out = calloc(size, sizeof(float));
+    memcpy(gpu_out, get_network_output(net), size*sizeof(float));
+
+    start = clock();
+    forward_network(net, X, 1);
+    end = clock();
+    float cpu_sec = (float)(end-start)/CLOCKS_PER_SEC;
+    float *cpu_out = calloc(size, sizeof(float));
+    memcpy(cpu_out, get_network_output(net), size*sizeof(float));
+
+    float sum = 0;
+    for(i = 0; i < size; ++i) {
+        //printf("%f, %f\n", gpu_out[i], cpu_out[i]);
+        sum += pow(gpu_out[i] - cpu_out[i], 2);
+    }
+    printf("gpu: %f sec, cpu: %f sec, diff: %f, size: %d\n", gpu_sec, cpu_sec, sum, size);
+}
+#endif
+
 void test_convolve_matrix()
 {
     image dog = load_image("dog.jpg",300,400);
@@ -325,7 +370,7 @@ void test_nist()
 void train_nist()
 {
     srand(222222);
-    network net = parse_network_cfg("cfg/nist_final.cfg");
+    network net = parse_network_cfg("cfg/nist.cfg");
     data train = load_categorical_data_csv("data/mnist/mnist_train.csv", 0, 10);
     data test = load_categorical_data_csv("data/mnist/mnist_test.csv",0,10);
     translate_data_rows(train, -144);
@@ -349,7 +394,7 @@ void train_nist()
             mean_array(get_network_output_layer(net,3), 100),
             mean_array(get_network_output_layer(net,4), 100));
     */
-    save_network(net, "cfg/nist_final2.cfg");
+    //save_network(net, "cfg/nist_final2.cfg");
     //printf("%5d Training Loss: %lf, Params: %f %f %f, ",count*1000, loss, lr, momentum, decay);
     //end = clock();
@@ -798,7 +843,7 @@ int main(int argc, char *argv[])
 {
     //train_full();
     //test_distribution();
-    feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
+    //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
 
     //test_blas();
     //test_visualize();
@@ -809,7 +854,9 @@
     //test_split();
     //test_ensemble();
     //test_nist_single();
-    test_nist();
+    //test_nist();
+    train_nist();
+    //test_convolutional_layer();
     //test_cifar10();
     //train_cifar10();
     //test_vince();
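
Side note on test_convolutional_layer above: the "diff" it prints is a summed squared error, so it grows with output size. A scale-free variant would divide by the element count first (illustrative helper, not in the commit):

    #include <math.h>

    float rmse(float *a, float *b, int n)
    {
        float sum = 0;
        int i;
        for(i = 0; i < n; ++i) sum += (a[i] - b[i])*(a[i] - b[i]);
        return sqrt(sum/n);
    }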

View File

@@ -1,21 +1,21 @@
 #include <stdio.h>
 #include <math.h>
 
 inline void col2im_add_pixel(float *im, int height, int width, int channels,
-                        int row, int col, int channel, int pad, float val)
+                        int b, int row, int col, int channel, int pad, float val)
 {
     row -= pad;
     col -= pad;
 
     if (row < 0 || col < 0 ||
         row >= height || col >= width) return;
-    im[col + width*(row + channel*height)] += val;
+    im[col + width*(row + height*(channel+b*channels))] += val;
 }
 //This one might be too, can't remember.
-void col2im_cpu(float* data_col,
-        const int channels, const int height, const int width,
-        const int ksize, const int stride, int pad, float* data_im)
+void col2im_cpu(float* data_col, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, float* data_im)
 {
-    int c,h,w;
+    int b,c,h,w;
     int height_col = (height - ksize) / stride + 1;
     int width_col = (width - ksize) / stride + 1;
     if (pad){
@@ -24,6 +24,8 @@ void col2im_cpu(float* data_col,
         pad = ksize/2;
     }
     int channels_col = channels * ksize * ksize;
+    int col_size = height_col*width_col*channels_col;
+    for(b = 0; b < batch; ++b){
     for (c = 0; c < channels_col; ++c) {
         int w_offset = c % ksize;
         int h_offset = (c / ksize) % ksize;
@@ -32,12 +34,57 @@ void col2im_cpu(float* data_col,
             for (w = 0; w < width_col; ++w) {
                 int im_row = h_offset + h * stride;
                 int im_col = w_offset + w * stride;
-                double val = data_col[(c * height_col + h) * width_col + w];
+                int col_index = (c * height_col + h) * width_col + w + b*col_size;
+                double val = data_col[col_index];
                 col2im_add_pixel(data_im, height, width, channels,
-                        im_row, im_col, c_im, pad, val);
+                        b, im_row, im_col, c_im, pad, val);
+            }
             }
         }
     }
 }
+
+#ifdef GPU
+
+#include "opencl.h"
+
+cl_kernel get_col2im_kernel()
+{
+    static int init = 0;
+    static cl_kernel im2col_kernel;
+    if(!init){
+        im2col_kernel = get_kernel("src/col2im.cl", "col2im", 0);
+        init = 1;
+    }
+    return im2col_kernel;
+}
+
+void col2im_ongpu(cl_mem data_col, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, cl_mem data_im)
+{
+    cl_setup();
+    cl_kernel kernel = get_col2im_kernel();
+    cl_command_queue queue = cl.queue;
+
+    cl_uint i = 0;
+    cl.error = clSetKernelArg(kernel, i++, sizeof(data_col), (void*) &data_col);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(batch), (void*) &batch);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(channels), (void*) &channels);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(height), (void*) &height);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(width), (void*) &width);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(ksize), (void*) &ksize);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(stride), (void*) &stride);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(pad), (void*) &pad);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(data_im), (void*) &data_im);
+    check_error(cl);
+
+    size_t global_size = channels*height*width*batch;
+
+    clEnqueueNDRangeKernel(queue, kernel, 1, 0, &global_size, 0, 0, 0, 0);
+    check_error(cl);
+}
+#endif
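
The batched col2im is the adjoint of the batched im2col: each image pixel accumulates one contribution per sliding window that covers it, and col2im_add_pixel uses +=, so the destination must be zeroed first. A round-trip sketch against the signatures in this commit (the driver function itself is illustrative):

    #include <stdlib.h>
    #include <string.h>
    #include "mini_blas.h"

    void col2im_roundtrip()
    {
        int batch = 2, c = 3, h = 8, w = 8, ksize = 3, stride = 1, pad = 0;
        int hc = (h - ksize)/stride + 1;
        int wc = (w - ksize)/stride + 1;
        int i, n = batch*c*h*w;
        float *im  = calloc(n, sizeof(float));
        float *col = calloc(batch*c*ksize*ksize*hc*wc, sizeof(float));
        for(i = 0; i < n; ++i) im[i] = 1;
        im2col_cpu(im, batch, c, h, w, ksize, stride, pad, col);
        memset(im, 0, n*sizeof(float));
        col2im_cpu(col, batch, c, h, w, ksize, stride, pad, im);
        /* im[i] now counts how many ksize x ksize windows cover pixel i */
        free(im);
        free(col);
    }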

View File

@@ -0,0 +1,41 @@
+__kernel void col2im(__global float *data_col, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, __global float *data_im)
+{
+    int id = get_global_id(0);
+    int w = id%width;
+    id /= width;
+    int h = id%height;
+    id /= height;
+    int c = id%channels;
+    id /= channels;
+    int b = id%batch;
+
+    int height_col = (height - ksize) / stride + 1;
+    int width_col = (width - ksize) / stride + 1;
+    if (pad){
+        height_col = 1 + (height-1) / stride;
+        width_col = 1 + (width-1) / stride;
+        pad = ksize/2;
+    }
+    int rows = channels * ksize * ksize;
+    int cols = height_col*width_col;
+    int batch_offset = b*cols*rows;
+
+    // accumulate every column entry that im2col copied from pixel (b,c,h,w)
+    float sum = 0;
+    int i,j;
+    for(i = 0; i < ksize; ++i){
+        for(j = 0; j < ksize; ++j){
+            int row = h + pad - i;
+            int col = w + pad - j;
+            if(row%stride != 0 || col%stride != 0) continue;
+            row /= stride;
+            col /= stride;
+            if(row < 0 || col < 0 || row >= height_col || col >= width_col) continue;
+            int c_col = (c*ksize + i)*ksize + j;
+            sum += data_col[(c_col*height_col + row)*width_col + col + batch_offset];
+        }
+    }
+    data_im[w + width*(h + height*(c + b*channels))] += sum;
+}

View File

@@ -147,15 +147,9 @@ void forward_convolutional_layer(const convolutional_layer layer, float *in)
     for(i = 0; i < layer.batch; ++i){
         gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
-        c += n*m;
-        in += layer.h*layer.w*layer.c;
         b += k*n;
+        c += n*m;
     }
-    /*
-    int i;
-    for(i = 0; i < m*n; ++i) printf("%f, ", layer.output[i]);
-    printf("\n");
-    */
     activate_array(layer.output, m*n*layer.batch, layer.activation);
 }
@@ -205,10 +199,10 @@ void backward_convolutional_layer(convolutional_layer layer, float *delta)
         for(i = 0; i < layer.batch; ++i){
             gemm(1,0,m,n,k,1,a,m,b,n,0,c,n);
-            col2im_cpu(c, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta);
-            c += k*n;
-            delta += layer.h*layer.w*layer.c;
+            b += k*n;
+            c += m*n;
         }
+        col2im_cpu(layer.col_image, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta);
     }
 }
@@ -278,22 +272,140 @@ image *visualize_convolutional_layer(convolutional_layer layer, char *window, im
 }
 
 #ifdef GPU
+
+cl_kernel get_convolutional_learn_bias_kernel()
+{
+    static int init = 0;
+    static cl_kernel kernel;
+    if(!init){
+        kernel = get_kernel("src/convolutional_layer.cl", "learn_bias", 0);
+        init = 1;
+    }
+    return kernel;
+}
+
+void learn_bias_convolutional_layer_ongpu(convolutional_layer layer)
+{
+    int size = convolutional_out_height(layer) * convolutional_out_width(layer);
+    cl_setup();
+    cl_kernel kernel = get_convolutional_learn_bias_kernel();
+    cl_command_queue queue = cl.queue;
+
+    cl_uint i = 0;
+    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.batch), (void*) &layer.batch);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.delta_cl), (void*) &layer.delta_cl);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.bias_updates_cl), (void*) &layer.bias_updates_cl);
+    check_error(cl);
+
+    const size_t global_size[] = {layer.n};
+
+    clEnqueueNDRangeKernel(queue, kernel, 1, 0, global_size, 0, 0, 0, 0);
+    check_error(cl);
+}
+
+cl_kernel get_convolutional_bias_kernel()
+{
+    static int init = 0;
+    static cl_kernel kernel;
+    if(!init){
+        kernel = get_kernel("src/convolutional_layer.cl", "bias", 0);
+        init = 1;
+    }
+    return kernel;
+}
+
+void bias_output_gpu(const convolutional_layer layer)
+{
+    int out_h = convolutional_out_height(layer);
+    int out_w = convolutional_out_width(layer);
+    int size = out_h*out_w;
+    cl_setup();
+    cl_kernel kernel = get_convolutional_bias_kernel();
+    cl_command_queue queue = cl.queue;
+
+    cl_uint i = 0;
+    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.n), (void*) &layer.n);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(size), (void*) &size);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.biases_cl), (void*) &layer.biases_cl);
+    cl.error = clSetKernelArg(kernel, i++, sizeof(layer.output_cl), (void*) &layer.output_cl);
+    check_error(cl);
+
+    const size_t global_size[] = {layer.batch, layer.n*size};
+
+    clEnqueueNDRangeKernel(queue, kernel, 2, 0, global_size, 0, 0, 0, 0);
+    check_error(cl);
+}
+
 void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in)
 {
+    int i;
     int m = layer.n;
     int k = layer.size*layer.size*layer.c;
     int n = convolutional_out_height(layer)*
-        convolutional_out_width(layer)*
-        layer.batch;
+        convolutional_out_width(layer);
 
-    cl_write_array(layer.filters_cl, layer.filters, m*k);
-    cl_mem a = layer.filters_cl;
-    cl_mem b = layer.col_image_cl;
-    cl_mem c = layer.output_cl;
-    im2col_ongpu(in, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, b);
-    gemm_ongpu(0,0,m,n,k,1,a,k,b,n,0,c,n);
-    activate_array_ongpu(layer.output_cl, m*n, layer.activation);
-    cl_read_array(layer.output_cl, layer.output, m*n);
+    //cl_write_array(layer.filters_cl, layer.filters, m*k);
+    //cl_write_array(layer.biases_cl, layer.biases, m);
+    bias_output_gpu(layer);
+    im2col_ongpu(in, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, layer.col_image_cl);
+    for(i = 0; i < layer.batch; ++i){
+        cl_mem a = layer.filters_cl;
+        cl_mem b = cl_sub_array(layer.col_image_cl, i*k*n, k*n);
+        cl_mem c = cl_sub_array(layer.output_cl, i*m*n, m*n);
+        gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c,n);
+        clReleaseMemObject(b);
+        clReleaseMemObject(c);
+    }
+    activate_array_ongpu(layer.output_cl, m*n*layer.batch, layer.activation);
+    cl_read_array(layer.output_cl, layer.output, m*n*layer.batch);
 }
+
+void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl)
+{
+    int i;
+    int m = layer.n;
+    int n = layer.size*layer.size*layer.c;
+    int k = convolutional_out_height(layer)*
+        convolutional_out_width(layer);
+
+    gradient_array_ongpu(layer.output_cl, m*k*layer.batch, layer.activation, layer.delta_cl);
+    learn_bias_convolutional_layer_ongpu(layer);
+
+    for(i = 0; i < layer.batch; ++i){
+        cl_mem a = cl_sub_array(layer.delta_cl, i*m*k, m*k);
+        cl_mem b = cl_sub_array(layer.col_image_cl, i*k*n, k*n);
+        cl_mem c = layer.filter_updates_cl;
+
+        gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
+        clReleaseMemObject(a);
+        clReleaseMemObject(b);
+    }
+    cl_read_array(layer.filter_updates_cl, layer.filter_updates, m*n);
+    cl_read_array(layer.bias_updates_cl, layer.bias_updates, m);
+
+    if(delta_cl){
+        m = layer.size*layer.size*layer.c;
+        k = layer.n;
+        n = convolutional_out_height(layer)*
+            convolutional_out_width(layer);
+
+        for(i = 0; i < layer.batch; ++i){
+            cl_mem a = layer.filters_cl;
+            cl_mem b = cl_sub_array(layer.delta_cl, i*k*n, k*n);
+            cl_mem c = cl_sub_array(layer.col_image_cl, i*m*n, m*n);
+
+            gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n);
+            clReleaseMemObject(b);
+            clReleaseMemObject(c);
+        }
+        col2im_ongpu(layer.col_image_cl, layer.batch, layer.c, layer.h, layer.w, layer.size, layer.stride, layer.pad, delta_cl);
+    }
+}
 #endif

View File

@@ -0,0 +1,25 @@
+__kernel void bias(int n, int size, __global float *biases, __global float *output)
+{
+    int batch = get_global_id(0);
+    int id = get_global_id(1);
+    int filter = id/size;
+    int position = id%size;
+
+    output[batch*n*size + id] = biases[filter];
+}
+
+__kernel void learn_bias(int batch, int n, int size, __global float *delta, __global float *bias_updates)
+{
+    int i,b;
+    int filter = get_global_id(0);
+    float sum = 0;
+    for(b = 0; b < batch; ++b){
+        for(i = 0; i < size; ++i){
+            int index = i + size*(filter + n*b);
+            sum += delta[index];
+        }
+    }
+    bias_updates[filter] += sum;
+}
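
For reference, the two kernels split the bias work per filter: bias broadcasts biases[filter] across batch and spatial positions, and learn_bias reduces delta the same way, one work-item per filter. An equivalent CPU version of the reduction (illustrative, not part of the commit):

    void learn_bias_cpu(int batch, int n, int size, float *delta, float *bias_updates)
    {
        int f, b, i;
        for(f = 0; f < n; ++f){
            float sum = 0;
            for(b = 0; b < batch; ++b){
                for(i = 0; i < size; ++i){
                    sum += delta[i + size*(f + n*b)];  /* same index as the kernel */
                }
            }
            bias_updates[f] += sum;
        }
    }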

View File

@@ -50,6 +50,7 @@ typedef struct {
 #ifdef GPU
 void forward_convolutional_layer_gpu(convolutional_layer layer, cl_mem in);
+void backward_convolutional_layer_gpu(convolutional_layer layer, cl_mem delta_cl);
 #endif
 
 convolutional_layer *make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, float learning_rate, float momentum, float decay);

View File

@@ -148,6 +148,16 @@ data load_cifar10_data(char *filename)
     return d;
 }
 
+void get_batch(data d, int n, float *X, float *y)
+{
+    int j;
+    for(j = 0; j < n; ++j){
+        int index = rand()%d.X.rows;
+        memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float));
+        memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
+    }
+}
+
 data load_all_cifar10()
 {
     data d;
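
Usage sketch for get_batch (the wrapper and the name B are illustrative): each call fills X and y with B rows sampled with replacement via rand(), so successive calls give independent minibatches.

    #include <stdlib.h>
    #include "data.h"

    void fill_minibatch(data d, int B)
    {
        float *X = calloc(B*d.X.cols, sizeof(float));
        float *y = calloc(B*d.y.cols, sizeof(float));
        get_batch(d, B, X, y);
        /* ... pass X, y to train_network_datum ... */
        free(X);
        free(y);
    }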

View File

@@ -20,6 +20,7 @@ data load_data_image_pathfile_random(char *filename, int n, char **labels,
 data load_cifar10_data(char *filename);
 data load_all_cifar10();
 list *get_paths(char *filename);
+void get_batch(data d, int n, float *X, float *y);
 data load_categorical_data_csv(char *filename, int target, int k);
 void normalize_data_rows(data d);
 void scale_data_rows(data d, float s);

View File

@@ -6,11 +6,7 @@ void gemm(int TA, int TB, int M, int N, int K, float ALPHA,
         float BETA,
         float *C, int ldc)
 {
-#ifdef GPU
-    gemm_gpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
-#else
     gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc);
-#endif
 }
 
 void gemm_nn(int M, int N, int K, float ALPHA,
@@ -83,6 +79,7 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA,
         float BETA,
         float *C, int ldc)
 {
+    //printf("cpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc);
     int i, j;
     for(i = 0; i < M; ++i){
         for(j = 0; j < N; ++j){
@@ -107,7 +104,11 @@
 #define STR_HELPER(x) #x
 #define STR(x) STR_HELPER(x)
 
+#ifdef __APPLE__
+#define BLOCK 1
+#else
 #define BLOCK 8
+#endif
 
 cl_kernel get_gemm_kernel()
 {
@@ -126,6 +127,7 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
         float BETA,
         cl_mem C_gpu, int ldc)
 {
+    //printf("gpu: %d %d %d %d %d %f %d %d %f %d\n",TA, TB, M, N, K, ALPHA, lda, ldb, BETA, ldc);
     cl_setup();
     cl_kernel gemm_kernel = get_gemm_kernel();
     cl_command_queue queue = cl.queue;
@@ -256,6 +258,8 @@ void test_gpu_accuracy(int TA, int TB, int m, int k, int n)
 void test_gpu_blas()
 {
+    test_gpu_accuracy(0,0,10,576,75);
+
     test_gpu_accuracy(0,0,17,10,10);
     test_gpu_accuracy(1,0,17,10,10);
     test_gpu_accuracy(0,1,17,10,10);
@@ -266,6 +270,7 @@ void test_gpu_blas()
     test_gpu_accuracy(0,1,1000,10,100);
     test_gpu_accuracy(1,1,1000,10,100);
 
+    /*
     time_gpu_random_matrix(0,0,1000,1000,100);
     time_random_matrix(0,0,1000,1000,100);
@@ -277,6 +282,7 @@
     time_gpu_random_matrix(1,1,1000,1000,100);
     time_random_matrix(1,1,1000,1000,100);
+    */
 }
 #endif
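
The __APPLE__ special case above is presumably working around Apple's CPU OpenCL device, where kernels (especially ones using barriers) often report a usable work-group size of 1, so an 8x8 tile cannot launch. A runtime alternative (a sketch, not what the commit does) is to ask the driver per kernel:

    #include <CL/cl.h>

    /* largest work-group this kernel can launch with on this device */
    size_t max_workgroup(cl_kernel kernel, cl_device_id device)
    {
        size_t wg = 1;
        clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                sizeof(wg), &wg, 0);
        return wg;
    }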

View File

@@ -1,22 +1,21 @@
 #include "mini_blas.h"
 #include <stdio.h>
 
 inline float im2col_get_pixel(float *im, int height, int width, int channels,
-                        int row, int col, int channel, int pad)
+                        int b, int row, int col, int channel, int pad)
 {
     row -= pad;
     col -= pad;
 
     if (row < 0 || col < 0 ||
         row >= height || col >= width) return 0;
-    return im[col + width*(row + channel*height)];
+    return im[col + width*(row + height*(channel+b*channels))];
 }
 
 //From Berkeley Vision's Caffe!
 //https://github.com/BVLC/caffe/blob/master/LICENSE
-void im2col_cpu_batch(float* data_im,
-        const int batch, const int channels, const int height, const int width,
-        const int ksize, const int stride, int pad, float* data_col)
+void im2col_cpu(float* data_im, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, float* data_col)
 {
     int c,h,w,b;
     int height_col = (height - ksize) / stride + 1;
@@ -27,44 +26,6 @@ void im2col_cpu_batch(float* data_im,
         pad = ksize/2;
     }
     int channels_col = channels * ksize * ksize;
-    int im_size = height*width*channels;
-    //int col_size = height_col*width_col*channels_col;
-    for (b = 0; b < batch; ++b) {
-        for (c = 0; c < channels_col; ++c) {
-            int w_offset = c % ksize;
-            int h_offset = (c / ksize) % ksize;
-            int c_im = c / ksize / ksize;
-            for (h = 0; h < height_col; ++h) {
-                for (w = 0; w < width_col; ++w) {
-                    int im_row = h_offset + h * stride;
-                    int im_col = w_offset + w * stride;
-                    int col_index = (c * height_col + h) * width_col + w + (batch-1) * c * height_col*width_col;
-                    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
-                            im_row, im_col, c_im, pad);
-                }
-            }
-        }
-        data_im += im_size;
-        data_col+= channels_col;
-    }
-}
-
-//From Berkeley Vision's Caffe!
-//https://github.com/BVLC/caffe/blob/master/LICENSE
-void im2col_cpu(float* data_im, const int batch,
-        const int channels, const int height, const int width,
-        const int ksize, const int stride, int pad, float* data_col)
-{
-    int c,h,w,b;
-    int height_col = (height - ksize) / stride + 1;
-    int width_col = (width - ksize) / stride + 1;
-    if (pad){
-        height_col = 1 + (height-1) / stride;
-        width_col = 1 + (width-1) / stride;
-        pad = ksize/2;
-    }
-    int channels_col = channels * ksize * ksize;
-    int im_size = height*width*channels;
     int col_size = height_col*width_col*channels_col;
     for (b = 0; b < batch; ++b) {
         for (c = 0; c < channels_col; ++c) {
@@ -75,14 +36,12 @@ void im2col_cpu(float* data_im, const int batch,
             for (w = 0; w < width_col; ++w) {
                 int im_row = h_offset + h * stride;
                 int im_col = w_offset + w * stride;
-                int col_index = (c * height_col + h) * width_col + w;
+                int col_index = (c * height_col + h) * width_col + w + b*col_size;
                 data_col[col_index] = im2col_get_pixel(data_im, height, width, channels,
-                        im_row, im_col, c_im, pad);
+                        b, im_row, im_col, c_im, pad);
             }
         }
     }
-        data_im += im_size;
-        data_col += col_size;
     }
 }
@@ -104,9 +63,9 @@ cl_kernel get_im2col_kernel()
 }
 
-void im2col_ongpu(cl_mem data_im, const int batch,
-        const int channels, const int height, const int width,
-        const int ksize, const int stride, cl_mem data_col)
+void im2col_ongpu(cl_mem data_im, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, cl_mem data_col)
 {
     cl_setup();
     cl_kernel im2col_kernel = get_im2col_kernel();
@@ -120,29 +79,30 @@ void im2col_ongpu(cl_mem data_im, const int batch,
     cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(width), (void*) &width);
     cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(ksize), (void*) &ksize);
     cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(stride), (void*) &stride);
+    cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(pad), (void*) &pad);
     cl.error = clSetKernelArg(im2col_kernel, i++, sizeof(data_col), (void*) &data_col);
     check_error(cl);
 
     int height_col = (height - ksize) / stride + 1;
     int width_col = (width - ksize) / stride + 1;
     int channels_col = channels * ksize * ksize;
+    if (pad){
+        height_col = 1 + (height-1) / stride;
+        width_col = 1 + (width-1) / stride;
+    }
 
     size_t global_size[2];
-    size_t local_size[2];
-    global_size[0] = batch;
-    global_size[1] = channels_col;
-    local_size[0] = height_col;
-    local_size[1] = width_col;
+    global_size[0] = batch*channels_col;
+    global_size[1] = height_col*width_col;
 
     clEnqueueNDRangeKernel(queue, im2col_kernel, 2, 0,
-            global_size, local_size, 0, 0, 0);
+            global_size, 0, 0, 0, 0);
     check_error(cl);
 }
 
-void im2col_gpu(float *data_im,
-        const int batch, const int channels, const int height, const int width,
-        const int ksize, const int stride,
-        float *data_col)
+void im2col_gpu(float *data_im, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, float *data_col)
 {
     cl_setup();
     cl_context context = cl.context;
@@ -165,7 +125,7 @@ void im2col_gpu(float *data_im,
     check_error(cl);
 
     im2col_ongpu(im_gpu, batch, channels, height, width,
-            ksize, stride, col_gpu);
+            ksize, stride, pad, col_gpu);
 
     clEnqueueReadBuffer(queue, col_gpu, CL_TRUE, 0, size, data_col, 0, 0, 0);
     check_error(cl);
View File

@@ -1,26 +1,43 @@
-__kernel void im2col(__global float *data_im, const int im_offset,
-        const int channels, const int height, const int width,
-        const int ksize, const int stride, __global float *data_col, const int col_offset)
+float im2col_get_pixel(__global float *im, int height, int width, int channels,
+        int batch, int row, int col, int channel, int pad)
 {
-    int b = get_global_id(0);
-    int c = get_global_id(1);
-
-    int h = get_local_id(0);
-    int w = get_local_id(1);
+    row -= pad;
+    col -= pad;
+    if (row < 0 || col < 0 || row >= height || col >= width) return 0;
+    int index = col + width*(row + height*(channel+batch*channels));
+    return im[index];
+}
 
+__kernel void im2col(__global float *data_im, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, __global float *data_col)
+{
+    int c,h,w,b;
     int height_col = (height - ksize) / stride + 1;
     int width_col = (width - ksize) / stride + 1;
+    if (pad){
+        height_col = 1 + (height-1) / stride;
+        width_col = 1 + (width-1) / stride;
+        pad = ksize/2;
+    }
+
+    int gid1 = get_global_id(0);
+    b = gid1%batch;
+    c = gid1/batch;
+
+    int gid2 = get_global_id(1);
+    h = gid2%height_col;
+    w = gid2/height_col;
 
     int channels_col = channels * ksize * ksize;
+    int col_size = height_col*width_col*channels_col;
+    int im_offset = height*width*channels*b;
+    int col_offset = height_col*width_col*channels_col*b;
 
     int w_offset = c % ksize;
     int h_offset = (c / ksize) % ksize;
     int c_im = c / ksize / ksize;
 
-    data_col[(c * height_col + h) * width_col + w + col_offset] =
-        data_im[(c_im * height + h * stride + h_offset) * width
-                + w * stride + w_offset + im_offset];
+    int im_row = h_offset + h * stride;
+    int im_col = w_offset + w * stride;
+    int col_index = (c * height_col + h) * width_col + w + b*col_size;
+    data_col[col_index] = im2col_get_pixel(data_im, height, width, channels, b, im_row, im_col, c_im, pad);
 }

View File

@@ -10,13 +10,17 @@ float *random_matrix(int rows, int cols);
 void time_random_matrix(int TA, int TB, int m, int k, int n);
 
 #ifdef GPU
-void im2col_ongpu(cl_mem data_im, const int batch,
-        const int channels, const int height, const int width,
-        const int ksize, const int stride, cl_mem data_col);
+void im2col_ongpu(cl_mem data_im, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, cl_mem data_col);
 
-void im2col_gpu(float *data_im,
-        const int batch, const int channels, const int height, const int width,
-        const int ksize, const int stride, float *data_col);
+void col2im_ongpu(cl_mem data_col, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, cl_mem data_im);
+
+void im2col_gpu(float *data_im, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, float *data_col);
 
 void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
         cl_mem A_gpu, int lda,
@@ -25,13 +29,14 @@ void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA,
         cl_mem C_gpu, int ldc);
 #endif
 
-void im2col_cpu(float* data_im, const int batch,
-        const int channels, const int height, const int width,
-        const int ksize, const int stride, int pad, float* data_col);
+void im2col_cpu(float* data_im, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, float* data_col);
 
-void col2im_cpu(float* data_col,
-        const int channels, const int height, const int width,
-        const int ksize, const int stride, int pad, float* data_im);
+void col2im_cpu(float* data_col, int batch,
+        int channels, int height, int width,
+        int ksize, int stride, int pad, float* data_im);
 
 void test_blas();
 
 void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,

View File

@@ -28,25 +28,16 @@ network make_network(int n, int batch)
 }
 
 #ifdef GPU
-void forward_network(network net, float *input, int train)
+void forward_network_gpu(network net, cl_mem input_cl, int train)
 {
-    cl_setup();
-    size_t size = get_network_input_size(net);
-    if(!net.input_cl){
-        net.input_cl = clCreateBuffer(cl.context,
-                CL_MEM_READ_WRITE, size*sizeof(float), 0, &cl.error);
-        check_error(cl);
-    }
-    cl_write_array(net.input_cl, input, size);
-    cl_mem input_cl = net.input_cl;
     int i;
     for(i = 0; i < net.n; ++i){
         if(net.types[i] == CONVOLUTIONAL){
             convolutional_layer layer = *(convolutional_layer *)net.layers[i];
             forward_convolutional_layer_gpu(layer, input_cl);
             input_cl = layer.output_cl;
-            input = layer.output;
         }
+        /*
         else if(net.types[i] == CONNECTED){
             connected_layer layer = *(connected_layer *)net.layers[i];
             forward_connected_layer(layer, input, train);
@@ -72,10 +63,11 @@ void forward_network(network net, float *input, int train)
             forward_normalization_layer(layer, input);
             input = layer.output;
         }
+        */
     }
 }
-#else
+#endif
 
 void forward_network(network net, float *input, int train)
 {
@@ -118,7 +110,6 @@ void forward_network(network net, float *input, int train)
         }
     }
 }
-#endif
 
 void update_network(network net)
 {
@@ -275,45 +266,13 @@ float train_network_sgd(network net, data d, int n)
     float *X = calloc(batch*d.X.cols, sizeof(float));
     float *y = calloc(batch*d.y.cols, sizeof(float));
 
-    int i,j;
+    int i;
     float sum = 0;
-    int index = 0;
     for(i = 0; i < n; ++i){
-        for(j = 0; j < batch; ++j){
-            index = rand()%d.X.rows;
-            memcpy(X+j*d.X.cols, d.X.vals[index], d.X.cols*sizeof(float));
-            memcpy(y+j*d.y.cols, d.y.vals[index], d.y.cols*sizeof(float));
-        }
+        get_batch(d, batch, X, y);
         float err = train_network_datum(net, X, y);
         sum += err;
-        //train_network_datum(net, X, y);
-        /*
-        float *y = d.y.vals[index];
-        int class = get_predicted_class_network(net);
-        correct += (y[class]?1:0);
-        */
-        /*
-        for(j = 0; j < d.y.cols*batch; ++j){
-            printf("%6.3f ", y[j]);
-        }
-        printf("\n");
-        for(j = 0; j < d.y.cols*batch; ++j){
-            printf("%6.3f ", get_network_output(net)[j]);
-        }
-        printf("\n");
-        printf("\n");
-        */
-        //printf("%d %f %f\n", i,net.output[0], d.y.vals[index][0]);
-        //if((i+1)%10 == 0){
-        //    printf("%d: %f\n", (i+1), (float)correct/(i+1));
-        //}
     }
-    //printf("Accuracy: %f\n",(float) correct/n);
-    //show_image(float_to_image(32,32,3,X), "Orig");
     free(X);
     free(y);
     return (float)sum/(n*batch);

View File

@@ -33,6 +33,10 @@ typedef struct {
     #endif
 } network;
 
+#ifdef GPU
+void forward_network_gpu(network net, cl_mem input, int train);
+#endif
+
 network make_network(int n, int batch);
 void forward_network(network net, float *input, int train);
 float backward_network(network net, float *input, float *truth);

View File

@@ -11,6 +11,7 @@ cl_info cl = {0};
 void check_error(cl_info info)
 {
+    clFinish(cl.queue);
     if (info.error != CL_SUCCESS) {
         printf("\n Error number %d", info.error);
         exit(1);
@@ -27,13 +28,60 @@ cl_info cl_init()
     // Fetch the Platform and Device IDs; we only want one.
     cl_device_id devices[MAX_DEVICES];
     info.error=clGetPlatformIDs(1, &info.platform, &num_platforms);
+    printf("=== %d OpenCL platform(s) found: ===\n", num_platforms);
+    char buffer[10240];
+    clGetPlatformInfo(info.platform, CL_PLATFORM_PROFILE, 10240, buffer, NULL);
+    printf("  PROFILE = %s\n", buffer);
+    clGetPlatformInfo(info.platform, CL_PLATFORM_VERSION, 10240, buffer, NULL);
+    printf("  VERSION = %s\n", buffer);
+    clGetPlatformInfo(info.platform, CL_PLATFORM_NAME, 10240, buffer, NULL);
+    printf("  NAME = %s\n", buffer);
+    clGetPlatformInfo(info.platform, CL_PLATFORM_VENDOR, 10240, buffer, NULL);
+    printf("  VENDOR = %s\n", buffer);
+    clGetPlatformInfo(info.platform, CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL);
+    printf("  EXTENSIONS = %s\n", buffer);
     check_error(info);
     info.error=clGetDeviceIDs(info.platform, CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &num_devices);
     if(num_devices > MAX_DEVICES) num_devices = MAX_DEVICES;
+    printf("=== %d OpenCL device(s) found on platform:\n", num_devices);
+    int i;
+    for (i=0; i<num_devices; i++)
+    {
+        char buffer[10240];
+        cl_uint buf_uint;
+        cl_ulong buf_ulong;
+        printf("  -- %d --\n", i);
+        clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL);
+        printf("  DEVICE_NAME = %s\n", buffer);
+        clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL);
+        printf("  DEVICE_VENDOR = %s\n", buffer);
+        clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL);
+        printf("  DEVICE_VERSION = %s\n", buffer);
+        clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL);
+        printf("  DRIVER_VERSION = %s\n", buffer);
+        clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL);
+        printf("  DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint);
+        clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL);
+        printf("  DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint);
+        clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL);
+        printf("  DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong);
+        clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(buf_ulong), &buf_ulong, NULL);
+        printf("  DEVICE_MAX_WORK_GROUP_SIZE = %llu\n", (unsigned long long)buf_ulong);
+        cl_uint items;
+        clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &items, NULL);
+        printf("  DEVICE_MAX_WORK_ITEM_DIMENSIONS = %u\n", (unsigned int)items);
+        size_t workitem_size[10];
+        clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), workitem_size, NULL);
+        printf("  DEVICE_MAX_WORK_ITEM_SIZES = %u / %u / %u \n", (unsigned int)workitem_size[0], (unsigned int)workitem_size[1], (unsigned int)workitem_size[2]);
+    }
     int index = getpid()%num_devices;
     printf("%d rand, %d devices, %d index\n", getpid(), num_devices, index);
     //info.device = devices[index];
-    info.device = devices[1];
+    info.device = devices[0];
     fprintf(stderr, "Found %d device(s)\n", num_devices);
     check_error(info);
@@ -52,8 +100,8 @@ cl_info cl_init()
 cl_program cl_fprog(char *filename, char *options, cl_info info)
 {
     size_t srcsize;
-    char src[8192];
-    memset(src, 0, 8192);
+    char src[64*1024];
+    memset(src, 0, 64*1024);
     FILE *fil=fopen(filename,"r");
     srcsize=fread(src, sizeof src, 1, fil);
     fclose(fil);
@@ -61,12 +109,12 @@ cl_program cl_fprog(char *filename, char *options, cl_info info)
     // Submit the source code of the example kernel to OpenCL
     cl_program prog=clCreateProgramWithSource(info.context,1, srcptr, &srcsize, &info.error);
     check_error(info);
-    char build_c[4096];
+    char build_c[1024*64];
     // and compile it (after this we could extract the compiled version)
     info.error=clBuildProgram(prog, 0, 0, options, 0, 0);
     if ( info.error != CL_SUCCESS ) {
         fprintf(stderr, "Error Building Program: %d\n", info.error);
-        clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 4096, build_c, 0);
+        clGetProgramBuildInfo( prog, info.device, CL_PROGRAM_BUILD_LOG, 1024*64, build_c, 0);
         fprintf(stderr, "Build Log for %s program:\n%s\n", filename, build_c);
     }
     check_error(info);
@@ -115,7 +163,8 @@ cl_mem cl_sub_array(cl_mem src, int offset, int size)
     cl_buffer_region r;
     r.origin = offset*sizeof(float);
     r.size = size*sizeof(float);
-    cl_mem sub = clCreateSubBuffer(src, CL_MEM_USE_HOST_PTR, CL_BUFFER_CREATE_TYPE_REGION, &r, 0);
+    cl_mem sub = clCreateSubBuffer(src, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &r, &cl.error);
+    check_error(cl);
     return sub;
 }
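
One caveat on the cl_sub_array change: clCreateSubBuffer requires the region origin to be aligned to the device's CL_DEVICE_MEM_BASE_ADDR_ALIGN (reported in bits), otherwise it fails with CL_MISALIGNED_SUB_BUFFER_OFFSET. The float offsets used by the convolutional layer are only safe when they land on that boundary; a sketch of the check (illustrative helper):

    #include <CL/cl.h>

    /* nonzero if a float-element offset is a legal sub-buffer origin */
    int sub_array_offset_ok(cl_device_id device, int offset)
    {
        cl_uint align_bits = 0;
        clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN,
                sizeof(align_bits), &align_bits, 0);
        return (offset*(int)sizeof(float)) % (int)(align_bits/8) == 0;
    }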