diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 6d2ce905..556efda1 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -141,7 +141,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) size_t t_intput_size = new_ldb * n; size_t t_bit_input_size = t_intput_size / 8;// +1; - + /* int i = 0; im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align); //cudaDeviceSynchronize(); @@ -160,23 +160,46 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu, new_ldb, l.output_gpu, n, l.mean_arr_gpu); //cudaDeviceSynchronize(); //check_error(status); - + */ { - //float_to_bit_gpu(state.input, (unsigned char *)l.align_workspace_gpu, input_size); + // /* float *input_cpu = (float *)calloc(input_size, sizeof(float)); status = cudaMemcpy(input_cpu, state.input, input_size* sizeof(float), cudaMemcpyDeviceToHost); check_error(status); - convolve_bin_cpu(input_cpu, l.weights, l.output, l.w, l.h, l.c, l.n, l.size, l.pad); // CPU + // swaped(binary_weights <-> l.weights) + convolve_cpu(input_cpu, l.weights, l.output, l.w, l.h, l.c, l.n, l.size, l.pad); // CPU status = cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyHostToDevice); check_error(status); free(input_cpu); */ - //convolve_bin_gpu(state.input, l.weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad); + /* + float *input_cpu = (float *)calloc(input_size, sizeof(float)); + float *input_bin_cpu = (float *)calloc(input_size, sizeof(char)); + //float *weights_bin_cpu = (float *)calloc(l.n*l.c*l.size*l.size, sizeof(char)); + status = cudaMemcpy(input_cpu, state.input, input_size * sizeof(float), cudaMemcpyDeviceToHost); + check_error(status); + float_to_bit(input_cpu, (unsigned char *)input_bin_cpu, input_size); + //float_to_bit(l.weights, (unsigned char *)weights_bin_cpu, l.n*l.c*l.size*l.size); // l.align_bit_weights + + convolve_bin_cpu(input_bin_cpu, (float *)l.align_bit_weights, l.output, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr); // CPU + status = cudaMemcpy(l.output_gpu, l.output, l.outputs * sizeof(float), cudaMemcpyHostToDevice); + check_error(status); + //free(weights_bin_cpu); + free(input_bin_cpu); + free(input_cpu); + */ + + float_to_bit_gpu(state.input, (unsigned char *)l.align_workspace_gpu, input_size); + convolve_bin_gpu(l.align_workspace_gpu, (float *)l.align_bit_weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad, l.new_lda, l.mean_arr_gpu); + + + //convolve_gpu(state.input, l.weights_gpu, l.output_gpu, l.w, l.h, l.c, l.n, l.size, l.pad); + //cudaDeviceSynchronize(); //check_error(status); @@ -309,10 +332,16 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) int k = l.size*l.size*l.c; int n = l.out_w*l.out_h; for(i = 0; i < l.batch; ++i){ - im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + float *im = state.input + i*l.c*l.h*l.w; float * a = l.weights_gpu; float * b = state.workspace; float * c = l.output_gpu; + if (l.size == 1) { + b = im; + } + else { + im2col_ongpu(im, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace); + } gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n); } #endif diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 452f792e..69af24fe 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -605,6 +605,7 @@ void binary_align_weights(convolutional_layer *l) int m = l->n; int k = l->size*l->size*l->c; size_t new_lda = k + (l->lda_align - k % l->lda_align); // (k / 8 + 1) * 8; + l->new_lda = new_lda; binarize_weights(l->weights, m, k, l->binary_weights); diff --git a/src/im2col.h b/src/im2col.h index b518e803..74211e25 100644 --- a/src/im2col.h +++ b/src/im2col.h @@ -29,9 +29,15 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K, unsigned char *B, int ldb, float *C, int ldc, float *mean_arr); -void convolve_bin_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad); +void convolve_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad); -void convolve_bin_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad); +void convolve_bin_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad, + int new_lda, float *mean_arr_gpu); + +void convolve_bin_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, + int size, int pad, int new_lda, float *mean_arr_gpu); + +void convolve_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad); #endif #endif diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu index f407e976..1d719ee8 100644 --- a/src/im2col_kernels.cu +++ b/src/im2col_kernels.cu @@ -258,7 +258,17 @@ void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size) { // -------------------------------- typedef unsigned long long int uint64_t; +typedef unsigned int uint32_t; typedef unsigned char uint8_t; +typedef char int8_t; + +__device__ __host__ static inline uint64_t broadcast_bit_1_to_64(uint8_t src) { + return (src > 0) ? 0xFFFFFFFFFFFFFFFF : 0; +} + +__device__ __host__ static inline uint8_t xnor_bit1(uint8_t a, uint8_t b) { + return ~(a^b) & 0b1; +} __device__ __host__ static inline uint64_t xnor_int64(uint64_t a, uint64_t b) { return ~(a^b); @@ -462,78 +472,9 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K, C, ldc, mean_arr); } - // -------------------------------- - -__global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad) -{ - int index = blockIdx.x*blockDim.x + threadIdx.x; - - int fil; - // filter index - //for (fil = 0; fil < n; ++fil) - int chan, y, x, f_y, f_x; - // channel index - //for (chan = 0; chan < in_c; ++chan) - // input - y - //for (y = 0; y < in_h; ++y) - // input - x - //for (x = 0; x < in_w; ++x) - x = index % in_w; - int index2 = index / in_w; - y = index2 % in_h; - fil = index2 / in_h; - if (fil < n) - { - - int const output_index = fil*in_w*in_h + y*in_w + x; - float sum = 0; - - for (chan = 0; chan < in_c; ++chan) - { - int const weights_pre_index = fil*in_c*size*size + chan*size*size; - int const input_pre_index = chan*in_w*in_h; - - // filter - y - for (f_y = 0; f_y < size; ++f_y) - { - int input_y = y + f_y - pad; - // filter - x - for (f_x = 0; f_x < size; ++f_x) - { - int input_x = x + f_x - pad; - if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue; - - int input_index = input_pre_index + input_y*in_w + input_x; - int weights_index = weights_pre_index + f_y*size + f_x; - - sum += input[input_index] *weights[weights_index]; - - } - } - // l.output[filters][width][height] += - // state.input[channels][width][height] * - // l.weights[filters][channels][filter_width][filter_height]; - //output[output_index] += sum; - } - output[output_index] = sum; - } - -} - -void convolve_bin_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad) -{ - size_t array_size = in_w*in_h*n; // width X height X filters - const int num_blocks = array_size / BLOCK + 1; - //printf("\n array_size = %d, num_blocks = %d, w = %d, h = %d, n = %d, c = %d, pad = %d \n", array_size, num_blocks, in_w, in_h, n, in_c, pad); - - convolve_bin_gpu_kernel << > > (input, weights, output, in_w, in_h, in_c, n, size, pad); -} - - - -void convolve_bin_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad) +void convolve_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad) { int fil; // filter index @@ -576,4 +517,325 @@ void convolve_bin_cpu(float *input, float *weights, float *output, int in_w, int } -} \ No newline at end of file +} +// -------------------------------- + + +void convolve_bin_cpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, + int size, int pad, int new_lda, float *mean_arr_gpu) +{ + int fil; + // filter index +#pragma omp parallel for // "omp parallel for" - automatic parallelization of loop by using OpenMP + for (fil = 0; fil < n; ++fil) { + float mean_val = mean_arr_gpu[fil]; + int chan, y, x, f_y, f_x; + // channel index + for (chan = 0; chan < in_c; ++chan) + // input - y + for (y = 0; y < in_h; ++y) + // input - x + for (x = 0; x < in_w; ++x) + { + int const output_index = fil*in_w*in_h + y*in_w + x; + int const weights_pre_index = fil*in_c*size*size + chan*size*size; + int const input_pre_index = chan*in_w*in_h; + int sum = 0; + int good_val = 0; + + // filter - y + for (f_y = 0; f_y < size; ++f_y) + { + int input_y = y + f_y - pad; + // filter - x + for (f_x = 0; f_x < size; ++f_x) + { + int input_x = x + f_x - pad; + if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue; + + int input_index = input_pre_index + input_y*in_w + input_x; + //int weights_index = weights_pre_index + f_y*size + f_x; + //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x; + int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x; + + //sum += input[input_index] * weights[weights_index]; + + int8_t in_bit = get_bit((uint8_t *)input, input_index); + int8_t w_bit = get_bit((uint8_t *)weights, weights_index); + int res = xnor_bit1(in_bit, w_bit); + sum += res; + good_val++; + //sum += (res > 0) ? 1 : -1; + //in_bit = (in_bit > 0) ? 1 : -1; + //w_bit = (w_bit > 0) ? 1 : -1; + //int8_t res = in_bit*w_bit; + //sum += res; + //printf("\n i: %d x w: %d = res: %d \t sum: %d \t mean = %f \n", in_bit, w_bit, res, sum, mean_val); + } + } + //printf("sum = %d, ", sum); + sum = sum - (good_val - sum); + //printf(" size = %d, sum = %d \n", size, sum); + + // l.output[filters][width][height] += + // state.input[channels][width][height] * + // l.weights[filters][channels][filter_width][filter_height]; + output[output_index] += sum*mean_val; + } + } +} +// -------------------------------- + +__global__ void convolve_gpu_kernel(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad) +{ + int index = blockIdx.x*blockDim.x + threadIdx.x; + + int fil; + // filter index + //for (fil = 0; fil < n; ++fil) + int chan, y, x, f_y, f_x; + // channel index + //for (chan = 0; chan < in_c; ++chan) + // input - y + //for (y = 0; y < in_h; ++y) + // input - x + //for (x = 0; x < in_w; ++x) + x = index % in_w; + int index2 = index / in_w; + y = index2 % in_h; + fil = index2 / in_h; + if (fil < n) + { + + int const output_index = fil*in_w*in_h + y*in_w + x; + float sum = 0; + + for (chan = 0; chan < in_c; ++chan) + { + int const weights_pre_index = fil*in_c*size*size + chan*size*size; + int const input_pre_index = chan*in_w*in_h; + + // filter - y + for (f_y = 0; f_y < size; ++f_y) + { + int input_y = y + f_y - pad; + // filter - x + for (f_x = 0; f_x < size; ++f_x) + { + int input_x = x + f_x - pad; + if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue; + + int input_index = input_pre_index + input_y*in_w + input_x; + int weights_index = weights_pre_index + f_y*size + f_x; + + sum += input[input_index] * weights[weights_index]; + + } + } + // l.output[filters][width][height] += + // state.input[channels][width][height] * + // l.weights[filters][channels][filter_width][filter_height]; + //output[output_index] += sum; + } + output[output_index] = sum; + } + +} + +void convolve_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, int size, int pad) +{ + size_t array_size = in_w*in_h*n; // width X height X filters + const int num_blocks = array_size / BLOCK + 1; + //printf("\n array_size = %d, num_blocks = %d, w = %d, h = %d, n = %d, c = %d, pad = %d \n", array_size, num_blocks, in_w, in_h, n, in_c, pad); + + convolve_gpu_kernel << > > (input, weights, output, in_w, in_h, in_c, n, size, pad); +} + +// -------------------------------- + +/* +__global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, + int size, int pad, int new_lda, float *mean_arr_gpu) +{ + int index = blockIdx.x*blockDim.x + threadIdx.x; + + int fil; + // filter index + //for (fil = 0; fil < n; ++fil) + int chan, y, x, f_y, f_x; + // channel index + //for (chan = 0; chan < in_c; ++chan) + // input - y + //for (y = 0; y < in_h; ++y) + // input - x + //for (x = 0; x < in_w; ++x) + x = index % in_w; + int index2 = index / in_w; + y = index2 % in_h; + fil = index2 / in_h; + if (fil < n) // (1-6 for one BLOCK) + { + //float mean_val = mean_arr_gpu[fil]; + int const output_index = fil*in_w*in_h + y*in_w + x; + int sum = 0; + int good_val = 0; + + for (chan = 0; chan < in_c; ++chan) + { + //int const weights_pre_index = fil*in_c*size*size + chan*size*size; + int const weights_pre_index = fil*new_lda + chan*size*size; + int const input_pre_index = chan*in_w*in_h; + + // filter - y + for (f_y = 0; f_y < size; ++f_y) + { + int input_y = y + f_y - pad; + // filter - x + for (f_x = 0; f_x < size; ++f_x) + { + int input_x = x + f_x - pad; + if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue; + + int input_index = input_pre_index + input_y*in_w + input_x; + int weights_index = weights_pre_index + f_y*size + f_x; + //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x; + //int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x; + + uint8_t in_bit = get_bit((uint8_t *)input, input_index); + uint8_t w_bit = get_bit((uint8_t *)weights, weights_index); + int res = xnor_bit1(in_bit, w_bit); + sum += res; + good_val++; + + //sum += input[input_index] *weights[weights_index]; + + } + } + // l.output[filters][width][height] += + // state.input[channels][width][height] * + // l.weights[filters][channels][filter_width][filter_height]; + //output[output_index] += sum; + } + sum = sum - (good_val - sum); + output[output_index] = sum * mean_arr_gpu[fil]; // atoimcAdd for inter-BLOCK sum + } + +} +*/ + +__global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, + int size, int pad, int new_lda, float *mean_arr_gpu) +{ + int index = blockIdx.x*blockDim.x + threadIdx.x; + + int fil; + // filter index + //for (fil = 0; fil < n; ++fil) + int chan, y, x, f_y, f_x; + // channel index + //for (chan = 0; chan < in_c; ++chan) + // input - y + //for (y = 0; y < in_h; ++y) + // input - x + //for (x = 0; x < in_w; ++x) + x = index % in_w; + int index2 = index / in_w; + y = index2 % in_h; + fil = index2 / in_h; + if (fil < n) // (1-6 for one BLOCK) + { + //float mean_val = mean_arr_gpu[fil]; + int const output_index = fil*in_w*in_h + y*in_w + x; + int sum = 0; + int good_val = 0; + + int min_index = blockIdx.x*blockDim.x; + int min_fil = (min_index / in_w) / in_h; + int max_index = (blockIdx.x+1)*blockDim.x - 1; + int max_fil = (max_index / in_w) / in_h; + + __shared__ uint32_t weights_shared[3*3*1024*6/32 + 1]; // 7 KB (6 filters) - use (new_lda) for size calculation + //const int weights_size = size*size*in_c/8; + const int weights_size = size*size*in_c / 32 + 1; + + for (int fil = min_fil; fil <= max_fil; fil++) { + for (int s = threadIdx.x; s < weights_size; s += blockDim.x) { + //weights_shared[s + (fil - min_fil)*new_lda / 8] = ((uint8_t *)weights)[fil*new_lda / 8 + s]; + weights_shared[s + (fil - min_fil)*new_lda/32] = ((uint32_t *)weights)[fil*new_lda / 32 + s]; + } + } + __syncthreads(); + + for (chan = 0; chan < in_c; ++chan) + { + //int const weights_pre_index = fil*in_c*size*size + chan*size*size; + int const weights_pre_index = fil*new_lda + chan*size*size; + int const input_pre_index = chan*in_w*in_h; + + __shared__ uint32_t input_shared[416*416/32]; // 21.2 KB bytes + const int input_shared_size = in_w*in_h / 32; + const int add_input_index = input_pre_index % 32; + + for (int s = threadIdx.x; s < input_shared_size; s += blockDim.x) { + input_shared[s] = ((uint32_t *)weights)[input_pre_index / 32 + s]; + } + __syncthreads(); + + // filter - y + for (f_y = 0; f_y < size; ++f_y) + { + int input_y = y + f_y - pad; + // filter - x + for (f_x = 0; f_x < size; ++f_x) + { + int input_x = x + f_x - pad; + if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue; + + int input_index = input_pre_index + input_y*in_w + input_x; + int weights_index = weights_pre_index + f_y*size + f_x; + //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x; + //int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x; + + uint8_t in_bit = get_bit((uint8_t *)input, input_index); + //uint8_t w_bit = get_bit((uint8_t *)weights, weights_index); + + //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x; + int weights_shared_index = (fil - min_fil)*new_lda + chan*size*size + f_y*size + f_x; + //uint8_t in_bit = get_bit((uint8_t *)weights_shared, weights_shared_index); + uint8_t w_bit = get_bit((uint8_t *)weights_shared, weights_shared_index); + + //int input_index = input_pre_index + input_y*in_w + input_x; + //int input_shared_index = /*input_pre_index +*/ input_y*in_w + input_x + add_input_index; + //uint8_t in_bit = get_bit((uint8_t *)input_shared, input_shared_index); + + int res = xnor_bit1(in_bit, w_bit); + sum += res; + good_val++; + + //sum += input[input_index] *weights[weights_index]; + + } + } + // l.output[filters][width][height] += + // state.input[channels][width][height] * + // l.weights[filters][channels][filter_width][filter_height]; + //output[output_index] += sum; + } + sum = sum - (good_val - sum); + output[output_index] = sum * mean_arr_gpu[fil]; // atoimcAdd for inter-BLOCK sum + } + +} + +void convolve_bin_gpu(float *input, float *weights, float *output, int in_w, int in_h, int in_c, int n, + int size, int pad, int new_lda, float *mean_arr_gpu) +{ + size_t array_size = in_w*in_h*n; // width X height X filters + const int num_blocks = array_size / BLOCK + 1; + //printf("\n array_size = %d, num_blocks = %d, w = %d, h = %d, n = %d, c = %d, pad = %d \n", array_size, num_blocks, in_w, in_h, n, in_c, pad); + + convolve_bin_gpu_kernel << > > (input, weights, output, in_w, in_h, in_c, n, size, pad, new_lda, mean_arr_gpu); +} + +// -------------------------------- + diff --git a/src/layer.h b/src/layer.h index 9ec35f1f..be725d9b 100644 --- a/src/layer.h +++ b/src/layer.h @@ -189,6 +189,7 @@ struct layer{ float *mean_arr; int align_bit_weights_size; int lda_align; + int new_lda; int bit_align; float *col_image;