diff --git a/build/darknet/x64/cfg/tiny-yolo_xnor.cfg b/build/darknet/x64/cfg/tiny-yolo_xnor.cfg index c39aabed..42a96309 100644 --- a/build/darknet/x64/cfg/tiny-yolo_xnor.cfg +++ b/build/darknet/x64/cfg/tiny-yolo_xnor.cfg @@ -32,6 +32,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=32 size=3 @@ -45,6 +46,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=64 size=3 @@ -58,6 +60,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=128 size=3 @@ -71,6 +74,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=256 size=3 @@ -84,6 +88,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=512 size=3 @@ -97,6 +102,7 @@ stride=1 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=1024 size=3 diff --git a/build/darknet/x64/cfg/yolov3-tiny_xnor.cfg b/build/darknet/x64/cfg/yolov3-tiny_xnor.cfg index 8427beed..725ce079 100644 --- a/build/darknet/x64/cfg/yolov3-tiny_xnor.cfg +++ b/build/darknet/x64/cfg/yolov3-tiny_xnor.cfg @@ -36,6 +36,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=32 size=3 @@ -49,6 +50,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=64 size=3 @@ -62,6 +64,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=128 size=3 @@ -88,6 +91,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=512 size=3 @@ -101,6 +105,7 @@ stride=1 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=1024 size=3 @@ -173,6 +178,7 @@ stride=1 pad=1 activation=leaky + [convolutional] size=1 stride=1 diff --git a/cfg/tiny-yolo_xnor.cfg b/cfg/tiny-yolo_xnor.cfg index c39aabed..42a96309 100644 --- a/cfg/tiny-yolo_xnor.cfg +++ b/cfg/tiny-yolo_xnor.cfg @@ -32,6 +32,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=32 size=3 @@ -45,6 +46,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=64 size=3 @@ -58,6 +60,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=128 size=3 @@ -71,6 +74,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=256 size=3 @@ -84,6 +88,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=512 size=3 @@ -97,6 +102,7 @@ stride=1 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=1024 size=3 diff --git a/cfg/yolov3-tiny_xnor.cfg b/cfg/yolov3-tiny_xnor.cfg index 8427beed..725ce079 100644 --- a/cfg/yolov3-tiny_xnor.cfg +++ b/cfg/yolov3-tiny_xnor.cfg @@ -36,6 +36,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=32 size=3 @@ -49,6 +50,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=64 size=3 @@ -62,6 +64,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=128 size=3 @@ -88,6 +91,7 @@ stride=2 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=512 size=3 @@ -101,6 +105,7 @@ stride=1 [convolutional] xnor=1 +bin_output=1 batch_normalize=1 filters=1024 size=3 @@ -173,6 +178,7 @@ stride=1 pad=1 activation=leaky + [convolutional] size=1 stride=1 diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 40a63f5b..809557af 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -119,18 +119,15 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) if(l.xnor){ if (!l.align_bit_weights_gpu || state.train) { binarize_weights_gpu(l.weights_gpu, l.n, 
l.c*l.size*l.size, l.binary_weights_gpu); - - swap_binary(&l); - binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu); - state.input = l.binary_input_gpu; } //swap_binary(&l); //binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu); //state.input = l.binary_input_gpu; //cudaDeviceSynchronize(); - if (l.align_bit_weights_gpu && !state.train) + if (l.align_bit_weights_gpu && !state.train && l.c >= 256 && l.size > 1) { + //return; cudaError_t status = cudaSuccess; int input_size = l.c*l.h*l.w*l.batch; @@ -146,15 +143,25 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //if(0) { + //cudaDeviceSynchronize(); + int i = 0; - im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align); - //cudaDeviceSynchronize(); + if (l.stride == 1 && l.c >= 256 && l.w > 13 && l.size > 1 && 0) // disabled + { + // stride=1 only + im2col_align_bin_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align); + //cudaDeviceSynchronize(); + } + else + { + im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, l.align_workspace_gpu, l.bit_align); + //cudaDeviceSynchronize(); + //getchar(); - // should be optimized - float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size); - //cudaDeviceSynchronize(); - - //im2col_align_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace, l.bit_align); + // should be optimized + float_to_bit_gpu(l.align_workspace_gpu, (unsigned char *)state.workspace, l.align_workspace_size); + //cudaDeviceSynchronize(); + } transpose_bin_gpu((unsigned char *)state.workspace, (unsigned char *)l.transposed_align_workspace_gpu, k, n, l.bit_align, new_ldb, 8); //cudaDeviceSynchronize(); @@ -197,7 +204,13 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) } } - fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); + if (l.xnor) { + swap_binary(&l); + binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu); + state.input = l.binary_input_gpu; + } + + //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); #ifdef CUDNN float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT @@ -294,7 +307,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) #else cudnnConvolutionForward(cudnn_handle(), - &one, + &alpha, //&one, l.srcTensorDesc, state.input, l.weightDesc, @@ -303,9 +316,11 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) l.fw_algo, state.workspace, l.workspace_size, - &one, + &beta, //&one, l.dstTensorDesc, l.output_gpu); + + //cudaDeviceSynchronize(); #endif // CUDNN_HALF @@ -338,7 +353,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) } #endif // no CUDNN_HALF - activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); + if (l.activation != LINEAR) activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); //if(l.dot > 0) dot_error_gpu(l); if(l.binary || l.xnor) swap_binary(&l); //cudaDeviceSynchronize(); // for correct profiling of performance diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 69af24fe..d4884f0c 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -257,7 +257,7 @@ void cudnn_convolutional_setup(layer *l, int cudnn_preference) #endif #endif -convolutional_layer make_convolutional_layer(int batch, int 
h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam) +convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output) { int i; convolutional_layer l = {0}; @@ -269,6 +269,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.n = n; l.binary = binary; l.xnor = xnor; + l.use_bin_output = use_bin_output; l.batch = batch; l.stride = stride; l.size = size; @@ -307,7 +308,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.binary_weights = calloc(c*n*size*size, sizeof(float)); l.binary_input = calloc(l.inputs*l.batch, sizeof(float)); - int align = 8; + int align = 32;// 8; int src_align = l.out_h*l.out_w; l.bit_align = src_align + (align - src_align % align); } @@ -404,8 +405,9 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int //fprintf(stderr, "conv %5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c); l.bflops = (2.0 * l.n * l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.; - if (l.xnor) fprintf(stderr, "convX "); - else fprintf(stderr, "conv "); + if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB"); + else if (l.xnor) fprintf(stderr, "convX "); + else fprintf(stderr, "conv "); fprintf(stderr, "%5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); return l; @@ -428,7 +430,7 @@ void denormalize_convolutional_layer(convolutional_layer l) void test_convolutional_layer() { - convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0); + convolutional_layer l = make_convolutional_layer(1, 5, 5, 3, 2, 5, 2, 1, LEAKY, 1, 0, 0, 0, 0); l.batch_normalize = 1; float data[] = {1,1,1,1,1, 1,1,1,1,1, diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 8869a3c3..20e9d68e 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -25,7 +25,7 @@ void cuda_convert_f32_to_f16(float* input_f32, size_t size, float *output_f16); #endif #endif -convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam); +convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int padding, ACTIVATION activation, int batch_normalize, int binary, int xnor, int adam, int use_bin_output); void denormalize_convolutional_layer(convolutional_layer l); void resize_convolutional_layer(convolutional_layer *layer, int w, int h); void forward_convolutional_layer(const convolutional_layer layer, network_state state); diff --git a/src/crnn_layer.c b/src/crnn_layer.c index 54958803..f78e0c9d 100644 --- a/src/crnn_layer.c +++ b/src/crnn_layer.c @@ -48,17 +48,17 @@ layer make_crnn_layer(int batch, int h, int w, int c, int hidden_filters, int ou l.input_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0); + *(l.input_layer) = make_convolutional_layer(batch*steps, h, w, c, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0); l.input_layer->batch = batch; l.self_layer = 
malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0); + *(l.self_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, hidden_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0); l.self_layer->batch = batch; l.output_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0); + *(l.output_layer) = make_convolutional_layer(batch*steps, h, w, hidden_filters, output_filters, 3, 1, 1, activation, batch_normalize, 0, 0, 0, 0); l.output_layer->batch = batch; l.output = l.output_layer->output; diff --git a/src/im2col.h b/src/im2col.h index 1013d4cd..9a6be9ab 100644 --- a/src/im2col.h +++ b/src/im2col.h @@ -17,6 +17,10 @@ void im2col_align_ongpu(float *im, int channels, int height, int width, int ksize, int stride, int pad, float *data_col, int bit_align); +void im2col_align_bin_ongpu(float *im, + int channels, int height, int width, + int ksize, int stride, int pad, float *data_col, int bit_align); + void float_to_bit_gpu(float *src, unsigned char *dst, size_t size); void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m, diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu index 9e122136..e1ac3fa2 100644 --- a/src/im2col_kernels.cu +++ b/src/im2col_kernels.cu @@ -45,6 +45,7 @@ __global__ void im2col_gpu_kernel(const int n, const float* data_im, *data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? data_im_ptr[i * width + j] : 0; + //data_im[(channel_in * height + h_in) * width + w_in + i * width + j]; //*data_col_ptr = data_im_ptr[ii * width + jj]; data_col_ptr += height_col * width_col; @@ -69,7 +70,7 @@ void im2col_ongpu(float *im, } // -------------------------------- - +/* __global__ void im2col_align_gpu_kernel(const int n, const float* data_im, const int height, const int width, const int ksize, const int pad, @@ -120,6 +121,71 @@ __global__ void im2col_align_gpu_kernel(const int n, const float* data_im, } } } +*/ + +// float 32 +__global__ void im2col_align_gpu_kernel(const int n, const float* data_im, + const int height, const int width, const int ksize, + const int pad, + const int stride, + const int height_col, const int width_col, + float *data_col, const int bit_align) +{ + __shared__ float tmp_s[1]; + +//#define SHRED_VALS ((BLOCK / 169) * ) + __shared__ float dst_s[1024]; + //__shared__ float dst_s[1024]; + //__shared__ uint32_t bit_s[32]; + __shared__ uint8_t bit_s[128]; + + int index = blockIdx.x*blockDim.x + threadIdx.x; + for (; index < n; index += blockDim.x*gridDim.x) { + int w_out = index % width_col; + int h_index = index / width_col; + int h_out = h_index % height_col; + int channel_in = h_index / height_col; + int channel_out = channel_in * ksize * ksize; + int h_in = h_out * stride - pad; + int w_in = w_out * stride - pad; + //float* data_col_ptr = data_col; + //float* data_col_ptr_32 = data_col + (channel_out * bit_align + h_out * width_col + w_out) / 32; + //data_col_ptr += (channel_out * height_col + h_out) * width_col + w_out; + //data_col_ptr += channel_out * bit_align + h_out * width_col + w_out; + float* data_col_ptr = &data_col[channel_out * bit_align + h_out * width_col + w_out]; + const float* data_im_ptr = data_im; + data_im_ptr += (channel_in * height + h_in) * width + w_in; + for (int i = 0; i < ksize; ++i) { + for 
(int j = 0; j < ksize; ++j) { + int h = h_in + i; + int w = w_in + j; + + float val = (h >= 0 && w >= 0 && h < height && w < width) ? + data_im_ptr[i * width + j] : 0; + + int pre_out_index = index % (width_col*height_col); + int out_index = (channel_out + i*ksize + j) * bit_align + pre_out_index;// h_out * width_col + w_out; + data_col[out_index] = val; + + //*data_col_ptr = val; + //dst_s[threadIdx.x] = val; + //tmp_s[0] = val; + + //*data_col_ptr = (h >= 0 && w >= 0 && h < height && w < width) ? + // data_im_ptr[i * width + j] : 0; + + //float src_val = (h >= 0 && w >= 0 && h < height && w < width) ? data_im_ptr[i * width + j] : 0; + //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0); + //if (threadIdx.x % WARP_SIZE == 0) *((unsigned int*)data_col_ptr_32) = bit_mask; + // use atomicOr() // *dst_ptr |= (mask << (col_index % 8)); + //data_col_ptr_32 += bit_align / 32; + + //data_col_ptr += height_col * width_col; + data_col_ptr += bit_align; + } + } + } +} void im2col_align_ongpu(float *im, int channels, int height, int width, @@ -135,6 +201,354 @@ void im2col_align_ongpu(float *im, stride, height_col, width_col, data_col, bit_align); } + + +// -------------------------------- + +/* +// binary im2col +__global__ void im2col_align_bin_gpu_kernel(const int n, const float* data_im, + const int height, const int width, const int ksize, const int channels, + const int pad, + const int stride, + const int height_col, const int width_col, + float *data_col, const int bit_align) +{ + __shared__ float tmp_s[1]; + + //#define SHRED_VALS ((BLOCK / 169) * ) + __shared__ float dst_s[1024]; + //__shared__ float dst_s[1024]; + //__shared__ uint32_t bit_s[32]; + __shared__ uint8_t bit_s[128]; + + int index = blockIdx.x*blockDim.x + threadIdx.x; + for (; index < n; index += blockDim.x*gridDim.x) + { + //int c_index = index; + //int channel_in = c_index % channels; + + int h_out = index % height_col; + int c_index = index / height_col; + int channel_in = c_index % channels; + + int channel_out = channel_in * ksize * ksize; + + int j_index = c_index / channels; + int j = j_index % ksize; + int i = j_index / ksize; + if (i < ksize) + { + for (int w_out = 0; w_out < width_col; ++w_out) + { + int h_in = h_out * stride - pad; + int w_in = w_out * stride - pad; + + int h = h_in + i; + int w = w_in + j; + + float val = (h >= 0 && w >= 0 && h < height && w < width) ? 
+ data_im[(channel_in * height + h_in) * width + w_in + i * width + j] : 0; + + //int pre_out_index = index % (width_col*height_col); + int pre_out_index = h_out * width_col + w_out; + int out_index = (channel_out + i*ksize + j) * bit_align + pre_out_index; + data_col[out_index] = val; + + + }// w_out + } + } +} +*/ + +/* +// binary im2col +__global__ void im2col_align_bin_gpu_kernel(const int n, const float* data_im, + const int height, const int width, const int ksize, const int channels, + const int pad, + const int stride, + const int height_col, const int width_col, + float *data_col, const int bit_align) +{ + __shared__ float tmp_s[1]; + __shared__ ulonglong4 tmp256_s[1]; + + + //#define SHRED_VALS ((BLOCK / 169) * ) + //__shared__ float dst_s[1024]; + //__shared__ float dst_s[1024]; + //__shared__ uint32_t bit_s[32]; + //__shared__ uint8_t bit_s[128]; + + int index = blockIdx.x*blockDim.x + threadIdx.x; + //for (; index < n; index += blockDim.x*gridDim.x) + { + //int c_index = index; + //int channel_in = c_index % channels; + + int h_out = index % height_col; + int c_index = index / height_col; + int channel_in = c_index % channels; + + int channel_out = channel_in * ksize * ksize; + + int j_index = c_index / channels; + int j = j_index % ksize; + int i = j_index / ksize; + + int h_in = h_out * stride - pad; + int h = h_in + i; + + //if (i < ksize) + { + int w_out = 0; + + // the end of padding + //if(0) + for (; w_out < (width_col); w_out += 32) + { + int w = w_out * stride - pad + j; + int pre_in_index = (channel_in * height + h_in) * width + i * width; + int in_index = pre_in_index + w; + //float *src_p = (float *)&data_im[in_index]; + + int pre_out_index = h_out * width_col + w_out; + int out_index = (channel_out + i*ksize + j) * bit_align + pre_out_index; + // float *dst_p = (float *)&data_col[out_index]; + + if (i >= ksize) { + out_index = -1; + } + + #pragma unroll + for (int t = 0; t < WARP_SIZE; ++t) { + const int lane_id = threadIdx.x % WARP_SIZE; + + //const int64_t cur_pre_in_index = pre_in_index; + //const int64_t cur_j = j; + //const int64_t out_i = out_index;// __shfl(out_index, t) + lane_id; + + const int64_t cur_out_index = __shfl(out_index, t); + if (cur_out_index >= 0) + { + const int64_t cur_pre_in_index = __shfl(pre_in_index, t); + const int64_t cur_j = __shfl(j, t); + const int64_t cur_h = __shfl(h, t); + + int cur_w = ((w_out + lane_id) * stride - pad + cur_j); + int in_index = cur_pre_in_index + cur_w; + + float val = (cur_w >= 0 && cur_w < width && cur_h >= 0 && cur_h < height) ? 
+ data_im[in_index] : float(); + + if ((w_out + lane_id) < width_col) { + data_col[cur_out_index + lane_id] = val; + //tmp_s[0] = val; + + //uint32_t bit_mask = __ballot(val > 0); + //uint8_t *bit8_ptr = &(((uint8_t *)data_col)[cur_out_index / 8]); + //uint32_t *bit32_ptr = (uint32_t *)bit8_ptr; + //*bit32_ptr = bit_mask; + } + } + } + + }// w_out + +#ifdef NOT_USED + if (i < ksize && h >= 0 && h < height) + { + + // wait for align address and the end of padding + for (; w_out < width_col; ++w_out) + { + int w_in = w_out * stride - pad; + int w = w_in + j; + + int in_index = (channel_in * height + h_in) * width + w_in + i * width + j; + float *src_p = (float *)&data_im[in_index]; + + int pre_out_index = h_out * width_col + w_out; + int out_index = (channel_out + i*ksize + j) * bit_align + pre_out_index; + float *dst_p = (float *)&data_col[out_index]; + + if (((uint64_t)src_p % 32 == 0) && ((uint64_t)dst_p % 32 == 0) && w > 0) { + //printf(" aligned addresses and there is no padding \n"); + break; + } + + float val = (w >= 0 && w < width) ? + (*src_p) : float(); + + *dst_p = val; + //tmp_s[0] = val; + }// w_out + + // ulonglong4 (256 bit) / instead of float (32 bit) = 8x times + for (; w_out < (width_col - 8); w_out += 8) + { + int w_in = w_out * stride - pad; + int w = w_in + j; + + ulonglong4 *src_p = (ulonglong4 *)&data_im[(channel_in * height + h_in) * width + w_in + i * width + j]; + + int pre_out_index = h_out * width_col + w_out; + int out_index = (channel_out + i*ksize + j) * bit_align + pre_out_index; + ulonglong4 *dst_p = (ulonglong4 *)&data_col[out_index]; + + ulonglong4 val = (w < width) ? + (*src_p) : ulonglong4(); + + *dst_p = val; + //tmp256_s[0] = val; + }// w_out + + for (; w_out < width_col; ++w_out) + { + //int h_in = h_out * stride - pad; + int w_in = w_out * stride - pad; + + //int h = h_in + i; + int w = w_in + j; + + float val = (w < width) ? 
+ data_im[(channel_in * height + h_in) * width + w_in + i * width + j] : 0; + + int pre_out_index = h_out * width_col + w_out; + int out_index = (channel_out + i*ksize + j) * bit_align + pre_out_index; + data_col[out_index] = val; + //tmp_s[0] = val; + }// w_out + } +#endif // NOT_USED + } + } +} +*/ + + +// binary im2col - stride=1 +__global__ void im2col_align_bin_gpu_kernel(const int n, const float* data_im, + const int height, const int width, const int ksize, const int channels, + const int pad, + const int stride, + const int height_col, const int width_col, + float *data_col, const int bit_align) +{ + __shared__ float tmp_s[1]; + __shared__ ulonglong4 tmp256_s[1]; + + + //#define SHRED_VALS ((BLOCK / 169) * ) + //__shared__ float dst_s[1024]; + //__shared__ float dst_s[1024]; + //__shared__ uint32_t bit_s[32]; + //__shared__ uint8_t bit_s[128]; + + int index = blockIdx.x*blockDim.x + threadIdx.x; + //for (; index < n; index += blockDim.x*gridDim.x) + { + int c_index = index; + int channel_in = c_index % channels; + + //int h_out = index % height_col; + //int c_index = index / height_col; + //int channel_in = c_index % channels; + + int channel_out = channel_in * ksize * ksize; + + int j_index = c_index / channels; + int j = j_index % ksize; + int i = j_index / ksize; + + int pre_out_index = (channel_out + i*ksize + j) * bit_align; + int j_pad = (j - pad); + int i_pad = (i - pad); + + for(int wh_index = 0; wh_index < (height_col*width_col); wh_index += 32) + //for (int h_out = 0; h_out < height_col; ++h_out) + { + + // the end of padding + //if(0) + //for (int w_out = 0; w_out < (width_col); w_out += 32) + { + const int w_out = wh_index % width_col; + const int h_out = wh_index / width_col; + + const int w = w_out + j_pad; + const int h = h_out + i_pad; + + int pre_in_index = channel_in * height * width; + int pre_in_wh_index = h * width + w; + + int send_wh_index = wh_index; + if (i >= ksize) send_wh_index = height_col*width_col; + + #pragma unroll + for (int t = 0; t < WARP_SIZE; ++t) + { + const int lane_id = threadIdx.x % WARP_SIZE; + + const int cur_wh_index = __shfl(send_wh_index, t) + lane_id; + + if (cur_wh_index < (width_col*height_col))// && (cur_i_pad+pad) < ksize) + { + const int cur_pre_out_index = __shfl(pre_out_index, t); + + const int cur_pre_in_index = __shfl(pre_in_index, t); + const int cur_pre_in_wh_index = __shfl(pre_in_wh_index, t) + lane_id; + + int w = cur_pre_in_wh_index % width; + int h = cur_pre_in_wh_index / width; + int in_index = cur_pre_in_index + cur_pre_in_wh_index; + + int out_index = cur_pre_out_index + cur_wh_index; + + float val = (w >= 0 && w < width && h >= 0 && h < height) ? + data_im[in_index] : float(); + + //data_col[out_index] = val; + //tmp_s[0] = val; + + uint32_t bit_mask = __ballot(val > 0); + if (lane_id == 0) { + uint8_t *bit8_ptr = &(((uint8_t *)data_col)[out_index / 8]); + uint32_t *bit32_ptr = (uint32_t *)bit8_ptr; + *bit32_ptr = bit_mask; + } + } + + + } + + }// w_out + + } + } +} + + +void im2col_align_bin_ongpu(float *im, + int channels, int height, int width, + int ksize, int stride, int pad, float *data_col, int bit_align) { + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. 
+ int height_col = (height + 2 * pad - ksize) / stride + 1; + int width_col = (width + 2 * pad - ksize) / stride + 1; + //int num_kernels = channels * height_col * width_col * ksize * ksize; + //int num_kernels = channels * ksize * ksize * height_col; + int num_kernels = channels * ksize * ksize; + int num_blocks = num_kernels / BLOCK + 1; + + //im2col_align_bin_gpu_kernel << <(num_kernels + BLOCK - 1) / BLOCK, + im2col_align_bin_gpu_kernel << <num_blocks, BLOCK, 0, 0 >> >( + num_kernels, im, height, width, ksize, channels, pad, + stride, height_col, + width_col, data_col, bit_align); +} // -------------------------------- @@ -560,7 +974,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int int count = 0; k = 0; -//#ifdef NON_USED +//#ifdef NOT_USED // 32 thread X 64 bit = 2048 bit for (; k < (K - 2048); k += 2048) { // l.size*l.size*l.c - one filter size [27 - 9216] uint64_t c_bit64; @@ -591,7 +1005,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int } //#endif -//#ifdef NON_USED +//#ifdef NOT_USED // 32 thread X 32 bit = 1024 bit for (; k < (K - 1024); k += 1024) { // l.size*l.size*l.c - one filter size [27 - 9216] @@ -626,7 +1040,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int float mean_val = mean_arr[i]; float bias_val = bias_arr[i]; -//#ifdef NON_USED +//#ifdef NOT_USED for (; k < K; k += 256) { // l.size*l.size*l.c - one filter size [27 - 144 - 9216] //ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8)); // weights ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + (local_i*lda + k) / 8)); // weights @@ -638,7 +1052,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int } //#endif -#ifdef NON_USED +#ifdef NOT_USED for (; k < K; k += 64) { // l.size*l.size*l.c - one filter size [27 - 9216] //uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8)); // weights uint64_t a_bit64 = *((uint64_t *)(A_s + (local_i*lda + k) / 8)); // weights @@ -697,7 +1111,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int int count = 0; k = 0; -//#ifdef NON_USED +//#ifdef NOT_USED // 32 thread X 64 bit = 2048 bit for (; k < (K - 2048); k += 2048) { // l.size*l.size*l.c - one filter size [27 - 9216] uint64_t c_bit64; @@ -728,7 +1142,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int } //#endif -//#ifdef NON_USED +//#ifdef NOT_USED // 32 thread X 32 bit = 1024 bit for (; k < (K - 1024); k += 1024) { // l.size*l.size*l.c - one filter size [27 - 9216] @@ -763,7 +1177,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int float mean_val = mean_arr[i]; float bias_val = bias_arr[i]; -//#ifdef NON_USED +//#ifdef NOT_USED for (; k < K; k += 256) { // l.size*l.size*l.c - one filter size [27 - 144 - 9216] ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8)); // weights //ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8)); // input @@ -775,7 +1189,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int } //#endif -#ifdef NON_USED +#ifdef NOT_USED for (; k < K; k += 64) { // l.size*l.size*l.c - one filter size [27 - 9216] uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8)); // weights //uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8)); // input diff --git a/src/network.c b/src/network.c index 0666ff1e..08bb34e0 100644 --- a/src/network.c +++ b/src/network.c @@ -793,6 +793,7 @@ void free_network(network net) #ifdef GPU if (gpu_index >= 0)
cuda_free(net.workspace); else free(net.workspace); + if (net.input_state_gpu) cuda_free(net.input_state_gpu); if (*net.input_gpu) cuda_free(*net.input_gpu); if (*net.truth_gpu) cuda_free(*net.truth_gpu); if (net.input_gpu) free(net.input_gpu); @@ -866,7 +867,7 @@ void calculate_binary_weights(network net) binary_align_weights(l); - if(net.layers[j + 1].use_bin_output) { + if(net.layers[j].use_bin_output) { l->activation = LINEAR; } } diff --git a/src/network.h b/src/network.h index 3b07dbb3..18dc5953 100644 --- a/src/network.h +++ b/src/network.h @@ -64,6 +64,8 @@ typedef struct network{ tree *hierarchy; #ifdef GPU + float *input_state_gpu; + float **input_gpu; float **truth_gpu; float **input16_gpu; diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 0ecba682..dee781b9 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -51,7 +51,7 @@ void forward_network_gpu(network net, network_state state) for(i = 0; i < net.n; ++i){ state.index = i; layer l = net.layers[i]; - if(l.delta_gpu){ + if(l.delta_gpu && state.train){ fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1); } l.forward_gpu(l, state); @@ -428,13 +428,15 @@ float *network_predict_gpu(network net, float *input) network_state state; state.index = 0; state.net = net; - state.input = cuda_make_array(input, size); + //state.input = cuda_make_array(input, size); // memory will be allocated in the parse_network_cfg_custom() + state.input = net.input_state_gpu; + cuda_push_array(state.input, input, size); state.truth = 0; state.train = 0; state.delta = 0; forward_network_gpu(net, state); float *out = get_network_output_gpu(net); - cuda_free(state.input); + //cuda_free(state.input); // will be freed in the free_network() return out; } diff --git a/src/parser.c b/src/parser.c index 2fe9bf3e..95589216 100644 --- a/src/parser.c +++ b/src/parser.c @@ -163,11 +163,12 @@ convolutional_layer parse_convolutional(list *options, size_params params) int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); int binary = option_find_int_quiet(options, "binary", 0); int xnor = option_find_int_quiet(options, "xnor", 0); + int use_bin_output = option_find_int_quiet(options, "bin_output", 0); - convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam); + convolutional_layer layer = make_convolutional_layer(batch,h,w,c,n,size,stride,padding,activation, batch_normalize, binary, xnor, params.net.adam, use_bin_output); layer.flipped = option_find_int_quiet(options, "flipped", 0); layer.dot = option_find_float_quiet(options, "dot", 0); - layer.use_bin_output = option_find_int_quiet(options, "bin_output", 0); + if(params.net.adam){ layer.B1 = params.net.B1; layer.B2 = params.net.B2; @@ -819,6 +820,8 @@ network parse_network_cfg_custom(char *filename, int batch) #ifdef GPU if(gpu_index >= 0){ net.workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1); + int size = get_network_input_size(net) * net.batch; + net.input_state_gpu = cuda_make_array(0, size); }else { net.workspace = calloc(1, workspace_size); } diff --git a/src/region_layer.c b/src/region_layer.c index ada5f8e0..ca0a704f 100644 --- a/src/region_layer.c +++ b/src/region_layer.c @@ -368,9 +368,11 @@ void backward_region_layer(const region_layer l, network_state state) void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map) { - int i,j,n; - float *predictions = l.output; + int i; + float *const 
predictions = l.output; + #pragma omp parallel for for (i = 0; i < l.w*l.h; ++i){ + int j, n; int row = i / l.w; int col = i % l.w; for(n = 0; n < l.n; ++n){
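
How the pieces above fit together: a [convolutional] section with xnor=1 and the new bin_output=1 flag is parsed into use_bin_output, printed as convXB, and calculate_binary_weights() switches such a layer to activation=LINEAR, so forward_convolutional_layer_gpu() skips activate_array_ongpu() for it (the new l.activation != LINEAR check). The bit packing of activations is done in the new im2col_align_bin_gpu_kernel (still gated off by the "&& 0 // disabled" condition in the forward pass): a warp ballot turns 32 float comparisons into one 32-bit mask that lane 0 writes out. The stand-alone sketch below illustrates only that packing step; the file name, the pack_bits_ballot kernel and the host harness are illustrative and not part of this patch, and it uses __ballot_sync() where the kernel above uses the older __ballot().

// pack_ballot_demo.cu - illustrative sketch only, not part of this patch.
// Each warp packs 32 floats into a single 32-bit mask (1 bit per value),
// the same idea im2col_align_bin_gpu_kernel uses via __ballot().
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>

__global__ void pack_bits_ballot(const float *src, uint32_t *dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    float val = (i < n) ? src[i] : 0.f;                  // out-of-range lanes contribute 0
    uint32_t mask = __ballot_sync(0xffffffff, val > 0);  // one bit per lane: val > 0
    if ((threadIdx.x & 31) == 0 && i < n)                // lane 0 stores the packed word
        dst[i / 32] = mask;
}

int main(void)
{
    const int n = 64;                                    // two warps -> two packed words
    float h_src[64];
    for (int i = 0; i < n; ++i) h_src[i] = (i % 3) ? 1.f : -1.f;

    float *d_src; uint32_t *d_dst;
    cudaMalloc(&d_src, n * sizeof(float));
    cudaMalloc(&d_dst, (n / 32) * sizeof(uint32_t));
    cudaMemcpy(d_src, h_src, n * sizeof(float), cudaMemcpyHostToDevice);

    pack_bits_ballot<<<1, n>>>(d_src, d_dst, n);

    uint32_t h_dst[2];
    cudaMemcpy(h_dst, d_dst, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
    printf("packed: 0x%08x 0x%08x\n", (unsigned)h_dst[0], (unsigned)h_dst[1]);

    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}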