Mirror of https://github.com/pjreddie/darknet.git (synced 2023-08-10 21:13:14 +03:00)
Try to fuse conv_xnor+shortcut -> conv_xnor
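In short: instead of running a separate SHORTCUT layer after an XNOR convolution, the convolution kernel's epilogue also produces the shortcut layer's output. As a rough illustration only (simplified names, not code from this commit), the fused epilogue amounts to:

#include <stddef.h>

/* Sketch of the fused epilogue: C gets the usual convolution output, and when a
 * shortcut has been fused into the layer (non-NULL pointers) the element-wise sum
 * is written straight into the shortcut layer's output buffer. */
static void fused_epilogue(float *C, const float *conv_result, size_t n,
                           const float *shortcut_in, float *shortcut_out)
{
    for (size_t i = 0; i < n; ++i) {
        float dst_val = conv_result[i];
        C[i] = dst_val;                                  /* normal conv output    */
        if (shortcut_out) {                              /* fused path only       */
            shortcut_out[i] = shortcut_in[i] + dst_val;  /* the absorbed SHORTCUT */
        }
    }
}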
@@ -457,6 +457,8 @@ struct layer {
 
     float *binary_input_gpu;
     float *binary_weights_gpu;
+    float *bin_conv_shortcut_in_gpu;
+    float *bin_conv_shortcut_out_gpu;
 
     float * mean_gpu;
     float * variance_gpu;
@@ -300,7 +300,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
             //start_timer();
             gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
                 (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
-                new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY);
+                new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
+                l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
             //cudaDeviceSynchronize();
             //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu");
 
@@ -370,7 +371,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
             //start_timer();
             gemm_nn_custom_bin_mean_transposed_gpu(m, n, k,
                 (unsigned char *)l.align_bit_weights_gpu, new_ldb, (unsigned char *)l.transposed_align_workspace_gpu,
-                new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY);
+                new_ldb, l.output_gpu, n, l.mean_arr_gpu, l.biases_gpu, l.activation == LEAKY,
+                l.bin_conv_shortcut_in_gpu, l.bin_conv_shortcut_out_gpu);
             //cudaDeviceSynchronize();
             //stop_timer_and_show_name("gemm_nn_custom_bin_mean_transposed_gpu");
             //}
@@ -44,7 +44,8 @@ void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size);
 void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation);
+    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu);
 
 // sequentially - BAD
 void gemm_nn_custom_bin_mean_transposed_sequentially_gpu(int M, int N, int K,
@@ -1247,7 +1247,8 @@ int warpAllReduceSum(int val) {
 __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation)
+    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
 {
     // total 57%
     int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -1411,9 +1412,14 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
                     float bias_val = bias_arr[i + i_d];
                     float dst_val = count *mean_val + bias_val;
                     if (leaky_activation)
-                        dst_val = (dst_val > 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
+                        dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
 
-                    C[(i + i_d)*ldc + (c_x*8 + j + j_d)] = dst_val;
+                    size_t out_index = (i + i_d)*ldc + (c_x * 8 + j + j_d);
+                    C[out_index] = dst_val;
+
+                    if (shortcut_out_gpu) {
+                        shortcut_out_gpu[out_index] = shortcut_in_gpu[out_index] + dst_val;
+                    }
                 }
 
             }
@@ -1574,7 +1580,8 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, int K,
 __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation)
+    float *C, int ldc, float *mean_arr, float *bias_arr, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
 {
     // total 57%
     int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -1628,7 +1635,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
             //ulonglong4 a_bit256 = *((ulonglong4 *)(A + A_i)); // weights
             ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + A_i)); // weights
             ulonglong4 b_bit256 = *((ulonglong4 *)(B + B_i)); // input
-            c_bit256 = xnor_int256(a_bit256, b_bit256);
+            c_bit256 = xor_int256(a_bit256, b_bit256);
             int tmp_count = __popcll(c_bit256.w) + __popcll(c_bit256.x) +
                 __popcll(c_bit256.y) + __popcll(c_bit256.z);
 
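Side note on the xnor_int256/xnor_int64/xnor_int32 -> xor_* swaps in this and the following hunks: over K valid bits, the popcount of XNOR counts matching positions and the popcount of XOR counts mismatches, so the two always sum to K; how `count` is consumed afterwards is outside the lines shown. A standalone check of that relation (plain C with GCC/Clang builtins, not darknet code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const int K = 64;                                 /* number of valid bits           */
    uint64_t a = 0xF0F0F0F0F0F0F0F0ULL;               /* packed binary "weights"        */
    uint64_t b = 0x123456789ABCDEF0ULL;               /* packed binary "inputs"         */

    int matches    = __builtin_popcountll(~(a ^ b));  /* popcount of XNOR               */
    int mismatches = __builtin_popcountll(a ^ b);     /* popcount of XOR                */

    assert(matches + mismatches == K);                /* the two counts are complements */
    printf("matches=%d mismatches=%d K=%d\n", matches, mismatches, K);
    return 0;
}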
@@ -1661,7 +1668,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
             //uint64_t a_bit64 = *((uint64_t *)(A + A_i)); // weights
             uint64_t a_bit64 = *((uint64_t *)(A_s + A_i)); // weights
             uint64_t b_bit64 = *((uint64_t *)(B + B_i)); // input
-            c_bit64 = xnor_int64(a_bit64, b_bit64);
+            c_bit64 = xor_int64(a_bit64, b_bit64);
             int tmp_count = __popcll(c_bit64);
 
             int sum_count = warpAllReduceSum(tmp_count);
@@ -1691,7 +1698,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
             //uint64_t a_bit64 = *((uint64_t *)(A + A_i)); // weights
             uint32_t a_bit32 = *((uint32_t *)(A_s + A_i)); // weights
             uint32_t b_bit32 = *((uint32_t *)(B + B_i)); // input
-            uint32_t c_bit32 = xnor_int32(a_bit32, b_bit32);
+            uint32_t c_bit32 = xor_int32(a_bit32, b_bit32);
             int tmp_count = __popc(c_bit32);
 
             int sum_count = warpAllReduceSum(tmp_count);
@@ -1712,7 +1719,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
             //ulonglong4 a_bit256 = *((ulonglong4 *)(A + (i*lda + k) / 8)); // weights
             ulonglong4 a_bit256 = *((ulonglong4 *)(A_s + (local_i*lda + k) / 8)); // weights
             ulonglong4 b_bit256 = *((ulonglong4 *)(B + (j*ldb + k) / 8)); // input
-            ulonglong4 c_bit256 = xnor_int256(a_bit256, b_bit256);
+            ulonglong4 c_bit256 = xor_int256(a_bit256, b_bit256);
 
             count += __popcll(c_bit256.w) + __popcll(c_bit256.x) +
                 __popcll(c_bit256.y) + __popcll(c_bit256.z);
@@ -1724,7 +1731,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
             //uint64_t a_bit64 = *((uint64_t *)(A + (i*lda + k) / 8)); // weights
             uint64_t a_bit64 = *((uint64_t *)(A_s + (local_i*lda + k) / 8)); // weights
             uint64_t b_bit64 = *((uint64_t *)(B + (j*ldb + k) / 8)); // input
-            uint64_t c_bit64 = xnor_int64(a_bit64, b_bit64);
+            uint64_t c_bit64 = xor_int64(a_bit64, b_bit64);
 
             count += __popcll(c_bit64);
         }
@@ -1735,8 +1742,13 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
         count = count - f1;    // remove extra bits (from empty space for align only)
         float dst_val = (2 * count - K) *mean_val + bias_val;
         if(leaky_activation)
-            dst_val = (dst_val > 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
-        C[i*ldc + j] = dst_val;
+            dst_val = (dst_val >= 0) ? (dst_val) : (0.1f*dst_val); // Leaky activation
+        size_t out_index = i*ldc + j;
+        C[out_index] = dst_val;
+
+        if (shortcut_out_gpu) {
+            shortcut_out_gpu[out_index] = shortcut_in_gpu[out_index] + dst_val;
+        }
         }
     }
 }
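The `(2 * count - K) * mean_val` line above is the usual XNOR-net identity: with weights and inputs encoded as +/-1 and packed into bits (1 for +1, 0 for -1), the dot product equals matches minus mismatches, i.e. 2*popcount(XNOR) - K. A small host-side check of the identity (illustrative C, not the kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const int K = 16;                        /* vector length in bits          */
    uint64_t a = 0x5AF3ULL;                  /* low K bits encode +/-1 weights */
    uint64_t b = 0xC2A5ULL;                  /* low K bits encode +/-1 inputs  */
    uint64_t mask = (1ULL << K) - 1;

    /* reference dot product over the +/-1 values */
    int dot = 0;
    for (int k = 0; k < K; ++k) {
        int av = ((a >> k) & 1) ? 1 : -1;
        int bv = ((b >> k) & 1) ? 1 : -1;
        dot += av * bv;
    }

    /* popcount form used by the kernel: count = number of matching bit positions */
    int count = __builtin_popcountll(~(a ^ b) & mask);
    assert(2 * count - K == dot);
    printf("dot=%d  2*count-K=%d\n", dot, 2 * count - K);
    return 0;
}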
@@ -1762,7 +1774,8 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
 void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
-    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation)
+    float *C, int ldc, float *mean_arr, float *bias, int leaky_activation,
+    float *shortcut_in_gpu, float *shortcut_out_gpu)
 {
     int size = M*N;
     const int num_blocks = get_number_of_blocks(size, BLOCK);
@@ -1794,7 +1807,8 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
             A, lda,
             B, ldb,
             C, ldc,
-            mean_arr, bias, leaky_activation);
+            mean_arr, bias, leaky_activation,
+            shortcut_in_gpu, shortcut_out_gpu);
 
         //cudaDeviceSynchronize();
         //getchar();
@@ -1807,7 +1821,8 @@ void gemm_nn_custom_bin_mean_transposed_gpu(int M, int N, int K,
             A, lda,
             B, ldb,
             C, ldc,
-            mean_arr, bias, leaky_activation);
+            mean_arr, bias, leaky_activation,
+            shortcut_in_gpu, shortcut_out_gpu);
     }
     CHECK_CUDA(cudaPeekAtLastError());
 }
@@ -975,7 +975,7 @@ void fuse_conv_batchnorm(network net)
     }
 }
 
-
+void forward_blank_layer(layer l, network_state state) {}
 
 void calculate_binary_weights(network net)
 {
@@ -996,6 +996,19 @@ void calculate_binary_weights(network net)
                 if (net.layers[j].use_bin_output) {
                     l->activation = LINEAR;
                 }
+
+                // fuse conv_xnor + shortcut -> conv_xnor
+                if ((j + 1) < net.n && net.layers[j].type == CONVOLUTIONAL) {
+                    layer *sc = &net.layers[j + 1];
+                    if (sc->type == SHORTCUT && sc->w == sc->out_w && sc->h == sc->out_h && sc->c == sc->out_c)
+                    {
+                        l->bin_conv_shortcut_in_gpu = net.layers[net.layers[j + 1].index].output_gpu;
+                        l->bin_conv_shortcut_out_gpu = net.layers[j + 1].output_gpu;
+
+                        net.layers[j + 1].type = BLANK;
+                        net.layers[j + 1].forward_gpu = forward_blank_layer;
+                    }
+                }
             }
         }
     }
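Net effect of the wiring above, sketched with simplified structs (illustration only, not the darknet API): the convolution layer keeps pointers into the following SHORTCUT layer's buffers, that layer is demoted to BLANK with a no-op forward, and by the time its turn comes its output buffer has already been filled by the convolution kernel's epilogue.

#include <stddef.h>
#include <stdio.h>

/* Conceptual model of the rewiring (names simplified; not darknet structs). */
typedef struct toy_layer {
    const char *type;
    float *output;            /* this layer's output buffer                     */
    float *fuse_skip_in;      /* = output of the layer the shortcut reads from  */
    float *fuse_out;          /* = output buffer of the absorbed shortcut layer */
} toy_layer;

/* The absorbed SHORTCUT layer becomes a no-op: its buffer is already filled. */
static void forward_blank(toy_layer *l) { (void)l; }

/* XNOR conv forward with the fused epilogue (the convolution itself is omitted). */
static void forward_conv_fused(toy_layer *l, const float *conv_result, size_t n)
{
    for (size_t i = 0; i < n; ++i) {
        l->output[i] = conv_result[i];
        if (l->fuse_out) {
            l->fuse_out[i] = l->fuse_skip_in[i] + conv_result[i];
        }
    }
}

int main(void)
{
    float skip[4] = {1, 1, 1, 1}, conv[4] = {2, 3, 4, 5};
    float conv_out[4], shortcut_out[4];

    toy_layer conv_layer = {"CONVOLUTIONAL", conv_out, skip, shortcut_out};
    toy_layer shortcut_layer = {"BLANK", shortcut_out, NULL, NULL};

    forward_conv_fused(&conv_layer, conv, 4);  /* fills both output buffers */
    forward_blank(&shortcut_layer);            /* nothing left to do        */

    for (int i = 0; i < 4; ++i) printf("%g ", shortcut_out[i]);  /* 3 4 5 6 */
    printf("\n");
    return 0;
}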