Mirror of https://github.com/pjreddie/darknet.git (synced 2023-08-10 21:13:14 +03:00)
Added grouped convolution (depth-wise convolution)
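Background for the hunks below: with grouped convolution the l.c input channels and the l.n filters are split into l.groups equal slices, and each output slice is computed only from its matching input slice (depth-wise convolution is the case where groups equals the number of input channels). The sketch below is illustrative only, not darknet code; the helper name group_offsets is hypothetical, but the offset arithmetic mirrors what the patched forward and backward passes compute for each (batch item, group) pair.

    #include <stddef.h>

    /* Hypothetical helper: per-(batch, group) offsets into the input, weight
     * and output buffers, matching the pointer arithmetic used in this commit. */
    static void group_offsets(int batch_idx, int group_idx,
                              int c, int h, int w,            /* input dims   */
                              int n, int out_h, int out_w,    /* output dims  */
                              int nweights, int groups,
                              size_t *in_off, size_t *w_off, size_t *out_off)
    {
        int c_per_group = c / groups;    /* channels seen by one group     */
        int n_per_group = n / groups;    /* filters produced by one group  */
        *in_off  = (size_t)(batch_idx*groups + group_idx) * c_per_group * h * w;
        *w_off   = (size_t)group_idx * (nweights / groups);
        *out_off = (size_t)(batch_idx*groups + group_idx) * n_per_group * out_h * out_w;
    }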
@@ -166,20 +166,16 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 {
     //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
     if(l.binary){
-        binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
+        binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu);
         swap_binary(&l);
     }

     if(l.xnor){
         if (!l.align_bit_weights_gpu || state.train) {
-            //binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu);
+            //binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu);

-            fast_binarize_weights_gpu(l.weights_gpu, l.n, l.c*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu);
+            fast_binarize_weights_gpu(l.weights_gpu, l.n, (l.c / l.groups)*l.size*l.size, l.binary_weights_gpu, l.mean_arr_gpu);
         }
         //swap_binary(&l);
         //binarize_gpu(state.input, l.c*l.h*l.w*l.batch, l.binary_input_gpu);
         //state.input = l.binary_input_gpu;
         //cudaDeviceSynchronize();

         if (l.align_bit_weights_gpu && !state.train && l.c >= 32)
         {
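The only functional change in this hunk is the per-filter weight count handed to the binarization kernels: with groups, each of the l.n filters holds (l.c / l.groups)*l.size*l.size weights rather than l.c*l.size*l.size. A quick worked example with illustrative values (not taken from the commit):

    /* 256 input channels, 3x3 kernel, groups = 2:
     * per-filter weights drop from 256*3*3 = 2304 to (256/2)*3*3 = 1152 */
    int per_filter = (256 / 2) * 3 * 3;    /* == 1152 */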
@@ -187,11 +183,15 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
             //cudaError_t status = cudaSuccess;
             //int input_size = l.c*l.h*l.w*l.batch;

-            int m = l.n;
-            int k = l.size*l.size*l.c;
+            int m = l.n / l.groups;
+            int k = l.size*l.size*l.c / l.groups;
             int n = l.out_w*l.out_h;
             //float * a = l.weights_gpu;

+            // int i, j;
+            // for(i = 0; i < l.batch; ++i){
+            // for (j = 0; j < l.groups; ++j) {
+
             int ldb_align = l.lda_align;
             size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
             //size_t t_intput_size = new_ldb * n;
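In the XNOR bit-packed path the GEMM dimensions shrink the same way: m (filters handled per GEMM) becomes l.n / l.groups and k (the im2col patch length) becomes l.size*l.size*l.c / l.groups, while n = l.out_w*l.out_h is unchanged. Illustrative numbers, assuming a 3x3 depth-wise layer with c = n = groups = 32:

    int m = 32 / 32;           /* 1 filter per group          */
    int k = 3 * 3 * 32 / 32;   /* 9 weights per patch column  */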
@@ -551,22 +551,25 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 #else
     fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);

-    int i;
-    int m = l.n;
-    int k = l.size*l.size*l.c;
+    int i, j;
+    int m = l.n / l.groups;
+    int k = l.size*l.size*l.c / l.groups;
     int n = l.out_w*l.out_h;
     for(i = 0; i < l.batch; ++i){
-        float *im = state.input + i*l.c*l.h*l.w;
-        float * a = l.weights_gpu;
-        float * b = state.workspace;
-        float * c = l.output_gpu;
-        if (l.size == 1) {
-            b = im;
-        }
-        else {
-            im2col_ongpu(im, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
-        }
-        gemm_ongpu(0,0,m,n,k,1.,a,k,b,n,1.,c+i*m*n,n);
+        for (j = 0; j < l.groups; ++j) {
+            //float *im = state.input + i*l.c*l.h*l.w;
+            float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+            float *a = l.weights_gpu + j*l.nweights / l.groups;
+            float *b = state.workspace;
+            float *c = l.output_gpu + (i*l.groups + j)*n*m;
+            if (l.size == 1) {
+                b = im;
+            }
+            else {
+                im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
+            }
+            gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n);
+        }
     }

     if (l.batch_normalize) {
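The plain (non-cuDNN) forward pass now loops over groups inside the batch loop: each (i, j) pair im2cols only its c/groups input channels and multiplies them by that group's block of l.nweights/l.groups weights. For l.size == 1 the im2col step is skipped and the convolution is literally one small matrix multiply per group. A minimal CPU sketch of that 1x1 case for a single image (standalone illustration, not darknet code; every name is local to the example):

    #include <stddef.h>

    /* Minimal 1x1 grouped convolution for one image, stride 1, no padding:
     * per group, an (n/groups) x (c/groups) weight block multiplies a
     * (c/groups) x (h*w) slice of the input.  Buffers use darknet's CHW layout. */
    static void conv1x1_grouped_cpu(const float *input, const float *weights,
                                    float *output, int c, int h, int w,
                                    int n, int groups)
    {
        int cg = c / groups, ng = n / groups, hw = h * w;
        for (int g = 0; g < groups; ++g) {
            const float *in  = input   + (size_t)g * cg * hw;  /* this group's channels */
            const float *wg  = weights + (size_t)g * ng * cg;  /* this group's filters  */
            float       *out = output  + (size_t)g * ng * hw;  /* this group's outputs  */
            for (int f = 0; f < ng; ++f)
                for (int p = 0; p < hw; ++p) {
                    float sum = 0.f;
                    for (int k = 0; k < cg; ++k)
                        sum += wg[f*cg + k] * in[(size_t)k*hw + p];
                    out[(size_t)f*hw + p] = sum;
                }
        }
    }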
@@ -782,32 +785,38 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
         backward_batchnorm_layer_gpu(l, state);
     }

-    int m = l.n;
-    int n = l.size*l.size*l.c;
+    int m = l.n / l.groups;
+    int n = l.size*l.size*l.c / l.groups;
     int k = l.out_w*l.out_h;

-    int i;
+    int i, j;
     for(i = 0; i < l.batch; ++i){
-        float * a = l.delta_gpu;
-        float * b = state.workspace;
-        float * c = l.weight_updates_gpu;
-
-        im2col_ongpu(state.input + i*l.c*l.h*l.w, l.c, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
-        gemm_ongpu(0,1,m,n,k,1,a + i*m*k,k,b,k,1,c,n);
-
-        if(state.delta){
-            if(l.binary || l.xnor) swap_binary(&l);
-            float * a = l.weights_gpu;
-            float * b = l.delta_gpu;
-            float * c = state.workspace;
-
-            gemm_ongpu(1,0,n,k,m,1,a,n,b + i*k*m,k,0,c,k);
-
-            col2im_ongpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta + i*l.c*l.h*l.w);
-            if(l.binary || l.xnor) {
-                swap_binary(&l);
-            }
-            if(l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w);
-        }
+        for (j = 0; j < l.groups; ++j) {
+            float * a = l.delta_gpu + (i*l.groups + j)*m*k;
+            float * b = state.workspace;
+            float * c = l.weight_updates_gpu + j*l.nweights / l.groups;
+
+            float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+
+            im2col_ongpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, state.workspace);
+            gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n);
+
+            if (state.delta) {
+                if (l.binary || l.xnor) swap_binary(&l);
+                float * a = l.weights_gpu + j*l.nweights / l.groups;
+                float * b = l.delta_gpu + (i*l.groups + j)*m*k;
+                float * c = state.workspace;
+
+                gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k);
+
+                float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+
+                col2im_ongpu(state.workspace, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, delta);
+                if (l.binary || l.xnor) {
+                    swap_binary(&l);
+                }
+                if (l.xnor) gradient_array_ongpu(original_input + i*l.c*l.h*l.w, l.c*l.h*l.w, HARDTAN, state.delta + i*l.c*l.h*l.w);
+            }
+        }
     }
 #endif
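For reference, the shape bookkeeping behind the two per-group GEMMs above (an illustrative summary in darknet's row-major convention), with m = l.n/l.groups, n = l.size*l.size*l.c/l.groups, and k = l.out_w*l.out_h:

    /* weight gradient, gemm_ongpu(0,1,...):
     *     C[m x n] += A[m x k] * B[n x k]^T
     *     A = this group's delta, B = im2col of this group's input slice,
     *     C = this group's block of l.weight_updates_gpu.
     * input gradient (still in column space), gemm_ongpu(1,0,...):
     *     C[n x k] = A[m x n]^T * B[m x k]
     *     A = this group's weights, B = this group's delta;
     *     col2im_ongpu then scatters the n x k columns back into the
     *     c/groups channels of state.delta for this group. */

Each group therefore touches only l.nweights/l.groups weights and c/groups channels of the input gradient.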
@@ -821,43 +830,43 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
     }
 }

-void pull_convolutional_layer(convolutional_layer layer)
+void pull_convolutional_layer(convolutional_layer l)
 {
-    cuda_pull_array_async(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array_async(layer.biases_gpu, layer.biases, layer.n);
-    cuda_pull_array_async(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_pull_array_async(layer.bias_updates_gpu, layer.bias_updates, layer.n);
-    if (layer.batch_normalize){
-        cuda_pull_array_async(layer.scales_gpu, layer.scales, layer.n);
-        cuda_pull_array_async(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
-        cuda_pull_array_async(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
+    cuda_pull_array_async(l.weights_gpu, l.weights, l.nweights);
+    cuda_pull_array_async(l.biases_gpu, l.biases, l.n);
+    cuda_pull_array_async(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    cuda_pull_array_async(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_pull_array_async(l.scales_gpu, l.scales, l.n);
+        cuda_pull_array_async(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_pull_array_async(l.rolling_variance_gpu, l.rolling_variance, l.n);
     }
-    if (layer.adam){
-        cuda_pull_array_async(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
-        cuda_pull_array_async(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
+    if (l.adam){
+        cuda_pull_array_async(l.m_gpu, l.m, l.nweights);
+        cuda_pull_array_async(l.v_gpu, l.v, l.nweights);
     }
     CHECK_CUDA(cudaPeekAtLastError());
     cudaStreamSynchronize(get_cuda_stream());
 }

-void push_convolutional_layer(convolutional_layer layer)
+void push_convolutional_layer(convolutional_layer l)
 {
-    cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size);
+    cuda_push_array(l.weights_gpu, l.weights, l.nweights);
 #ifdef CUDNN_HALF
-    assert((layer.c*layer.n*layer.size*layer.size) > 0);
-    cuda_convert_f32_to_f16(layer.weights_gpu, layer.c*layer.n*layer.size*layer.size, layer.weights_gpu16);
+    assert(l.nweights > 0);
+    cuda_convert_f32_to_f16(l.weights_gpu, l.nweights, l.weights_gpu16);
 #endif
-    cuda_push_array(layer.biases_gpu, layer.biases, layer.n);
-    cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size);
-    cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n);
-    if (layer.batch_normalize){
-        cuda_push_array(layer.scales_gpu, layer.scales, layer.n);
-        cuda_push_array(layer.rolling_mean_gpu, layer.rolling_mean, layer.n);
-        cuda_push_array(layer.rolling_variance_gpu, layer.rolling_variance, layer.n);
+    cuda_push_array(l.biases_gpu, l.biases, l.n);
+    cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.nweights);
+    cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n);
+    if (l.batch_normalize){
+        cuda_push_array(l.scales_gpu, l.scales, l.n);
+        cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n);
+        cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n);
     }
-    if (layer.adam){
-        cuda_push_array(layer.m_gpu, layer.m, layer.c*layer.n*layer.size*layer.size);
-        cuda_push_array(layer.v_gpu, layer.v, layer.c*layer.n*layer.size*layer.size);
+    if (l.adam){
+        cuda_push_array(l.m_gpu, l.m, l.nweights);
+        cuda_push_array(l.v_gpu, l.v, l.nweights);
     }
     CHECK_CUDA(cudaPeekAtLastError());
 }
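pull/push (and, in the next two hunks, the Adam and SGD weight updates) replace every weight-buffer size of the form layer.c*layer.n*layer.size*layer.size with l.nweights. Assuming the layer constructor sets nweights = (c / groups) * n * size * size, as the per-group indexing in this diff implies, the old product over-counts by a factor of groups for any grouped layer. Illustrative numbers for a 3x3 depth-wise layer with c = n = groups = 64:

    int nweights = (64 / 64) * 64 * 3 * 3;   /* 576 floats actually allocated   */
    int old_size = 64 * 64 * 3 * 3;          /* 36864 -- would copy well past   */
                                             /* the end of the weight buffers   */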
@@ -868,11 +877,10 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init
     //float momentum = a.momentum;
     //float decay = a.decay;
     //int batch = a.batch;
-    int size = l.size*l.size*l.c*l.n; // old

     if (l.adam) {
         //adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, a.B1, a.B2, a.eps, decay, learning_rate, l.nweights, batch, a.t);
-        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, size, batch, l.t);
+        adam_update_gpu(l.weights_gpu, l.weight_updates_gpu, l.m_gpu, l.v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.nweights, batch, l.t);

         adam_update_gpu(l.biases_gpu, l.bias_updates_gpu, l.bias_m_gpu, l.bias_v_gpu, l.B1, l.B2, l.eps, decay, learning_rate, l.n, batch, l.t);
         if (l.scales_gpu) {
@@ -883,9 +891,9 @@ void update_convolutional_layer_gpu(layer l, int batch, float learning_rate_init
         //axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
         //axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
         //scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);
-        axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
-        axpy_ongpu(size, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
-        scal_ongpu(size, momentum, l.weight_updates_gpu, 1);
+        axpy_ongpu(l.nweights, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1);
+        axpy_ongpu(l.nweights, learning_rate / batch, l.weight_updates_gpu, 1, l.weights_gpu, 1);
+        scal_ongpu(l.nweights, momentum, l.weight_updates_gpu, 1);

         axpy_ongpu(l.n, learning_rate / batch, l.bias_updates_gpu, 1, l.biases_gpu, 1);
         scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1);
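One constraint is implicit in all of the arithmetic above: groups must divide both l.c and l.n exactly, with depth-wise convolution being the extreme case where groups equals the number of input channels. A minimal sanity check, written here for illustration rather than copied from the commit:

    assert(l.groups >= 1);
    assert(l.c % l.groups == 0);   /* each group reads c/groups channels   */
    assert(l.n % l.groups == 0);   /* each group writes n/groups filters   */

In the cfg format this is presumably exposed as a groups= option on [convolutional] sections; the parser change is not part of this excerpt.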