Mirror of https://github.com/pjreddie/darknet.git (synced 2023-08-10 21:13:14 +03:00)
Fixed Depth-wise (grouped) convolution for CPU and GPU (CUDNN=0/1)
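In a grouped convolution the input channels and the filters are split into groups independent slices, and depth-wise convolution is the limiting case where groups equals the number of input channels. The hunks below therefore index weights per group and inputs/outputs per (batch, group) pair. The following is a minimal sketch of that indexing, not code from the commit: it assumes the usual darknet GEMM shapes (m = filters per group, k = (c/groups)*size*size, n = out_h*out_w) and made-up layer sizes.

/* Sketch of the per-group pointer offsets that the CPU forward hunk below
   relies on. All layer sizes here are hypothetical. */
#include <stdio.h>

int main(void)
{
    int batch = 2, groups = 4;
    int c = 8, h = 16, w = 16;          /* input channels and spatial size */
    int filters = 8, size = 3;
    int out_h = 16, out_w = 16;         /* assuming padding keeps the size */

    int m = filters / groups;           /* output channels per group */
    int k = size * size * c / groups;   /* im2col rows per group     */
    int n = out_h * out_w;              /* output positions          */
    int nweights = (c / groups) * filters * size * size;

    int i, j;
    for (i = 0; i < batch; ++i) {
        for (j = 0; j < groups; ++j) {
            long w_off   = (long)j * nweights / groups;                   /* a: weight block */
            long im_off  = ((long)i * groups + j) * (c / groups) * h * w; /* im: input slice */
            long out_off = ((long)i * groups + j) * n * m;                /* c: output block */
            printf("batch %d group %d: weights+%ld input+%ld output+%ld, gemm %d x %d x %d\n",
                   i, j, w_off, im_off, out_off, m, n, k);
        }
    }
    return 0;
}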
@@ -4,7 +4,7 @@
 avgpool_layer make_avgpool_layer(int batch, int w, int h, int c)
 {
     fprintf(stderr, "avg %4d x%4d x%4d -> %4d\n", w, h, c, c);
     avgpool_layer l = { (LAYER_TYPE)0 };
     l.type = AVGPOOL;
     l.batch = batch;
@@ -578,7 +578,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
                 state.workspace);    // output

             }
-            gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n);
+            //gemm_ongpu(0, 0, m, n, k, 1., a, k, b, n, 1., c + i*m*n, n);
+            gemm_ongpu(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
         }
     }

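Reading this hunk together with the CPU path later in the commit, where c is taken as l.output + (i*l.groups + j)*n*m and the trailing c += n*m is commented out, the output pointer already carries the (batch, group) offset before the GEMM, so the old + i*m*n term counted the batch offset a second time. The two backward hunks below drop a + i*m*k and b + i*k*m for the same reason. A small sketch of that arithmetic, with hypothetical sizes rather than commit code:

/* Shows how adding i*m*n to a pointer that is already offset per
   (batch, group) lands past the intended output block. */
#include <stdio.h>

int main(void)
{
    int groups = 2, m = 4, n = 6 * 6;   /* filters per group, out_h*out_w */
    int i, j;
    for (i = 0; i < 2; ++i)
        for (j = 0; j < groups; ++j) {
            long base = ((long)i * groups + j) * n * m;  /* offset already in c    */
            long old  = base + (long)i * m * n;          /* what the old call used */
            printf("i=%d j=%d: c at +%ld, old gemm wrote at +%ld\n", i, j, base, old);
        }
    return 0;
}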
@@ -817,7 +818,8 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
                 l.stride, l.stride,     // stride (h, w)
                 l.dilation, l.dilation, // dilation (h, w)
                 state.workspace);       // output
-            gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n);
+            //gemm_ongpu(0, 1, m, n, k, 1, a + i*m*k, k, b, k, 1, c, n);
+            gemm_ongpu(0, 1, m, n, k, 1, a, k, b, k, 1, c, n);

             if (state.delta) {
                 if (l.binary || l.xnor) swap_binary(&l);
@@ -825,7 +827,9 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)
                 float * b = l.delta_gpu + (i*l.groups + j)*m*k;
                 float * c = state.workspace;

-                gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k);
+                //gemm_ongpu(1, 0, n, k, m, 1, a, n, b + i*k*m, k, 0, c, k);
+                gemm_ongpu(1, 0, n, k, m, 1, a, n, b, k, 0, c, k);
+
                 float *delta = state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w;

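For orientation, the two backward GEMMs above can be read per group as a weight-gradient product and an input-gradient product. The sketch below spells out the shapes using the same m, n, k naming as the backward pass; this is my reading of the calls, and the concrete layer sizes are made up.

/* Per-group GEMM shapes in the backward pass
   (m = filters/groups, n = (c/groups)*size*size, k = out_h*out_w). */
#include <stdio.h>

int main(void)
{
    int filters = 8, c = 8, groups = 8, size = 3;   /* a depth-wise example */
    int out_h = 13, out_w = 13;

    int m = filters / groups;
    int n = size * size * c / groups;
    int k = out_h * out_w;

    /* weight gradient: dW[m x n] += dOut[m x k] * im2col[n x k]^T  -> gemm(0, 1, m, n, k, ...) */
    printf("weight-gradient gemm: m=%d n=%d k=%d\n", m, n, k);
    /* input gradient: col[n x k] = W[m x n]^T * dOut[m x k], then col2im  -> gemm(1, 0, n, k, m, ...) */
    printf("input-gradient gemm:  n=%d k=%d m=%d\n", n, k, m);
    return 0;
}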
@@ -547,7 +547,16 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w,
     if (l.xnor && l.use_bin_output) fprintf(stderr, "convXB");
     else if (l.xnor) fprintf(stderr, "convX ");
     else fprintf(stderr, "conv ");
-    fprintf(stderr, "%5d %2d x%2d /%2d(%d)%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+    if(groups > 1) fprintf(stderr, "%5d/%3d ", n, groups);
+    else fprintf(stderr, "%5d ", n);
+
+    if(dilation > 1) fprintf(stderr, "%2d x%2d/%2d(%1d)", size, size, stride, dilation);
+    else fprintf(stderr, "%2d x%2d/%2d ", size, size, stride);
+
+    fprintf(stderr, "%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+
+    //fprintf(stderr, "%5d/%2d %2d x%2d /%2d(%d)%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, groups, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);

     return l;
 }
@@ -897,12 +906,13 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
     static int u = 0;
     u++;

-    for(i = 0; i < l.batch; ++i){
-        for (j = 0; j < l.groups; ++j) {
-            float *a = l.weights + j*l.nweights / l.groups;
+    for(i = 0; i < l.batch; ++i)
+    {
+        for (j = 0; j < l.groups; ++j)
+        {
+            float *a = l.weights +j*l.nweights / l.groups;
             float *b = state.workspace;
-            float *c = l.output + (i*l.groups + j)*n*m;
+            float *c = l.output +(i*l.groups + j)*n*m;

             //gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
             //gemm_nn_custom(m, n, k, 1, a, k, b, n, c, n);
@@ -1025,23 +1035,29 @@ void forward_convolutional_layer(convolutional_layer l, network_state state)
             }
             else {
                 //printf(" l.index = %d - FP32 \n", l.index);
-                float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
-                //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
+                float *im = state.input + (i*l.groups + j)*(l.c / l.groups)*l.h*l.w;
+                if (l.size == 1) {
+                    b = im;
+                }
+                else {
+                    //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);

                 im2col_cpu_ext(im,          // input
                     l.c / l.groups,         // input channels
                     l.h, l.w,               // input size (h, w)
                     l.size, l.size,         // kernel size (h, w)
                     l.pad, l.pad,           // padding (h, w)
                     l.stride, l.stride,     // stride (h, w)
                     l.dilation, l.dilation, // dilation (h, w)
                     b);                     // output

+                }

                 gemm(0, 0, m, n, k, 1, a, k, b, n, 1, c, n);
                 // bit-count to float
             }
-            c += n*m;
-            state.input += l.c*l.h*l.w;
+            //c += n*m;
+            //state.input += l.c*l.h*l.w;
         }
     }

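The new l.size == 1 branch above skips the im2col copy: for a 1x1 kernel with stride 1 and zero padding, the im2col matrix has one row per input channel and one column per output position, which is exactly the layout the input slice already has, so b can point straight at im. Commenting out the trailing c += n*m and state.input += l.c*l.h*l.w follows from recomputing the output and input pointers from (i, j) on every iteration. Below is a minimal check of the 1x1 equivalence; it re-implements only this trivial case and is not darknet's im2col_cpu_ext.

/* For size=1, stride=1, pad=0 the im2col result has the same layout as the
   input itself, so the copy can be skipped. Sizes are hypothetical. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    enum { C = 3, H = 4, W = 5 };
    float im[C * H * W], col[C * H * W];
    int ch, y, x, i;
    for (i = 0; i < C * H * W; ++i) im[i] = (float)i;

    /* im2col for a 1x1 kernel: row index = channel, column index = output pixel */
    for (ch = 0; ch < C; ++ch)
        for (y = 0; y < H; ++y)
            for (x = 0; x < W; ++x)
                col[ch * H * W + y * W + x] = im[ch * H * W + y * W + x];

    printf("identical layout: %s\n", memcmp(im, col, sizeof im) == 0 ? "yes" : "no");
    return 0;
}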
@@ -1079,7 +1095,7 @@ void backward_convolutional_layer(convolutional_layer l, network_state state)
             float *b = state.workspace;
             float *c = l.weight_updates + j*l.nweights / l.groups;

-            float *im = state.input + (i*l.groups + j)*l.c / l.groups*l.h*l.w;
+            float *im = state.input + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w;

             //im2col_cpu(im, l.c / l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
             im2col_cpu_ext(
@@ -1112,7 +1128,7 @@ void backward_convolutional_layer(convolutional_layer l, network_state state)
                 l.pad, l.pad,           // padding (h, w)
                 l.stride, l.stride,     // stride (h, w)
                 l.dilation, l.dilation, // dilation (h, w)
-                state.delta + (i*l.groups + j)*l.c / l.groups*l.h*l.w); // output (delta)
+                state.delta + (i*l.groups + j)* (l.c / l.groups)*l.h*l.w); // output (delta)
             }
         }
     }
@@ -78,7 +78,7 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s

 #endif // GPU
     l.bflops = (l.size*l.size*l.c * l.out_h*l.out_w) / 1000000000.;
-    fprintf(stderr, "max %d x %d / %d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
+    fprintf(stderr, "max %d x %d/%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops);
     return l;
 }

@@ -834,7 +834,7 @@ network parse_network_cfg_custom(char *filename, int batch, int time_steps)
     n = n->next;
     int count = 0;
     free_section(s);
-    fprintf(stderr, "layer filters size input output\n");
+    fprintf(stderr, " layer filters size/strd(dil) input output\n");
     while(n){
         params.index = count;
         fprintf(stderr, "%4d ", count);
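Putting the widened header from this hunk together with the per-layer line from make_convolutional_layer above, the startup log now shows the group count next to the filter count and the stride/dilation next to the kernel size. The snippet below simply replays those format strings with made-up values for a depth-wise 3x3 layer; column widths are approximate.

/* Replays the new log format strings with hypothetical values to show the
   shape of the layer table after this change. */
#include <stdio.h>

int main(void)
{
    int count = 0, n = 32, groups = 32, size = 3, stride = 1, dilation = 1;
    int w = 208, h = 208, c = 32, out_w = 208, out_h = 208, out_c = 32;
    /* a rough bflops estimate: 2 * weights * output positions */
    float bflops = (2.0f * (c / groups) * n * size * size * out_h * out_w) / 1000000000.0f;

    fprintf(stderr, " layer filters size/strd(dil) input output\n");
    fprintf(stderr, "%4d ", count);
    fprintf(stderr, "conv ");
    if (groups > 1) fprintf(stderr, "%5d/%3d ", n, groups);
    else fprintf(stderr, "%5d ", n);
    if (dilation > 1) fprintf(stderr, "%2d x%2d/%2d(%1d)", size, size, stride, dilation);
    else fprintf(stderr, "%2d x%2d/%2d ", size, size, stride);
    fprintf(stderr, "%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n",
            w, h, c, out_w, out_h, out_c, bflops);
    return 0;
}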