diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 10c1f324..c5c59576 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -576,7 +576,7 @@ convolutional_layer make_convolutional_layer(int batch, int steps, int h, int w, //fprintf(stderr, "%5d/%2d %2d x%2d /%2d(%d)%4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", n, groups, size, size, stride, dilation, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); if (l.antialiasing) { - printf("AA: "); + printf("AA: "); l.input_layer = (layer*)calloc(1, sizeof(layer)); const int blur_size = 3; *(l.input_layer) = make_convolutional_layer(batch, steps, out_h, out_w, n, n, n, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, index, 0, NULL); @@ -1141,7 +1141,6 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) s.train = state.train; s.workspace = state.workspace; s.net = state.net; - if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) s.input = l.output; forward_convolutional_layer(*(l.input_layer), s); //simple_copy_ongpu(l.outputs*l.batch, l.output, l.input_antialiasing); diff --git a/src/maxpool_layer.c b/src/maxpool_layer.c index 000efe90..12392621 100644 --- a/src/maxpool_layer.c +++ b/src/maxpool_layer.c @@ -1,4 +1,5 @@ #include "maxpool_layer.h" +#include "convolutional_layer.h" #include "dark_cuda.h" #include "gemm.h" #include @@ -45,10 +46,18 @@ void cudnn_maxpool_setup(layer *l) } -maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels) +maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing) { maxpool_layer l = { (LAYER_TYPE)0 }; l.type = MAXPOOL; + + const int blur_stride_x = stride_x; + const int blur_stride_y = stride_y; + l.antialiasing = antialiasing; + if (antialiasing) { + stride_x = stride_y = l.stride = l.stride_x = l.stride_y = 1; // use stride=1 in host-layer + } + l.batch = batch; l.h = h; l.w = w; @@ -94,6 +103,46 @@ maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int s else fprintf(stderr, "max %d x %d/%2dx%2d %4d x%4d x%4d -> %4d x%4d x%4d %5.3f BF\n", size, size, stride_x, stride_y, w, h, c, l.out_w, l.out_h, l.out_c, l.bflops); + if (l.antialiasing) { + printf("AA: "); + l.input_layer = (layer*)calloc(1, sizeof(layer)); + const int blur_size = 3; + *(l.input_layer) = make_convolutional_layer(batch, 1, l.out_h, l.out_w, l.out_c, l.out_c, l.out_c, blur_size, blur_stride_x, blur_stride_y, 1, blur_size / 2, LINEAR, 0, 0, 0, 0, 0, 1, 0, NULL); + const int blur_nweights = l.out_c * blur_size * blur_size; // (n / n) * n * blur_size * blur_size; + int i; + for (i = 0; i < blur_nweights; i += (blur_size*blur_size)) { + /* + l.input_layer->weights[i + 0] = 0; + l.input_layer->weights[i + 1] = 0; + l.input_layer->weights[i + 2] = 0; + + l.input_layer->weights[i + 3] = 0; + l.input_layer->weights[i + 4] = 1; + l.input_layer->weights[i + 5] = 0; + + l.input_layer->weights[i + 6] = 0; + l.input_layer->weights[i + 7] = 0; + l.input_layer->weights[i + 8] = 0; + */ + l.input_layer->weights[i + 0] = 1 / 16.f; + l.input_layer->weights[i + 1] = 2 / 16.f; + l.input_layer->weights[i + 2] = 1 / 16.f; + + l.input_layer->weights[i + 3] = 2 / 16.f; + l.input_layer->weights[i + 4] = 4 / 16.f; + l.input_layer->weights[i + 5] = 2 / 16.f; + + l.input_layer->weights[i + 6] = 1 / 16.f; + l.input_layer->weights[i + 7] = 2 / 16.f; + l.input_layer->weights[i + 8] = 1 / 16.f; + } + for (i = 0; i < l.out_c; ++i) l.input_layer->biases[i] = 0; +#ifdef GPU + l.input_antialiasing_gpu = cuda_make_array(NULL, l.batch*l.outputs); + push_convolutional_layer(*(l.input_layer)); +#endif // GPU + } + return l; } @@ -159,42 +208,54 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state) if (!state.train && l.stride_x == l.stride_y) { forward_maxpool_layer_avx(state.input, l.output, l.indexes, l.size, l.w, l.h, l.out_w, l.out_h, l.c, l.pad, l.stride, l.batch); - return; } + else { - int b,i,j,k,m,n; - int w_offset = -l.pad / 2; - int h_offset = -l.pad / 2; + int b, i, j, k, m, n; + int w_offset = -l.pad / 2; + int h_offset = -l.pad / 2; - int h = l.out_h; - int w = l.out_w; - int c = l.c; + int h = l.out_h; + int w = l.out_w; + int c = l.c; - for(b = 0; b < l.batch; ++b){ - for(k = 0; k < c; ++k){ - for(i = 0; i < h; ++i){ - for(j = 0; j < w; ++j){ - int out_index = j + w*(i + h*(k + c*b)); - float max = -FLT_MAX; - int max_i = -1; - for(n = 0; n < l.size; ++n){ - for(m = 0; m < l.size; ++m){ - int cur_h = h_offset + i*l.stride_y + n; - int cur_w = w_offset + j*l.stride_x + m; - int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c)); - int valid = (cur_h >= 0 && cur_h < l.h && - cur_w >= 0 && cur_w < l.w); - float val = (valid != 0) ? state.input[index] : -FLT_MAX; - max_i = (val > max) ? index : max_i; - max = (val > max) ? val : max; + for (b = 0; b < l.batch; ++b) { + for (k = 0; k < c; ++k) { + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + int out_index = j + w*(i + h*(k + c*b)); + float max = -FLT_MAX; + int max_i = -1; + for (n = 0; n < l.size; ++n) { + for (m = 0; m < l.size; ++m) { + int cur_h = h_offset + i*l.stride_y + n; + int cur_w = w_offset + j*l.stride_x + m; + int index = cur_w + l.w*(cur_h + l.h*(k + b*l.c)); + int valid = (cur_h >= 0 && cur_h < l.h && + cur_w >= 0 && cur_w < l.w); + float val = (valid != 0) ? state.input[index] : -FLT_MAX; + max_i = (val > max) ? index : max_i; + max = (val > max) ? val : max; + } } + l.output[out_index] = max; + l.indexes[out_index] = max_i; } - l.output[out_index] = max; - l.indexes[out_index] = max_i; } } } } + + if (l.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + s.input = l.output; + forward_convolutional_layer(*(l.input_layer), s); + //simple_copy_ongpu(l.outputs*l.batch, l.output, l.input_antialiasing); + memcpy(l.output, l.input_layer->output, l.input_layer->outputs * l.input_layer->batch * sizeof(float)); + } } void backward_maxpool_layer(const maxpool_layer l, network_state state) diff --git a/src/maxpool_layer.h b/src/maxpool_layer.h index 4994d457..cfedf9d9 100644 --- a/src/maxpool_layer.h +++ b/src/maxpool_layer.h @@ -12,7 +12,7 @@ typedef layer maxpool_layer; extern "C" { #endif image get_maxpool_image(maxpool_layer l); -maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels); +maxpool_layer make_maxpool_layer(int batch, int h, int w, int c, int size, int stride_x, int stride_y, int padding, int maxpool_depth, int out_channels, int antialiasing); void resize_maxpool_layer(maxpool_layer *l, int w, int h); void forward_maxpool_layer(const maxpool_layer l, network_state state); void backward_maxpool_layer(const maxpool_layer l, network_state state); diff --git a/src/maxpool_layer_kernels.cu b/src/maxpool_layer_kernels.cu index 8e851100..cc546a0b 100644 --- a/src/maxpool_layer_kernels.cu +++ b/src/maxpool_layer_kernels.cu @@ -3,6 +3,8 @@ #include #include "maxpool_layer.h" +#include "convolutional_layer.h" +#include "blas.h" #include "dark_cuda.h" __global__ void forward_maxpool_depth_layer_kernel(int n, int w, int h, int c, int out_c, int batch, float *input, float *output, int *indexes) @@ -163,22 +165,47 @@ extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state sta //cudnnDestroyTensorDescriptor(layer.srcTensorDesc); //cudnnDestroyTensorDescriptor(layer.dstTensorDesc); - return; } + else #endif + { + int h = layer.out_h; + int w = layer.out_w; + int c = layer.out_c; - int h = layer.out_h; - int w = layer.out_w; - int c = layer.out_c; + size_t n = h*w*c*layer.batch; - size_t n = h*w*c*layer.batch; + forward_maxpool_layer_kernel << > > (n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu); + CHECK_CUDA(cudaPeekAtLastError()); + } - forward_maxpool_layer_kernel<<>>(n, layer.h, layer.w, layer.c, layer.stride_x, layer.stride_y, layer.size, layer.pad, state.input, layer.output_gpu, layer.indexes_gpu); - CHECK_CUDA(cudaPeekAtLastError()); + if (layer.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) + s.input = layer.output_gpu; + forward_convolutional_layer_gpu(*(layer.input_layer), s); + simple_copy_ongpu(layer.outputs*layer.batch, layer.output_gpu, layer.input_antialiasing_gpu); + simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.input_layer->output_gpu, layer.output_gpu); + } } extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state state) { + if (layer.antialiasing) { + network_state s = { 0 }; + s.train = state.train; + s.workspace = state.workspace; + s.net = state.net; + s.delta = layer.delta_gpu; + s.input = layer.input_antialiasing_gpu; + //if (!state.train) s.index = state.index; // don't use TC for training (especially without cuda_convert_f32_to_f16() ) + simple_copy_ongpu(layer.input_layer->outputs*layer.input_layer->batch, layer.delta_gpu, layer.input_layer->delta_gpu); + backward_convolutional_layer_gpu(*(layer.input_layer), s); + } + if (layer.maxpool_depth) { int h = layer.out_h; int w = layer.out_w; diff --git a/src/parser.c b/src/parser.c index fda2bacc..b89bf0ac 100644 --- a/src/parser.c +++ b/src/parser.c @@ -545,6 +545,7 @@ maxpool_layer parse_maxpool(list *options, size_params params) int padding = option_find_int_quiet(options, "padding", size-1); int maxpool_depth = option_find_int_quiet(options, "maxpool_depth", 0); int out_channels = option_find_int_quiet(options, "out_channels", 1); + int antialiasing = option_find_int_quiet(options, "antialiasing", 0); int batch,h,w,c; h = params.h; @@ -553,7 +554,7 @@ maxpool_layer parse_maxpool(list *options, size_params params) batch=params.batch; if(!(h && w && c)) error("Layer before maxpool layer must output image."); - maxpool_layer layer = make_maxpool_layer(batch, h, w, c, size, stride_x, stride_y, padding, maxpool_depth, out_channels); + maxpool_layer layer = make_maxpool_layer(batch, h, w, c, size, stride_x, stride_y, padding, maxpool_depth, out_channels, antialiasing); return layer; }