diff --git a/src/blas.c b/src/blas.c
index c68b64c3..a3ff84b6 100644
--- a/src/blas.c
+++ b/src/blas.c
@@ -334,3 +334,22 @@ void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int for
         }
     }
 }
+
+
+void constrain_cpu(int size, float ALPHA, float *X)
+{
+    int i;
+    for (i = 0; i < size; ++i) {
+        X[i] = fminf(ALPHA, fmaxf(-ALPHA, X[i]));
+    }
+}
+
+void fix_nan_and_inf_cpu(float *input, size_t size)
+{
+    size_t i;
+    for (i = 0; i < size; ++i) {
+        float val = input[i];
+        if (isnan(val) || isinf(val))
+            input[i] = 1.0f / (i + 1); // pseudo random value; (i + 1) avoids producing inf again at i == 0
+    }
+}
\ No newline at end of file
diff --git a/src/blas.h b/src/blas.h
index 8e91fff2..faec1236 100644
--- a/src/blas.h
+++ b/src/blas.h
@@ -46,6 +46,8 @@ void softmax(float *input, int n, float temp, float *output, int stride);
 void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);
 void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
 void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
+void constrain_cpu(int size, float ALPHA, float *X);
+void fix_nan_and_inf_cpu(float *input, size_t size);
 
 #ifdef GPU
 
@@ -105,6 +107,10 @@ void softmax_tree_gpu(float *input, int spatial, int batch, int stride, float te
 void fix_nan_and_inf(float *input, size_t size);
 int is_nan_or_inf(float *input, size_t size);
 
+void add_3_arrays_activate(float *a1, float *a2, float *a3, size_t size, ACTIVATION a, float *dst);
+void sum_of_mults(float *a1, float *a2, float *b1, float *b2, size_t size, float *dst);
+void activate_and_mult(float *a1, float *a2, size_t size, ACTIVATION a, float *dst);
+
 #endif
 #ifdef __cplusplus
 }
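A minimal, self-contained sketch (not part of the patch) of how the two new CPU helpers are meant to behave. The forward declarations stand in for including blas.h, and the printed values assume the (i + 1) divisor used above:

#include <stdio.h>
#include <math.h>

void constrain_cpu(int size, float ALPHA, float *X);
void fix_nan_and_inf_cpu(float *input, size_t size);

int main(void)
{
    float x[4] = { -7.5f, 0.25f, 3.0f, NAN };

    fix_nan_and_inf_cpu(x, 4);   /* x[3] (NaN) becomes the deterministic 1.0f / (3 + 1) = 0.25f */
    constrain_cpu(4, 2.0f, x);   /* clamps every element into [-2, 2] */

    int i;
    for (i = 0; i < 4; ++i) printf("%g ", x[i]);   /* prints: -2 0.25 2 0.25 */
    printf("\n");
    return 0;
}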
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index 66905127..cf6bed27 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -1021,4 +1021,68 @@ extern "C" int is_nan_or_inf(float *input, size_t size)
     CHECK_CUDA(cudaFreeHost(pinned_return));
 
     return ret_val;
+}
+
+
+__global__ void add_3_arrays_activate_kernel(float *a1, float *a2, float *a3, size_t size, ACTIVATION a, float *dst)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index < size) {
+        float val = 0;
+        val += a1[index];
+        val += a2[index];
+        if (a3) val += a3[index];
+        if (a == LOGISTIC) val = 1.f / (1.f + expf(-val));
+        else if (a == TANH) val = (2 / (1 + expf(-2 * val)) - 1);
+        dst[index] = val;
+    }
+}
+
+extern "C" void add_3_arrays_activate(float *a1, float *a2, float *a3, size_t size, ACTIVATION a, float *dst)
+{
+    const int block_size = BLOCK;
+    const int num_blocks = get_number_of_blocks(size, block_size);
+    if (a != LOGISTIC && a != TANH) {
+        printf(" add_3_arrays_activate() doesn't support activation %d, it supports only LOGISTIC and TANH \n", a);
+        exit(EXIT_FAILURE);
+    }
+    add_3_arrays_activate_kernel <<< num_blocks, block_size, 0, get_cuda_stream() >>> (a1, a2, a3, size, a, dst);
+}
+
+
+__global__ void sum_of_mults_kernel(float *a1, float *a2, float *b1, float *b2, size_t size, float *dst)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index < size) {
+        dst[index] = a1[index] * a2[index] + b1[index] * b2[index];
+    }
+}
+
+extern "C" void sum_of_mults(float *a1, float *a2, float *b1, float *b2, size_t size, float *dst)
+{
+    const int block_size = BLOCK;
+    const int num_blocks = get_number_of_blocks(size, block_size);
+    sum_of_mults_kernel <<< num_blocks, block_size, 0, get_cuda_stream() >>> (a1, a2, b1, b2, size, dst);
+}
+
+
+__global__ void activate_and_mult_kernel(float *a1, float *a2, size_t size, ACTIVATION a, float *dst)
+{
+    const int index = blockIdx.x*blockDim.x + threadIdx.x;
+    if (index < size) {
+        float val = a1[index];
+        if (a == TANH) val = (2 / (1 + expf(-2 * val)) - 1);
+        dst[index] = val * a2[index];
+    }
+}
+
+extern "C" void activate_and_mult(float *a1, float *a2, size_t size, ACTIVATION a, float *dst)
+{
+    const int block_size = BLOCK;
+    const int num_blocks = get_number_of_blocks(size, block_size);
+    if (a != TANH) {
+        printf(" activate_and_mult() doesn't support activation %d, it supports only TANH \n", a);
+        exit(EXIT_FAILURE);
+    }
+    activate_and_mult_kernel <<< num_blocks, block_size, 0, get_cuda_stream() >>> (a1, a2, size, a, dst);
 }
\ No newline at end of file
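For reference, the per-element semantics of the three fused helpers written as plain C loops (illustration only, not part of the patch; only the LOGISTIC case of add_3_arrays_activate is shown). Each helper collapses what used to be several copy/axpy/mul/activate passes over global memory into a single read of each input and a single write of dst:

#include <stddef.h>
#include <math.h>

/* add_3_arrays_activate(..., LOGISTIC, dst): dst = logistic(a1 + a2 + (a3 ? a3 : 0)) */
static void add_3_arrays_logistic_ref(const float *a1, const float *a2, const float *a3,
                                      size_t size, float *dst)
{
    size_t i;
    for (i = 0; i < size; ++i) {
        float val = a1[i] + a2[i] + (a3 ? a3[i] : 0.f);
        dst[i] = 1.f / (1.f + expf(-val));
    }
}

/* sum_of_mults: dst = a1*a2 + b1*b2 */
static void sum_of_mults_ref(const float *a1, const float *a2,
                             const float *b1, const float *b2, size_t size, float *dst)
{
    size_t i;
    for (i = 0; i < size; ++i) dst[i] = a1[i] * a2[i] + b1[i] * b2[i];
}

/* activate_and_mult: dst = tanh(a1) * a2 */
static void activate_and_mult_ref(const float *a1, const float *a2, size_t size, float *dst)
{
    size_t i;
    for (i = 0; i < size; ++i) dst[i] = tanhf(a1[i]) * a2[i];
}

Note that the cell update below calls sum_of_mults with dst aliasing a2 (c = f*c + i*g writes back into l.c_gpu); that is safe here because each thread reads and writes only its own index.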
diff --git a/src/conv_lstm_layer.c b/src/conv_lstm_layer.c
index 26e06801..4dbc14d8 100644
--- a/src/conv_lstm_layer.c
+++ b/src/conv_lstm_layer.c
@@ -201,6 +201,9 @@ layer make_conv_lstm_layer(int batch, int h, int w, int c, int output_filters, i
         l.wf->bflops + l.wi->bflops + l.wg->bflops + l.wo->bflops +
         l.vf->bflops + l.vi->bflops + l.vo->bflops;
 
+    if(l.peephole) l.bflops += 12 * l.outputs*l.batch / 1000000000.;
+    else l.bflops += 9 * l.outputs*l.batch / 1000000000.;
+
     return l;
 }
 
@@ -429,22 +432,23 @@ void forward_conv_lstm_layer(layer l, network_state state)
     layer ug = *(l.ug);
     layer uo = *(l.uo);
 
-    if (l.peephole) {
-        fill_cpu(l.outputs * l.batch * l.steps, 0, vf.delta, 1);
-        fill_cpu(l.outputs * l.batch * l.steps, 0, vi.delta, 1);
-        fill_cpu(l.outputs * l.batch * l.steps, 0, vo.delta, 1);
-    }
-
-    fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1);
-
-    fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1);
-    fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1);
 
     if (state.train) {
+        if (l.peephole) {
+            fill_cpu(l.outputs * l.batch * l.steps, 0, vf.delta, 1);
+            fill_cpu(l.outputs * l.batch * l.steps, 0, vi.delta, 1);
+            fill_cpu(l.outputs * l.batch * l.steps, 0, vo.delta, 1);
+        }
+
+        fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1);
+        fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1);
+        fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1);
+        fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1);
+
+        fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1);
+        fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1);
+        fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1);
+        fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1);
+
         fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
     }
 
@@ -467,6 +471,9 @@ void forward_conv_lstm_layer(layer l, network_state state)
         forward_convolutional_layer(wg, s);
         forward_convolutional_layer(wo, s);
 
+        assert(l.inputs == uf.w * uf.h * uf.c);
+        assert(uf.c == l.c && ui.c == l.c && ug.c == l.c && uo.c == l.c);
+
         s.input = state.input;
         forward_convolutional_layer(uf, s);
         forward_convolutional_layer(ui, s);
@@ -498,8 +505,10 @@ void forward_conv_lstm_layer(layer l, network_state state)
         axpy_cpu(l.outputs*l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1);
 
         // o = wo + uo + vo(c_new)
-        s.input = l.c_cpu;
-        if (l.peephole) forward_convolutional_layer(vo, s);
+        if (l.peephole) {
+            s.input = l.c_cpu;
+            forward_convolutional_layer(vo, s);
+        }
         copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);
         axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);
         if (l.peephole) axpy_cpu(l.outputs*l.batch, 1, vo.output, 1, l.o_cpu, 1);
@@ -510,6 +519,10 @@ void forward_conv_lstm_layer(layer l, network_state state)
         activate_array(l.h_cpu, l.outputs*l.batch, TANH);
         mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.h_cpu, 1);
 
+        if (l.state_constrain) constrain_cpu(l.outputs*l.batch, l.state_constrain, l.c_cpu);
+        fix_nan_and_inf_cpu(l.c_cpu, l.outputs*l.batch);
+        fix_nan_and_inf_cpu(l.h_cpu, l.outputs*l.batch);
+
         copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.cell_cpu, 1);
         copy_cpu(l.outputs*l.batch, l.h_cpu, 1, l.output, 1);
 
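Taken together, the CPU path above computes the standard peephole-LSTM cell recurrence. As a scalar illustration (not part of the patch; the hypothetical names wf_uf_vf etc. stand for the already-summed convolution outputs feeding each gate):

#include <math.h>

static float logistic(float x) { return 1.f / (1.f + expf(-x)); }

/* One element of the cell update that forward_conv_lstm_layer applies per timestep. */
static void lstm_cell_step(float wf_uf_vf, float wi_ui_vi, float wg_ug, float wo_uo_vo,
                           float *c, float *h)
{
    float f = logistic(wf_uf_vf);   /* forget gate */
    float i = logistic(wi_ui_vi);   /* input gate */
    float g = tanhf(wg_ug);         /* candidate cell value */
    *c = f * (*c) + i * g;          /* c = f*c + i*g */
    float o = logistic(wo_uo_vo);   /* output gate (vo is computed from the new c) */
    *h = o * tanhf(*c);             /* h = o * tanh(c) */
}

The new constrain_cpu / fix_nan_and_inf_cpu calls then clamp c and scrub any NaN/inf from c and h before they are copied out, mirroring the existing constrain_ongpu / fix_nan_and_inf calls in the GPU path below.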
@@ -864,52 +877,55 @@ void forward_conv_lstm_layer_gpu(layer l, network_state state)
         forward_convolutional_layer_gpu(uo, s);
 
         // f = wf + uf + vf
-        copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
-        if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vf.output_gpu, 1, l.f_gpu, 1);
+        add_3_arrays_activate(wf.output_gpu, uf.output_gpu, (l.peephole) ? vf.output_gpu : NULL, l.outputs*l.batch, LOGISTIC, l.f_gpu);
+        //copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1);
+        //if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vf.output_gpu, 1, l.f_gpu, 1);
+        //activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
 
         // i = wi + ui + vi
-        copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
-        if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vi.output_gpu, 1, l.i_gpu, 1);
+        add_3_arrays_activate(wi.output_gpu, ui.output_gpu, (l.peephole) ? vi.output_gpu : NULL, l.outputs*l.batch, LOGISTIC, l.i_gpu);
+        //copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1);
+        //if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vi.output_gpu, 1, l.i_gpu, 1);
+        //activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
 
         // g = wg + ug
-        copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
-
-        activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC);
-        activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC);
-        activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
+        add_3_arrays_activate(wg.output_gpu, ug.output_gpu, NULL, l.outputs*l.batch, TANH, l.g_gpu);
+        //copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1);
+        //activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH);
 
         // c = f*c + i*g
-        copy_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
-        mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
-        mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);
+        sum_of_mults(l.f_gpu, l.c_gpu, l.i_gpu, l.g_gpu, l.outputs*l.batch, l.c_gpu);   // decreases mAP???
+        //copy_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
+        //mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
+        //mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1);
 
         // o = wo + uo + vo(c_new)
         if (l.peephole) {
             s.input = l.c_gpu;
             forward_convolutional_layer_gpu(vo, s);
         }
-        copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
-        axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
-        if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vo.output_gpu, 1, l.o_gpu, 1);
-        activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
+        add_3_arrays_activate(wo.output_gpu, uo.output_gpu, (l.peephole) ? vo.output_gpu : NULL, l.outputs*l.batch, LOGISTIC, l.o_gpu);
+        //copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1);
+        //axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1);
+        //if (l.peephole) axpy_ongpu(l.outputs*l.batch, 1, vo.output_gpu, 1, l.o_gpu, 1);
+        //activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC);
 
         // h = o * tanh(c)
-        copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1);
-        activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
-        mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);
+        activate_and_mult(l.c_gpu, l.o_gpu, l.outputs*l.batch, TANH, l.h_gpu);
+        //simple_copy_ongpu(l.outputs*l.batch, l.c_gpu, l.h_gpu);
+        //activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
+        //mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1);
 
         if(l.state_constrain) constrain_ongpu(l.outputs*l.batch, l.state_constrain, l.c_gpu, 1);
-        //constrain_ongpu(l.outputs*l.batch, 1, l.c_gpu, 1);
-        //constrain_ongpu(l.outputs*l.batch, 1, l.h_gpu, 1);
         fix_nan_and_inf(l.c_gpu, l.outputs*l.batch);
         fix_nan_and_inf(l.h_gpu, l.outputs*l.batch);
 
-        copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);
-        copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);    // is required for both Detection and Training
+        if(state.train) simple_copy_ongpu(l.outputs*l.batch, l.c_gpu, l.cell_gpu);
+        simple_copy_ongpu(l.outputs*l.batch, l.h_gpu, l.output_gpu);    // is required for both Detection and Training
 
         state.input += l.inputs*l.batch;
         l.output_gpu += l.outputs*l.batch;
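The "// decreases mAP???" note next to sum_of_mults plausibly comes from floating-point drift: nvcc contracts a*b + c into a single fused multiply-add by default (-fmad=true), so the fused kernel can round differently from the old copy/mul/axpy sequence. A small device-side probe of that hypothesis (illustration only; the kernel and buffer names are made up, and __fmaf_rn / __fmul_rn / __fadd_rn are CUDA math intrinsics with fixed round-to-nearest behaviour):

__global__ void fma_drift_kernel(const float *f, const float *c, const float *i,
                                 const float *g, size_t size, float *diff)
{
    const int index = blockIdx.x*blockDim.x + threadIdx.x;
    if (index < size) {
        // contracted form, as -fmad=true would generate for f*c + i*g
        float fused   = __fmaf_rn(f[index], c[index], i[index] * g[index]);
        // uncontracted form: each multiply and the add rounded separately
        float unfused = __fadd_rn(__fmul_rn(f[index], c[index]),
                                  __fmul_rn(i[index], g[index]));
        diff[index] = fabsf(fused - unfused);   // all zeros => FMA contraction is not the culprit
    }
}

If the drift turns out to matter, building with nvcc -fmad=false (or rewriting sum_of_mults_kernel with __fmul_rn/__fadd_rn) restores the old rounding at some cost in speed.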