diff --git a/src/connected_layer.c b/src/connected_layer.c index e6dc7594..96f7aaf2 100644 --- a/src/connected_layer.c +++ b/src/connected_layer.c @@ -10,15 +10,16 @@ #include #include -connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize) +connected_layer make_connected_layer(int batch, int steps, int inputs, int outputs, ACTIVATION activation, int batch_normalize) { + int total_batch = batch*steps; int i; connected_layer l = {0}; l.type = CONNECTED; l.inputs = inputs; l.outputs = outputs; - l.batch=batch; + l.batch= batch; l.batch_normalize = batch_normalize; l.h = 1; l.w = 1; @@ -27,8 +28,8 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT l.out_w = 1; l.out_c = outputs; - l.output = calloc(batch*outputs, sizeof(float)); - l.delta = calloc(batch*outputs, sizeof(float)); + l.output = calloc(total_batch*outputs, sizeof(float)); + l.delta = calloc(total_batch*outputs, sizeof(float)); l.weight_updates = calloc(inputs*outputs, sizeof(float)); l.bias_updates = calloc(outputs, sizeof(float)); @@ -65,8 +66,8 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT l.rolling_mean = calloc(outputs, sizeof(float)); l.rolling_variance = calloc(outputs, sizeof(float)); - l.x = calloc(batch*outputs, sizeof(float)); - l.x_norm = calloc(batch*outputs, sizeof(float)); + l.x = calloc(total_batch*outputs, sizeof(float)); + l.x_norm = calloc(total_batch*outputs, sizeof(float)); } #ifdef GPU @@ -80,8 +81,8 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT l.weight_updates_gpu = cuda_make_array(l.weight_updates, outputs*inputs); l.bias_updates_gpu = cuda_make_array(l.bias_updates, outputs); - l.output_gpu = cuda_make_array(l.output, outputs*batch); - l.delta_gpu = cuda_make_array(l.delta, outputs*batch); + l.output_gpu = cuda_make_array(l.output, outputs*total_batch); + l.delta_gpu = cuda_make_array(l.delta, outputs*total_batch); if(batch_normalize){ l.scales_gpu = cuda_make_array(l.scales, outputs); l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs); @@ -95,13 +96,15 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT l.mean_delta_gpu = cuda_make_array(l.mean, outputs); l.variance_delta_gpu = cuda_make_array(l.variance, outputs); - l.x_gpu = cuda_make_array(l.output, l.batch*outputs); - l.x_norm_gpu = cuda_make_array(l.output, l.batch*outputs); + l.x_gpu = cuda_make_array(l.output, total_batch*outputs); + l.x_norm_gpu = cuda_make_array(l.output, total_batch*outputs); #ifdef CUDNN + cudnnCreateTensorDescriptor(&l.normDstTensorDesc); cudnnCreateTensorDescriptor(&l.normTensorDesc); cudnnCreateTensorDescriptor(&l.dstTensorDesc); cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); + cudnnSetTensor4dDescriptor(l.normDstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); #endif } #endif @@ -147,7 +150,7 @@ void forward_connected_layer(connected_layer l, network_state state) axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1); copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1); - normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1); + normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1); copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1); } else { normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1); diff --git a/src/connected_layer.h b/src/connected_layer.h index 23797b10..3775e0a5 100644 --- a/src/connected_layer.h +++ b/src/connected_layer.h @@ -7,7 +7,7 @@ typedef layer connected_layer; -connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVATION activation, int batch_normalize); +connected_layer make_connected_layer(int batch, int steps, int inputs, int outputs, ACTIVATION activation, int batch_normalize); void forward_connected_layer(connected_layer layer, network_state state); void backward_connected_layer(connected_layer layer, network_state state); diff --git a/src/gru_layer.c b/src/gru_layer.c index b78e8682..fa03e7eb 100644 --- a/src/gru_layer.c +++ b/src/gru_layer.c @@ -38,36 +38,36 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no l.input_z_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.input_z_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.input_z_layer) = make_connected_layer(batch, steps, inputs, outputs, LINEAR, batch_normalize); l.input_z_layer->batch = batch; l.state_z_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.state_z_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.state_z_layer) = make_connected_layer(batch, steps, outputs, outputs, LINEAR, batch_normalize); l.state_z_layer->batch = batch; l.input_r_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.input_r_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.input_r_layer) = make_connected_layer(batch, steps, inputs, outputs, LINEAR, batch_normalize); l.input_r_layer->batch = batch; l.state_r_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.state_r_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.state_r_layer) = make_connected_layer(batch, steps, outputs, outputs, LINEAR, batch_normalize); l.state_r_layer->batch = batch; l.input_h_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.input_h_layer) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.input_h_layer) = make_connected_layer(batch, steps, inputs, outputs, LINEAR, batch_normalize); l.input_h_layer->batch = batch; l.state_h_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.state_h_layer) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.state_h_layer) = make_connected_layer(batch, steps, outputs, outputs, LINEAR, batch_normalize); l.state_h_layer->batch = batch; l.batch_normalize = batch_normalize; @@ -337,7 +337,7 @@ void backward_gru_layer_gpu(layer l, network_state state) #else activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC); #endif - + weighted_delta_gpu(l.prev_state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, input_h_layer.delta_gpu, input_z_layer.delta_gpu, l.outputs*l.batch, l.delta_gpu); #ifdef USET @@ -347,14 +347,14 @@ void backward_gru_layer_gpu(layer l, network_state state) #endif copy_ongpu(l.outputs*l.batch, input_h_layer.delta_gpu, 1, state_h_layer.delta_gpu, 1); - + copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.forgot_state_gpu, 1); mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1); fill_ongpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1); s.input = l.forgot_state_gpu; s.delta = l.forgot_delta_gpu; - + backward_connected_layer_gpu(state_h_layer, s); if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu); mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.prev_state_gpu, input_r_layer.delta_gpu); @@ -364,16 +364,16 @@ void backward_gru_layer_gpu(layer l, network_state state) gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, input_z_layer.delta_gpu); copy_ongpu(l.outputs*l.batch, input_z_layer.delta_gpu, 1, state_z_layer.delta_gpu, 1); - + s.input = l.prev_state_gpu; s.delta = prev_delta_gpu; - + backward_connected_layer_gpu(state_r_layer, s); backward_connected_layer_gpu(state_z_layer, s); s.input = state.input; s.delta = state.delta; - + backward_connected_layer_gpu(input_h_layer, s); backward_connected_layer_gpu(input_r_layer, s); backward_connected_layer_gpu(input_z_layer, s); diff --git a/src/lstm_layer.c b/src/lstm_layer.c index e61bf5c9..a8730f95 100644 --- a/src/lstm_layer.c +++ b/src/lstm_layer.c @@ -38,42 +38,42 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n l.uf = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.uf) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.uf) = make_connected_layer(batch, steps, inputs, outputs, LINEAR, batch_normalize); l.uf->batch = batch; l.ui = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.ui) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.ui) = make_connected_layer(batch, steps, inputs, outputs, LINEAR, batch_normalize); l.ui->batch = batch; l.ug = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.ug) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.ug) = make_connected_layer(batch, steps, inputs, outputs, LINEAR, batch_normalize); l.ug->batch = batch; l.uo = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.uo) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize); + *(l.uo) = make_connected_layer(batch, steps, inputs, outputs, LINEAR, batch_normalize); l.uo->batch = batch; l.wf = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wf) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wf) = make_connected_layer(batch, steps, outputs, outputs, LINEAR, batch_normalize); l.wf->batch = batch; l.wi = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wi) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wi) = make_connected_layer(batch, steps, outputs, outputs, LINEAR, batch_normalize); l.wi->batch = batch; l.wg = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wg) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wg) = make_connected_layer(batch, steps, outputs, outputs, LINEAR, batch_normalize); l.wg->batch = batch; l.wo = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.wo) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize); + *(l.wo) = make_connected_layer(batch, steps, outputs, outputs, LINEAR, batch_normalize); l.wo->batch = batch; l.batch_normalize = batch_normalize; @@ -125,15 +125,15 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n l.dc_gpu = cuda_make_array(0, batch*outputs); l.dh_gpu = cuda_make_array(0, batch*outputs); #ifdef CUDNN - cudnnSetTensor4dDescriptor(l.wf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wf->out_c, l.wf->out_h, l.wf->out_w); - cudnnSetTensor4dDescriptor(l.wi->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wi->out_c, l.wi->out_h, l.wi->out_w); - cudnnSetTensor4dDescriptor(l.wg->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wg->out_c, l.wg->out_h, l.wg->out_w); - cudnnSetTensor4dDescriptor(l.wo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wo->out_c, l.wo->out_h, l.wo->out_w); + cudnnSetTensor4dDescriptor(l.wf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wf->out_c, l.wf->out_h, l.wf->out_w); + cudnnSetTensor4dDescriptor(l.wi->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wi->out_c, l.wi->out_h, l.wi->out_w); + cudnnSetTensor4dDescriptor(l.wg->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wg->out_c, l.wg->out_h, l.wg->out_w); + cudnnSetTensor4dDescriptor(l.wo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wo->out_c, l.wo->out_h, l.wo->out_w); - cudnnSetTensor4dDescriptor(l.uf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uf->out_c, l.uf->out_h, l.uf->out_w); - cudnnSetTensor4dDescriptor(l.ui->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ui->out_c, l.ui->out_h, l.ui->out_w); - cudnnSetTensor4dDescriptor(l.ug->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ug->out_c, l.ug->out_h, l.ug->out_w); - cudnnSetTensor4dDescriptor(l.uo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uo->out_c, l.uo->out_h, l.uo->out_w); + cudnnSetTensor4dDescriptor(l.uf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uf->out_c, l.uf->out_h, l.uf->out_w); + cudnnSetTensor4dDescriptor(l.ui->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ui->out_c, l.ui->out_h, l.ui->out_w); + cudnnSetTensor4dDescriptor(l.ug->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ug->out_c, l.ug->out_h, l.ug->out_w); + cudnnSetTensor4dDescriptor(l.uo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uo->out_c, l.uo->out_h, l.uo->out_w); #endif #endif @@ -183,44 +183,44 @@ void forward_lstm_layer(layer l, network_state state) for (i = 0; i < l.steps; ++i) { s.input = l.h_cpu; - forward_connected_layer(wf, s); - forward_connected_layer(wi, s); - forward_connected_layer(wg, s); - forward_connected_layer(wo, s); + forward_connected_layer(wf, s); + forward_connected_layer(wi, s); + forward_connected_layer(wg, s); + forward_connected_layer(wo, s); s.input = state.input; - forward_connected_layer(uf, s); - forward_connected_layer(ui, s); - forward_connected_layer(ug, s); - forward_connected_layer(uo, s); + forward_connected_layer(uf, s); + forward_connected_layer(ui, s); + forward_connected_layer(ug, s); + forward_connected_layer(uo, s); copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1); axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1); - copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1); - axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1); + copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1); - copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1); - axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1); + copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1); - copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1); - axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1); + copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1); - activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC); - activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC); - activate_array(l.g_cpu, l.outputs*l.batch, TANH); - activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC); + activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC); + activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC); + activate_array(l.g_cpu, l.outputs*l.batch, TANH); + activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC); - copy_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1); - mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1); - mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.c_cpu, 1); - axpy_cpu(l.outputs*l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1); + copy_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.c_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1); - copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.h_cpu, 1); - activate_array(l.h_cpu, l.outputs*l.batch, TANH); - mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.h_cpu, 1); + copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.h_cpu, 1); + activate_array(l.h_cpu, l.outputs*l.batch, TANH); + mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.h_cpu, 1); - copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.cell_cpu, 1); + copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.cell_cpu, 1); copy_cpu(l.outputs*l.batch, l.h_cpu, 1, l.output, 1); state.input += l.inputs*l.batch; @@ -279,90 +279,90 @@ void backward_lstm_layer(layer l, network_state state) l.dh_cpu = (i == 0) ? 0 : l.delta - l.outputs*l.batch; - copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1); - axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1); + copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1); - copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1); - axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1); + copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1); - copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1); - axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1); + copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1); - copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1); - axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1); + copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1); - activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC); - activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC); - activate_array(l.g_cpu, l.outputs*l.batch, TANH); - activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC); + activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC); + activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC); + activate_array(l.g_cpu, l.outputs*l.batch, TANH); + activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC); - copy_cpu(l.outputs*l.batch, l.delta, 1, l.temp3_cpu, 1); + copy_cpu(l.outputs*l.batch, l.delta, 1, l.temp3_cpu, 1); - copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1); - activate_array(l.temp_cpu, l.outputs*l.batch, TANH); + copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1); + activate_array(l.temp_cpu, l.outputs*l.batch, TANH); - copy_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1); - mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.temp2_cpu, 1); + copy_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1); + mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.temp2_cpu, 1); gradient_array(l.temp_cpu, l.outputs*l.batch, TANH, l.temp2_cpu); - axpy_cpu(l.outputs*l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1); + axpy_cpu(l.outputs*l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1); - copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1); - activate_array(l.temp_cpu, l.outputs*l.batch, TANH); - mul_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp_cpu, 1); + copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1); + activate_array(l.temp_cpu, l.outputs*l.batch, TANH); + mul_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp_cpu, 1); gradient_array(l.o_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu); copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wo.delta, 1); s.input = l.prev_state_cpu; - s.delta = l.dh_cpu; - backward_connected_layer(wo, s); + s.delta = l.dh_cpu; + backward_connected_layer(wo, s); copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uo.delta, 1); s.input = state.input; s.delta = state.delta; - backward_connected_layer(uo, s); + backward_connected_layer(uo, s); - copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); - mul_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1); - gradient_array(l.g_cpu, l.outputs*l.batch, TANH, l.temp_cpu); + copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1); + gradient_array(l.g_cpu, l.outputs*l.batch, TANH, l.temp_cpu); copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wg.delta, 1); s.input = l.prev_state_cpu; - s.delta = l.dh_cpu; - backward_connected_layer(wg, s); + s.delta = l.dh_cpu; + backward_connected_layer(wg, s); copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ug.delta, 1); s.input = state.input; s.delta = state.delta; - backward_connected_layer(ug, s); + backward_connected_layer(ug, s); - copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); - mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1); - gradient_array(l.i_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu); + copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1); + gradient_array(l.i_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu); copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wi.delta, 1); s.input = l.prev_state_cpu; s.delta = l.dh_cpu; - backward_connected_layer(wi, s); + backward_connected_layer(wi, s); copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ui.delta, 1); s.input = state.input; s.delta = state.delta; - backward_connected_layer(ui, s); + backward_connected_layer(ui, s); - copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); + copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); mul_cpu(l.outputs*l.batch, l.prev_cell_cpu, 1, l.temp_cpu, 1); gradient_array(l.f_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu); copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wf.delta, 1); s.input = l.prev_state_cpu; s.delta = l.dh_cpu; - backward_connected_layer(wf, s); + backward_connected_layer(wf, s); copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uf.delta, 1); s.input = state.input; s.delta = state.delta; - backward_connected_layer(uf, s); + backward_connected_layer(uf, s); - copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); - mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.temp_cpu, 1); - copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, l.dc_cpu, 1); + copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1); + mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.temp_cpu, 1); + copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, l.dc_cpu, 1); state.input -= l.inputs*l.batch; if (state.delta) state.delta -= l.inputs*l.batch; @@ -425,44 +425,44 @@ void forward_lstm_layer_gpu(layer l, network_state state) for (i = 0; i < l.steps; ++i) { s.input = l.state_gpu; - forward_connected_layer_gpu(wf, s); - forward_connected_layer_gpu(wi, s); - forward_connected_layer_gpu(wg, s); - forward_connected_layer_gpu(wo, s); + forward_connected_layer_gpu(wf, s); + forward_connected_layer_gpu(wi, s); + forward_connected_layer_gpu(wg, s); + forward_connected_layer_gpu(wo, s); s.input = state.input; - forward_connected_layer_gpu(uf, s); - forward_connected_layer_gpu(ui, s); - forward_connected_layer_gpu(ug, s); - forward_connected_layer_gpu(uo, s); + forward_connected_layer_gpu(uf, s); + forward_connected_layer_gpu(ui, s); + forward_connected_layer_gpu(ug, s); + forward_connected_layer_gpu(uo, s); copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1); axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1); - copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1); + copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1); - copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1); + copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1); - copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1); + copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1); - activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC); - activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC); - activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH); - activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH); + activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC); - copy_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1); + mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1); + mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.c_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1, l.c_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1); - activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH); - mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.h_gpu, 1); + activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH); + mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.h_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1); copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1); state.input += l.inputs*l.batch; @@ -521,90 +521,90 @@ void backward_lstm_layer_gpu(layer l, network_state state) l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch; - copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1); + copy_ongpu(l.outputs*l.batch, wf.output_gpu, 1, l.f_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, uf.output_gpu, 1, l.f_gpu, 1); - copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1); + copy_ongpu(l.outputs*l.batch, wi.output_gpu, 1, l.i_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, ui.output_gpu, 1, l.i_gpu, 1); - copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1); + copy_ongpu(l.outputs*l.batch, wg.output_gpu, 1, l.g_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, ug.output_gpu, 1, l.g_gpu, 1); - copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1); - axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1); + copy_ongpu(l.outputs*l.batch, wo.output_gpu, 1, l.o_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, uo.output_gpu, 1, l.o_gpu, 1); - activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC); - activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC); - activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH); - activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC); + activate_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH); + activate_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC); - copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1); - activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH); + copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1); + activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH); - copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1); + mul_ongpu(l.outputs*l.batch, l.o_gpu, 1, l.temp2_gpu, 1); gradient_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH, l.temp2_gpu); - axpy_ongpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1); + axpy_ongpu(l.outputs*l.batch, 1, l.dc_gpu, 1, l.temp2_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1); - activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH); - mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.temp_gpu, 1); + activate_array_ongpu(l.temp_gpu, l.outputs*l.batch, TANH); + mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1); gradient_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu); copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1); s.input = l.prev_state_gpu; - s.delta = l.dh_gpu; - backward_connected_layer_gpu(wo, s); + s.delta = l.dh_gpu; + backward_connected_layer_gpu(wo, s); copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1); s.input = state.input; s.delta = state.delta; - backward_connected_layer_gpu(uo, s); + backward_connected_layer_gpu(uo, s); - copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1); - gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu); + copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); + mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1); + gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu); copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1); s.input = l.prev_state_gpu; - s.delta = l.dh_gpu; - backward_connected_layer_gpu(wg, s); + s.delta = l.dh_gpu; + backward_connected_layer_gpu(wg, s); copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1); s.input = state.input; s.delta = state.delta; - backward_connected_layer_gpu(ug, s); + backward_connected_layer_gpu(ug, s); - copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1); - gradient_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu); + copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); + mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1); + gradient_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu); copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1); s.input = l.prev_state_gpu; s.delta = l.dh_gpu; - backward_connected_layer_gpu(wi, s); + backward_connected_layer_gpu(wi, s); copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1); s.input = state.input; s.delta = state.delta; - backward_connected_layer_gpu(ui, s); + backward_connected_layer_gpu(ui, s); - copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); mul_ongpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1); gradient_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu); copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1); s.input = l.prev_state_gpu; s.delta = l.dh_gpu; - backward_connected_layer_gpu(wf, s); + backward_connected_layer_gpu(wf, s); copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1); s.input = state.input; s.delta = state.delta; - backward_connected_layer_gpu(uf, s); + backward_connected_layer_gpu(uf, s); - copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); - mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1); - copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1); + mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1); + copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1); state.input -= l.inputs*l.batch; if (state.delta) state.delta -= l.inputs*l.batch; diff --git a/src/parser.c b/src/parser.c index 644aa339..aae5f849 100644 --- a/src/parser.c +++ b/src/parser.c @@ -238,7 +238,7 @@ connected_layer parse_connected(list *options, size_params params) ACTIVATION activation = get_activation(activation_s); int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); - connected_layer layer = make_connected_layer(params.batch, params.inputs, output, activation, batch_normalize); + connected_layer layer = make_connected_layer(params.batch, 1, params.inputs, output, activation, batch_normalize); return layer; } diff --git a/src/rnn.c b/src/rnn.c index eca6f554..da49bd21 100644 --- a/src/rnn.c +++ b/src/rnn.c @@ -137,16 +137,16 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear, } else { FILE *fp = fopen(filename, "rb"); - fseek(fp, 0, SEEK_END); + fseek(fp, 0, SEEK_END); size = ftell(fp); - fseek(fp, 0, SEEK_SET); + fseek(fp, 0, SEEK_SET); text = calloc(size+1, sizeof(char)); fread(text, 1, size, fp); fclose(fp); } - char *backup_directory = "/home/pjreddie/backup/"; + char *backup_directory = "backup"; char *base = basecfg(cfgfile); fprintf(stderr, "%s\n", base); float avg_loss = -1; diff --git a/src/rnn_layer.c b/src/rnn_layer.c index 83fda13e..1b377b14 100644 --- a/src/rnn_layer.c +++ b/src/rnn_layer.c @@ -41,17 +41,17 @@ layer make_rnn_layer(int batch, int inputs, int hidden, int outputs, int steps, l.input_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.input_layer) = make_connected_layer(batch*steps, inputs, hidden, activation, batch_normalize); + *(l.input_layer) = make_connected_layer(batch, steps, inputs, hidden, activation, batch_normalize); l.input_layer->batch = batch; l.self_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.self_layer) = make_connected_layer(batch*steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize); + *(l.self_layer) = make_connected_layer(batch, steps, hidden, hidden, (log==2)?LOGGY:(log==1?LOGISTIC:activation), batch_normalize); l.self_layer->batch = batch; l.output_layer = malloc(sizeof(layer)); fprintf(stderr, "\t\t"); - *(l.output_layer) = make_connected_layer(batch*steps, hidden, outputs, activation, batch_normalize); + *(l.output_layer) = make_connected_layer(batch, steps, hidden, outputs, activation, batch_normalize); l.output_layer->batch = batch; l.outputs = outputs; @@ -95,6 +95,7 @@ void forward_rnn_layer(layer l, network_state state) if(state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1); for (i = 0; i < l.steps; ++i) { + s.input = state.input; forward_connected_layer(input_layer, s); @@ -209,6 +210,7 @@ void forward_rnn_layer_gpu(layer l, network_state state) if(state.train) fill_ongpu(l.hidden * l.batch, 0, l.state_gpu, 1); for (i = 0; i < l.steps; ++i) { + s.input = state.input; forward_connected_layer_gpu(input_layer, s);