mirror of
https://github.com/pjreddie/darknet.git
synced 2023-08-10 21:13:14 +03:00
:charmandra: 🔥 🔥 🔥
This commit is contained in:
parent
c3e0d90e9f
commit
d8c5cfd6c6
2
Makefile
2
Makefile
@ -10,7 +10,7 @@ ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \
|
|||||||
-gencode arch=compute_52,code=[sm_52,compute_52]
|
-gencode arch=compute_52,code=[sm_52,compute_52]
|
||||||
|
|
||||||
# This is what I use, uncomment if you know your arch and want to specify
|
# This is what I use, uncomment if you know your arch and want to specify
|
||||||
ARCH= -gencode arch=compute_52,code=compute_52
|
# ARCH= -gencode arch=compute_52,code=compute_52
|
||||||
|
|
||||||
VPATH=./src/:./examples
|
VPATH=./src/:./examples
|
||||||
LIB=libdarknet.so
|
LIB=libdarknet.so
|
||||||
|
19
cfg/gru.cfg
19
cfg/gru.cfg
@ -1,27 +1,38 @@
|
|||||||
[net]
|
[net]
|
||||||
subdivisions=1
|
|
||||||
inputs=256
|
inputs=256
|
||||||
|
|
||||||
|
# Test
|
||||||
batch = 1
|
batch = 1
|
||||||
|
time_steps=1
|
||||||
|
|
||||||
|
# Train
|
||||||
|
# batch = 512
|
||||||
|
# time_steps=64
|
||||||
|
|
||||||
|
subdivisions=1
|
||||||
momentum=0.9
|
momentum=0.9
|
||||||
decay=0.001
|
decay=0.001
|
||||||
time_steps=1
|
learning_rate=0.1
|
||||||
learning_rate=0.5
|
|
||||||
|
|
||||||
|
burn_in=100
|
||||||
policy=poly
|
policy=poly
|
||||||
power=4
|
power=4
|
||||||
max_batches=2000
|
max_batches=10000
|
||||||
|
|
||||||
[gru]
|
[gru]
|
||||||
batch_normalize=1
|
batch_normalize=1
|
||||||
output = 1024
|
output = 1024
|
||||||
|
tanh = 1
|
||||||
|
|
||||||
[gru]
|
[gru]
|
||||||
batch_normalize=1
|
batch_normalize=1
|
||||||
output = 1024
|
output = 1024
|
||||||
|
tanh = 1
|
||||||
|
|
||||||
[gru]
|
[gru]
|
||||||
batch_normalize=1
|
batch_normalize=1
|
||||||
output = 1024
|
output = 1024
|
||||||
|
tanh = 1
|
||||||
|
|
||||||
[connected]
|
[connected]
|
||||||
output=256
|
output=256
|
||||||
|
@ -150,7 +150,7 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
|
|||||||
}
|
}
|
||||||
|
|
||||||
int inputs = net.inputs;
|
int inputs = net.inputs;
|
||||||
fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g, Inputs: %d\n", net.learning_rate, net.momentum, net.decay, inputs);
|
fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g, Inputs: %d %d %d\n", net.learning_rate, net.momentum, net.decay, inputs, net.batch, net.time_steps);
|
||||||
int batch = net.batch;
|
int batch = net.batch;
|
||||||
int steps = net.time_steps;
|
int steps = net.time_steps;
|
||||||
if(clear) *net.seen = 0;
|
if(clear) *net.seen = 0;
|
||||||
@ -174,8 +174,8 @@ void train_char_rnn(char *cfgfile, char *weightfile, char *filename, int clear,
|
|||||||
p = get_rnn_data(text, offsets, inputs, size, streams, steps);
|
p = get_rnn_data(text, offsets, inputs, size, streams, steps);
|
||||||
}
|
}
|
||||||
|
|
||||||
memcpy(net.input, p.x, net.inputs*net.batch);
|
copy_cpu(net.inputs*net.batch, p.x, 1, net.input, 1);
|
||||||
memcpy(net.truth, p.y, net.truths*net.batch);
|
copy_cpu(net.truths*net.batch, p.y, 1, net.truth, 1);
|
||||||
float loss = train_network_datum(net) / (batch);
|
float loss = train_network_datum(net) / (batch);
|
||||||
free(p.x);
|
free(p.x);
|
||||||
free(p.y);
|
free(p.y);
|
||||||
|
@ -99,8 +99,8 @@ void train_vid_rnn(char *cfgfile, char *weightfile)
|
|||||||
time=clock();
|
time=clock();
|
||||||
float_pair p = get_rnn_vid_data(extractor, paths, N, batch, steps);
|
float_pair p = get_rnn_vid_data(extractor, paths, N, batch, steps);
|
||||||
|
|
||||||
memcpy(net.input, p.x, net.inputs*net.batch);
|
copy_cpu(net.inputs*net.batch, p.x, 1, net.input, 1);
|
||||||
memcpy(net.truth, p.y, net.truths*net.batch);
|
copy_cpu(net.truths*net.batch, p.y, 1, net.truth, 1);
|
||||||
float loss = train_network_datum(net) / (net.batch);
|
float loss = train_network_datum(net) / (net.batch);
|
||||||
|
|
||||||
|
|
||||||
|
@ -154,6 +154,7 @@ struct layer{
|
|||||||
int noadjust;
|
int noadjust;
|
||||||
int reorg;
|
int reorg;
|
||||||
int log;
|
int log;
|
||||||
|
int tanh;
|
||||||
|
|
||||||
int adam;
|
int adam;
|
||||||
float B1;
|
float B1;
|
||||||
@ -237,9 +238,26 @@ struct layer{
|
|||||||
float * scale_m;
|
float * scale_m;
|
||||||
float * scale_v;
|
float * scale_v;
|
||||||
|
|
||||||
float * z_cpu;
|
|
||||||
float * r_cpu;
|
float *z_cpu;
|
||||||
float * h_cpu;
|
float *r_cpu;
|
||||||
|
float *h_cpu;
|
||||||
|
float * prev_state_cpu;
|
||||||
|
|
||||||
|
float *temp_cpu;
|
||||||
|
float *temp2_cpu;
|
||||||
|
float *temp3_cpu;
|
||||||
|
|
||||||
|
float *dh_cpu;
|
||||||
|
float *hh_cpu;
|
||||||
|
float *prev_cell_cpu;
|
||||||
|
float *cell_cpu;
|
||||||
|
float *f_cpu;
|
||||||
|
float *i_cpu;
|
||||||
|
float *g_cpu;
|
||||||
|
float *o_cpu;
|
||||||
|
float *c_cpu;
|
||||||
|
float *dc_cpu;
|
||||||
|
|
||||||
float * binary_input;
|
float * binary_input;
|
||||||
|
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
#include "connected_layer.h"
|
#include "connected_layer.h"
|
||||||
|
#include "convolutional_layer.h"
|
||||||
#include "batchnorm_layer.h"
|
#include "batchnorm_layer.h"
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
#include "cuda.h"
|
#include "cuda.h"
|
||||||
@ -83,9 +84,6 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
|
|||||||
l.output_gpu = cuda_make_array(l.output, outputs*batch);
|
l.output_gpu = cuda_make_array(l.output, outputs*batch);
|
||||||
l.delta_gpu = cuda_make_array(l.delta, outputs*batch);
|
l.delta_gpu = cuda_make_array(l.delta, outputs*batch);
|
||||||
if(batch_normalize){
|
if(batch_normalize){
|
||||||
l.scales_gpu = cuda_make_array(l.scales, outputs);
|
|
||||||
l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);
|
|
||||||
|
|
||||||
l.mean_gpu = cuda_make_array(l.mean, outputs);
|
l.mean_gpu = cuda_make_array(l.mean, outputs);
|
||||||
l.variance_gpu = cuda_make_array(l.variance, outputs);
|
l.variance_gpu = cuda_make_array(l.variance, outputs);
|
||||||
|
|
||||||
@ -95,6 +93,9 @@ connected_layer make_connected_layer(int batch, int inputs, int outputs, ACTIVAT
|
|||||||
l.mean_delta_gpu = cuda_make_array(l.mean, outputs);
|
l.mean_delta_gpu = cuda_make_array(l.mean, outputs);
|
||||||
l.variance_delta_gpu = cuda_make_array(l.variance, outputs);
|
l.variance_delta_gpu = cuda_make_array(l.variance, outputs);
|
||||||
|
|
||||||
|
l.scales_gpu = cuda_make_array(l.scales, outputs);
|
||||||
|
l.scale_updates_gpu = cuda_make_array(l.scale_updates, outputs);
|
||||||
|
|
||||||
l.x_gpu = cuda_make_array(l.output, l.batch*outputs);
|
l.x_gpu = cuda_make_array(l.output, l.batch*outputs);
|
||||||
l.x_norm_gpu = cuda_make_array(l.output, l.batch*outputs);
|
l.x_norm_gpu = cuda_make_array(l.output, l.batch*outputs);
|
||||||
#ifdef CUDNN
|
#ifdef CUDNN
|
||||||
@ -127,7 +128,6 @@ void update_connected_layer(connected_layer l, int batch, float learning_rate, f
|
|||||||
|
|
||||||
void forward_connected_layer(connected_layer l, network net)
|
void forward_connected_layer(connected_layer l, network net)
|
||||||
{
|
{
|
||||||
int i;
|
|
||||||
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
|
fill_cpu(l.outputs*l.batch, 0, l.output, 1);
|
||||||
int m = l.batch;
|
int m = l.batch;
|
||||||
int k = l.inputs;
|
int k = l.inputs;
|
||||||
@ -137,44 +137,21 @@ void forward_connected_layer(connected_layer l, network net)
|
|||||||
float *c = l.output;
|
float *c = l.output;
|
||||||
gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||||
if(l.batch_normalize){
|
if(l.batch_normalize){
|
||||||
if(net.train){
|
forward_batchnorm_layer(l, net);
|
||||||
mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
|
|
||||||
variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);
|
|
||||||
|
|
||||||
scal_cpu(l.outputs, .95, l.rolling_mean, 1);
|
|
||||||
axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
|
|
||||||
scal_cpu(l.outputs, .95, l.rolling_variance, 1);
|
|
||||||
axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);
|
|
||||||
|
|
||||||
copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
|
|
||||||
normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
|
|
||||||
copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
|
|
||||||
} else {
|
} else {
|
||||||
normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
|
add_bias(l.output, l.biases, l.batch, l.outputs, 1);
|
||||||
}
|
|
||||||
scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
|
|
||||||
}
|
|
||||||
for(i = 0; i < l.batch; ++i){
|
|
||||||
axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
|
|
||||||
}
|
}
|
||||||
activate_array(l.output, l.outputs*l.batch, l.activation);
|
activate_array(l.output, l.outputs*l.batch, l.activation);
|
||||||
}
|
}
|
||||||
|
|
||||||
void backward_connected_layer(connected_layer l, network net)
|
void backward_connected_layer(connected_layer l, network net)
|
||||||
{
|
{
|
||||||
int i;
|
|
||||||
gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
|
gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta);
|
||||||
for(i = 0; i < l.batch; ++i){
|
|
||||||
axpy_cpu(l.outputs, 1, l.delta + i*l.outputs, 1, l.bias_updates, 1);
|
|
||||||
}
|
|
||||||
if(l.batch_normalize){
|
if(l.batch_normalize){
|
||||||
backward_scale_cpu(l.x_norm, l.delta, l.batch, l.outputs, 1, l.scale_updates);
|
backward_batchnorm_layer(l, net);
|
||||||
|
} else {
|
||||||
scale_bias(l.delta, l.scales, l.batch, l.outputs, 1);
|
backward_bias(l.bias_updates, l.delta, l.batch, l.outputs, 1);
|
||||||
|
|
||||||
mean_delta_cpu(l.delta, l.variance, l.batch, l.outputs, 1, l.mean_delta);
|
|
||||||
variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.outputs, 1, l.variance_delta);
|
|
||||||
normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.outputs, 1, l.delta);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int m = l.outputs;
|
int m = l.outputs;
|
||||||
@ -276,7 +253,6 @@ void update_connected_layer_gpu(connected_layer l, int batch, float learning_rat
|
|||||||
|
|
||||||
void forward_connected_layer_gpu(connected_layer l, network net)
|
void forward_connected_layer_gpu(connected_layer l, network net)
|
||||||
{
|
{
|
||||||
int i;
|
|
||||||
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
|
||||||
|
|
||||||
int m = l.batch;
|
int m = l.batch;
|
||||||
@ -286,26 +262,23 @@ void forward_connected_layer_gpu(connected_layer l, network net)
|
|||||||
float * b = l.weights_gpu;
|
float * b = l.weights_gpu;
|
||||||
float * c = l.output_gpu;
|
float * c = l.output_gpu;
|
||||||
gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n);
|
||||||
if(l.batch_normalize){
|
|
||||||
|
if (l.batch_normalize) {
|
||||||
forward_batchnorm_layer_gpu(l, net);
|
forward_batchnorm_layer_gpu(l, net);
|
||||||
}
|
} else {
|
||||||
for(i = 0; i < l.batch; ++i){
|
add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.outputs, 1);
|
||||||
axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + i*l.outputs, 1);
|
|
||||||
}
|
}
|
||||||
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
|
||||||
}
|
}
|
||||||
|
|
||||||
void backward_connected_layer_gpu(connected_layer l, network net)
|
void backward_connected_layer_gpu(connected_layer l, network net)
|
||||||
{
|
{
|
||||||
int i;
|
|
||||||
constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
|
constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1);
|
||||||
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);
|
||||||
for(i = 0; i < l.batch; ++i){
|
|
||||||
axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
if(l.batch_normalize){
|
if(l.batch_normalize){
|
||||||
backward_batchnorm_layer_gpu(l, net);
|
backward_batchnorm_layer_gpu(l, net);
|
||||||
|
} else {
|
||||||
|
backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.outputs, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
int m = l.outputs;
|
int m = l.outputs;
|
||||||
|
@ -120,7 +120,7 @@ int float_abs_compare (const void * a, const void * b)
|
|||||||
|
|
||||||
void forward_cost_layer_gpu(cost_layer l, network net)
|
void forward_cost_layer_gpu(cost_layer l, network net)
|
||||||
{
|
{
|
||||||
if (!net.truth) return;
|
if (!net.truth_gpu) return;
|
||||||
if(l.smooth){
|
if(l.smooth){
|
||||||
scal_ongpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1);
|
scal_ongpu(l.batch*l.inputs, (1-l.smooth), net.truth_gpu, 1);
|
||||||
add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1);
|
add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, net.truth_gpu, 1);
|
||||||
|
@ -26,7 +26,7 @@ int cuda_get_device()
|
|||||||
|
|
||||||
void check_error(cudaError_t status)
|
void check_error(cudaError_t status)
|
||||||
{
|
{
|
||||||
//cudaDeviceSynchronize();
|
cudaDeviceSynchronize();
|
||||||
cudaError_t status2 = cudaGetLastError();
|
cudaError_t status2 = cudaGetLastError();
|
||||||
if (status != cudaSuccess)
|
if (status != cudaSuccess)
|
||||||
{
|
{
|
||||||
|
367
src/gru_layer.c
367
src/gru_layer.c
@ -30,47 +30,67 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
|
|||||||
{
|
{
|
||||||
fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs);
|
fprintf(stderr, "GRU Layer: %d inputs, %d outputs\n", inputs, outputs);
|
||||||
batch = batch / steps;
|
batch = batch / steps;
|
||||||
layer l = { 0 };
|
layer l = {0};
|
||||||
l.batch = batch;
|
l.batch = batch;
|
||||||
l.type = GRU;
|
l.type = GRU;
|
||||||
l.steps = steps;
|
l.steps = steps;
|
||||||
l.inputs = inputs;
|
l.inputs = inputs;
|
||||||
|
|
||||||
l.wz = malloc(sizeof(layer));
|
|
||||||
fprintf(stderr, "\t\t");
|
|
||||||
*(l.wz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
|
|
||||||
l.wz->batch = batch;
|
|
||||||
|
|
||||||
l.uz = malloc(sizeof(layer));
|
l.uz = malloc(sizeof(layer));
|
||||||
fprintf(stderr, "\t\t");
|
fprintf(stderr, "\t\t");
|
||||||
*(l.uz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
|
*(l.uz) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
|
||||||
l.uz->batch = batch;
|
l.uz->batch = batch;
|
||||||
|
|
||||||
l.wr = malloc(sizeof(layer));
|
l.wz = malloc(sizeof(layer));
|
||||||
fprintf(stderr, "\t\t");
|
fprintf(stderr, "\t\t");
|
||||||
*(l.wr) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
|
*(l.wz) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
|
||||||
l.wr->batch = batch;
|
l.wz->batch = batch;
|
||||||
|
|
||||||
l.ur = malloc(sizeof(layer));
|
l.ur = malloc(sizeof(layer));
|
||||||
fprintf(stderr, "\t\t");
|
fprintf(stderr, "\t\t");
|
||||||
*(l.ur) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
|
*(l.ur) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
|
||||||
l.ur->batch = batch;
|
l.ur->batch = batch;
|
||||||
|
|
||||||
l.wh = malloc(sizeof(layer));
|
l.wr = malloc(sizeof(layer));
|
||||||
fprintf(stderr, "\t\t");
|
fprintf(stderr, "\t\t");
|
||||||
*(l.wh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
|
*(l.wr) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
|
||||||
l.wh->batch = batch;
|
l.wr->batch = batch;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
l.uh = malloc(sizeof(layer));
|
l.uh = malloc(sizeof(layer));
|
||||||
fprintf(stderr, "\t\t");
|
fprintf(stderr, "\t\t");
|
||||||
*(l.uh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
|
*(l.uh) = make_connected_layer(batch*steps, inputs, outputs, LINEAR, batch_normalize);
|
||||||
l.uh->batch = batch;
|
l.uh->batch = batch;
|
||||||
|
|
||||||
l.batch_normalize = batch_normalize;
|
l.wh = malloc(sizeof(layer));
|
||||||
l.outputs = outputs;
|
fprintf(stderr, "\t\t");
|
||||||
|
*(l.wh) = make_connected_layer(batch*steps, outputs, outputs, LINEAR, batch_normalize);
|
||||||
|
l.wh->batch = batch;
|
||||||
|
|
||||||
|
#ifdef CUDNN
|
||||||
|
cudnnSetTensor4dDescriptor(l.uz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uz->out_c, l.uz->out_h, l.uz->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.uh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uh->out_c, l.uh->out_h, l.uh->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.ur->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ur->out_c, l.ur->out_h, l.ur->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.wz->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wz->out_c, l.wz->out_h, l.wz->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.wh->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wh->out_c, l.wh->out_h, l.wh->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.wr->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wr->out_c, l.wr->out_h, l.wr->out_w);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
l.batch_normalize = batch_normalize;
|
||||||
|
|
||||||
|
|
||||||
|
l.outputs = outputs;
|
||||||
l.output = calloc(outputs*batch*steps, sizeof(float));
|
l.output = calloc(outputs*batch*steps, sizeof(float));
|
||||||
|
l.delta = calloc(outputs*batch*steps, sizeof(float));
|
||||||
l.state = calloc(outputs*batch, sizeof(float));
|
l.state = calloc(outputs*batch, sizeof(float));
|
||||||
|
l.prev_state = calloc(outputs*batch, sizeof(float));
|
||||||
|
l.forgot_state = calloc(outputs*batch, sizeof(float));
|
||||||
|
l.forgot_delta = calloc(outputs*batch, sizeof(float));
|
||||||
|
|
||||||
|
l.r_cpu = calloc(outputs*batch, sizeof(float));
|
||||||
|
l.z_cpu = calloc(outputs*batch, sizeof(float));
|
||||||
|
l.h_cpu = calloc(outputs*batch, sizeof(float));
|
||||||
|
|
||||||
l.forward = forward_gru_layer;
|
l.forward = forward_gru_layer;
|
||||||
l.backward = backward_gru_layer;
|
l.backward = backward_gru_layer;
|
||||||
@ -81,31 +101,104 @@ layer make_gru_layer(int batch, int inputs, int outputs, int steps, int batch_no
|
|||||||
l.backward_gpu = backward_gru_layer_gpu;
|
l.backward_gpu = backward_gru_layer_gpu;
|
||||||
l.update_gpu = update_gru_layer_gpu;
|
l.update_gpu = update_gru_layer_gpu;
|
||||||
|
|
||||||
|
l.forgot_state_gpu = cuda_make_array(0, batch*outputs);
|
||||||
|
l.forgot_delta_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.prev_state_gpu = cuda_make_array(0, batch*outputs);
|
l.prev_state_gpu = cuda_make_array(0, batch*outputs);
|
||||||
|
l.state_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.output_gpu = cuda_make_array(0, batch*outputs*steps);
|
l.output_gpu = cuda_make_array(0, batch*outputs*steps);
|
||||||
l.delta_gpu = cuda_make_array(0, batch*outputs*steps);
|
l.delta_gpu = cuda_make_array(0, batch*outputs*steps);
|
||||||
|
l.r_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.r_gpu = cuda_make_array(l.output, batch*outputs);
|
l.z_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.z_gpu = cuda_make_array(l.output, batch*outputs);
|
l.h_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.hh_gpu = cuda_make_array(l.output, batch*outputs);
|
|
||||||
l.h_gpu = cuda_make_array(l.output, batch*outputs);
|
|
||||||
l.temp_gpu = cuda_make_array(l.output, batch*outputs);
|
|
||||||
l.temp2_gpu = cuda_make_array(l.output, batch*outputs);
|
|
||||||
l.temp3_gpu = cuda_make_array(l.output, batch*outputs);
|
|
||||||
l.dh_gpu = cuda_make_array(l.output, batch*outputs);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return l;
|
return l;
|
||||||
}
|
}
|
||||||
|
|
||||||
void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay)
|
void update_gru_layer(layer l, int batch, float learning_rate, float momentum, float decay)
|
||||||
{
|
{
|
||||||
|
update_connected_layer(*(l.input_layer), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer(*(l.self_layer), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer(*(l.output_layer), batch, learning_rate, momentum, decay);
|
||||||
}
|
}
|
||||||
|
|
||||||
void forward_gru_layer(layer l, network state)
|
void forward_gru_layer(layer l, network net)
|
||||||
{
|
{
|
||||||
|
network s = net;
|
||||||
|
s.train = net.train;
|
||||||
|
int i;
|
||||||
|
layer uz = *(l.uz);
|
||||||
|
layer ur = *(l.ur);
|
||||||
|
layer uh = *(l.uh);
|
||||||
|
|
||||||
|
layer wz = *(l.wz);
|
||||||
|
layer wr = *(l.wr);
|
||||||
|
layer wh = *(l.wh);
|
||||||
|
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, uz.delta, 1);
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, ur.delta, 1);
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, uh.delta, 1);
|
||||||
|
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, wz.delta, 1);
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, wr.delta, 1);
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, wh.delta, 1);
|
||||||
|
if(net.train) {
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
|
||||||
|
copy_cpu(l.outputs*l.batch, l.state, 1, l.prev_state, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < l.steps; ++i) {
|
||||||
|
s.input = l.state;
|
||||||
|
forward_connected_layer(wz, s);
|
||||||
|
forward_connected_layer(wr, s);
|
||||||
|
|
||||||
|
s.input = net.input;
|
||||||
|
forward_connected_layer(uz, s);
|
||||||
|
forward_connected_layer(ur, s);
|
||||||
|
forward_connected_layer(uh, s);
|
||||||
|
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, uz.output, 1, l.z_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, wz.output, 1, l.z_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, ur.output, 1, l.r_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, wr.output, 1, l.r_cpu, 1);
|
||||||
|
|
||||||
|
activate_array(l.z_cpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
activate_array(l.r_cpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.state, 1, l.forgot_state, 1);
|
||||||
|
mul_cpu(l.outputs*l.batch, l.r_cpu, 1, l.forgot_state, 1);
|
||||||
|
|
||||||
|
s.input = l.forgot_state;
|
||||||
|
forward_connected_layer(wh, s);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, uh.output, 1, l.h_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, wh.output, 1, l.h_cpu, 1);
|
||||||
|
|
||||||
|
if(l.tanh){
|
||||||
|
activate_array(l.h_cpu, l.outputs*l.batch, TANH);
|
||||||
|
} else {
|
||||||
|
activate_array(l.h_cpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
}
|
||||||
|
|
||||||
|
weighted_sum_cpu(l.state, l.h_cpu, l.z_cpu, l.outputs*l.batch, l.output);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.output, 1, l.state, 1);
|
||||||
|
|
||||||
|
net.input += l.inputs*l.batch;
|
||||||
|
l.output += l.outputs*l.batch;
|
||||||
|
increment_layer(&uz, 1);
|
||||||
|
increment_layer(&ur, 1);
|
||||||
|
increment_layer(&uh, 1);
|
||||||
|
|
||||||
|
increment_layer(&wz, 1);
|
||||||
|
increment_layer(&wr, 1);
|
||||||
|
increment_layer(&wh, 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void backward_gru_layer(layer l, network state)
|
void backward_gru_layer(layer l, network net)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -121,201 +214,187 @@ void push_gru_layer(layer l)
|
|||||||
|
|
||||||
void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
|
void update_gru_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay)
|
||||||
{
|
{
|
||||||
update_connected_layer_gpu(*(l.wr), batch, learning_rate, momentum, decay);
|
|
||||||
update_connected_layer_gpu(*(l.wz), batch, learning_rate, momentum, decay);
|
|
||||||
update_connected_layer_gpu(*(l.wh), batch, learning_rate, momentum, decay);
|
|
||||||
update_connected_layer_gpu(*(l.ur), batch, learning_rate, momentum, decay);
|
update_connected_layer_gpu(*(l.ur), batch, learning_rate, momentum, decay);
|
||||||
update_connected_layer_gpu(*(l.uz), batch, learning_rate, momentum, decay);
|
update_connected_layer_gpu(*(l.uz), batch, learning_rate, momentum, decay);
|
||||||
update_connected_layer_gpu(*(l.uh), batch, learning_rate, momentum, decay);
|
update_connected_layer_gpu(*(l.uh), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer_gpu(*(l.wr), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer_gpu(*(l.wz), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer_gpu(*(l.wh), batch, learning_rate, momentum, decay);
|
||||||
}
|
}
|
||||||
|
|
||||||
void forward_gru_layer_gpu(layer l, network state)
|
void forward_gru_layer_gpu(layer l, network net)
|
||||||
{
|
{
|
||||||
network s = { 0 };
|
network s = {0};
|
||||||
s.train = state.train;
|
s.train = net.train;
|
||||||
int i;
|
int i;
|
||||||
layer wz = *(l.wz);
|
|
||||||
layer wr = *(l.wr);
|
|
||||||
layer wh = *(l.wh);
|
|
||||||
|
|
||||||
layer uz = *(l.uz);
|
layer uz = *(l.uz);
|
||||||
layer ur = *(l.ur);
|
layer ur = *(l.ur);
|
||||||
layer uh = *(l.uh);
|
layer uh = *(l.uh);
|
||||||
|
|
||||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, wz.delta_gpu, 1);
|
layer wz = *(l.wz);
|
||||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, wr.delta_gpu, 1);
|
layer wr = *(l.wr);
|
||||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, wh.delta_gpu, 1);
|
layer wh = *(l.wh);
|
||||||
|
|
||||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, uz.delta_gpu, 1);
|
fill_ongpu(l.outputs * l.batch * l.steps, 0, uz.delta_gpu, 1);
|
||||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, ur.delta_gpu, 1);
|
fill_ongpu(l.outputs * l.batch * l.steps, 0, ur.delta_gpu, 1);
|
||||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, uh.delta_gpu, 1);
|
fill_ongpu(l.outputs * l.batch * l.steps, 0, uh.delta_gpu, 1);
|
||||||
|
|
||||||
if (state.train) {
|
fill_ongpu(l.outputs * l.batch * l.steps, 0, wz.delta_gpu, 1);
|
||||||
|
fill_ongpu(l.outputs * l.batch * l.steps, 0, wr.delta_gpu, 1);
|
||||||
|
fill_ongpu(l.outputs * l.batch * l.steps, 0, wh.delta_gpu, 1);
|
||||||
|
if(net.train) {
|
||||||
fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
|
fill_ongpu(l.outputs * l.batch * l.steps, 0, l.delta_gpu, 1);
|
||||||
|
copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.prev_state_gpu, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < l.steps; ++i) {
|
for (i = 0; i < l.steps; ++i) {
|
||||||
s.input = l.h_gpu;
|
s.input_gpu = l.state_gpu;
|
||||||
forward_connected_layer_gpu(uz, s);
|
|
||||||
forward_connected_layer_gpu(ur, s);
|
|
||||||
|
|
||||||
s.input = state.input;
|
|
||||||
forward_connected_layer_gpu(wz, s);
|
forward_connected_layer_gpu(wz, s);
|
||||||
forward_connected_layer_gpu(wr, s);
|
forward_connected_layer_gpu(wr, s);
|
||||||
forward_connected_layer_gpu(wh, s);
|
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, wz.output_gpu, 1, l.z_gpu, 1);
|
s.input_gpu = net.input_gpu;
|
||||||
axpy_ongpu(l.outputs*l.batch, 1, uz.output_gpu, 1, l.z_gpu, 1);
|
forward_connected_layer_gpu(uz, s);
|
||||||
|
forward_connected_layer_gpu(ur, s);
|
||||||
|
forward_connected_layer_gpu(uh, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, wr.output_gpu, 1, l.r_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
|
||||||
axpy_ongpu(l.outputs*l.batch, 1, ur.output_gpu, 1, l.r_gpu, 1);
|
axpy_ongpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
|
||||||
|
|
||||||
|
copy_ongpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
|
||||||
|
axpy_ongpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
|
||||||
|
|
||||||
activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
|
activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
|
||||||
activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
|
activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.hh_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.state_gpu, 1, l.forgot_state_gpu, 1);
|
||||||
mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.hh_gpu, 1);
|
mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
|
||||||
|
|
||||||
s.input = l.hh_gpu;
|
s.input_gpu = l.forgot_state_gpu;
|
||||||
forward_connected_layer_gpu(uh, s);
|
forward_connected_layer_gpu(wh, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, wh.output_gpu, 1, l.hh_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
|
||||||
axpy_ongpu(l.outputs*l.batch, 1, uh.output_gpu, 1, l.hh_gpu, 1);
|
axpy_ongpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
|
||||||
|
|
||||||
activate_array_ongpu(l.hh_gpu, l.outputs*l.batch, TANH);
|
if(l.tanh){
|
||||||
|
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
|
||||||
|
} else {
|
||||||
|
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
}
|
||||||
|
|
||||||
weighted_sum_gpu(l.h_gpu, l.hh_gpu, l.z_gpu, l.outputs*l.batch, l.output_gpu);
|
weighted_sum_gpu(l.state_gpu, l.h_gpu, l.z_gpu, l.outputs*l.batch, l.output_gpu);
|
||||||
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.h_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.state_gpu, 1);
|
||||||
|
|
||||||
state.input += l.inputs*l.batch;
|
net.input_gpu += l.inputs*l.batch;
|
||||||
l.output_gpu += l.outputs*l.batch;
|
l.output_gpu += l.outputs*l.batch;
|
||||||
|
increment_layer(&uz, 1);
|
||||||
|
increment_layer(&ur, 1);
|
||||||
|
increment_layer(&uh, 1);
|
||||||
|
|
||||||
increment_layer(&wz, 1);
|
increment_layer(&wz, 1);
|
||||||
increment_layer(&wr, 1);
|
increment_layer(&wr, 1);
|
||||||
increment_layer(&wh, 1);
|
increment_layer(&wh, 1);
|
||||||
|
|
||||||
increment_layer(&uz, 1);
|
|
||||||
increment_layer(&ur, 1);
|
|
||||||
increment_layer(&uh, 1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void backward_gru_layer_gpu(layer l, network state)
|
void backward_gru_layer_gpu(layer l, network net)
|
||||||
{
|
{
|
||||||
network s = { 0 };
|
network s = {0};
|
||||||
s.train = state.train;
|
s.train = net.train;
|
||||||
int i;
|
int i;
|
||||||
layer wz = *(l.wz);
|
|
||||||
layer wr = *(l.wr);
|
|
||||||
layer wh = *(l.wh);
|
|
||||||
|
|
||||||
layer uz = *(l.uz);
|
layer uz = *(l.uz);
|
||||||
layer ur = *(l.ur);
|
layer ur = *(l.ur);
|
||||||
layer uh = *(l.uh);
|
layer uh = *(l.uh);
|
||||||
|
|
||||||
increment_layer(&wz, l.steps - 1);
|
layer wz = *(l.wz);
|
||||||
increment_layer(&wr, l.steps - 1);
|
layer wr = *(l.wr);
|
||||||
increment_layer(&wh, l.steps - 1);
|
layer wh = *(l.wh);
|
||||||
|
|
||||||
increment_layer(&uz, l.steps - 1);
|
increment_layer(&uz, l.steps - 1);
|
||||||
increment_layer(&ur, l.steps - 1);
|
increment_layer(&ur, l.steps - 1);
|
||||||
increment_layer(&uh, l.steps - 1);
|
increment_layer(&uh, l.steps - 1);
|
||||||
|
|
||||||
state.input += l.inputs*l.batch*(l.steps - 1);
|
increment_layer(&wz, l.steps - 1);
|
||||||
if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
|
increment_layer(&wr, l.steps - 1);
|
||||||
|
increment_layer(&wh, l.steps - 1);
|
||||||
|
|
||||||
l.output_gpu += l.outputs*l.batch*(l.steps - 1);
|
net.input_gpu += l.inputs*l.batch*(l.steps-1);
|
||||||
l.delta_gpu += l.outputs*l.batch*(l.steps - 1);
|
if(net.delta_gpu) net.delta_gpu += l.inputs*l.batch*(l.steps-1);
|
||||||
|
l.output_gpu += l.outputs*l.batch*(l.steps-1);
|
||||||
|
l.delta_gpu += l.outputs*l.batch*(l.steps-1);
|
||||||
|
for (i = l.steps-1; i >= 0; --i) {
|
||||||
|
if(i != 0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
|
||||||
|
float *prev_delta_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
|
||||||
|
|
||||||
for (i = l.steps - 1; i >= 0; --i) {
|
copy_ongpu(l.outputs*l.batch, uz.output_gpu, 1, l.z_gpu, 1);
|
||||||
if (i>0) copy_ongpu(l.outputs*l.batch, l.output_gpu - l.outputs*l.batch, 1, l.prev_state_gpu, 1);
|
axpy_ongpu(l.outputs*l.batch, 1, wz.output_gpu, 1, l.z_gpu, 1);
|
||||||
l.dh_gpu = (i == 0) ? 0 : l.delta_gpu - l.outputs*l.batch;
|
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, wz.output_gpu, 1, l.z_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, ur.output_gpu, 1, l.r_gpu, 1);
|
||||||
axpy_ongpu(l.outputs*l.batch, 1, uz.output_gpu, 1, l.z_gpu, 1);
|
axpy_ongpu(l.outputs*l.batch, 1, wr.output_gpu, 1, l.r_gpu, 1);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, wr.output_gpu, 1, l.r_gpu, 1);
|
|
||||||
axpy_ongpu(l.outputs*l.batch, 1, ur.output_gpu, 1, l.r_gpu, 1);
|
|
||||||
|
|
||||||
activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
|
activate_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC);
|
||||||
activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
|
activate_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, wh.output_gpu, 1, l.hh_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, uh.output_gpu, 1, l.h_gpu, 1);
|
||||||
axpy_ongpu(l.outputs*l.batch, 1, uh.output_gpu, 1, l.hh_gpu, 1);
|
axpy_ongpu(l.outputs*l.batch, 1, wh.output_gpu, 1, l.h_gpu, 1);
|
||||||
|
|
||||||
activate_array_ongpu(l.hh_gpu, l.outputs*l.batch, TANH);
|
if(l.tanh){
|
||||||
|
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH);
|
||||||
|
} else {
|
||||||
|
activate_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
}
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, l.temp3_gpu, 1);
|
weighted_delta_gpu(l.prev_state_gpu, l.h_gpu, l.z_gpu, prev_delta_gpu, uh.delta_gpu, uz.delta_gpu, l.outputs*l.batch, l.delta_gpu);
|
||||||
|
|
||||||
fill_ongpu(l.outputs*l.batch, 1, l.temp_gpu, 1);
|
if(l.tanh){
|
||||||
axpy_ongpu(l.outputs*l.batch, -1, l.z_gpu, 1, l.temp_gpu, 1);
|
gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, TANH, uh.delta_gpu);
|
||||||
mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);
|
} else {
|
||||||
gradient_array_ongpu(l.hh_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
|
gradient_array_ongpu(l.h_gpu, l.outputs*l.batch, LOGISTIC, uh.delta_gpu);
|
||||||
|
}
|
||||||
|
|
||||||
|
copy_ongpu(l.outputs*l.batch, uh.delta_gpu, 1, wh.delta_gpu, 1);
|
||||||
|
|
||||||
|
copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.forgot_state_gpu, 1);
|
||||||
|
mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.forgot_state_gpu, 1);
|
||||||
|
fill_ongpu(l.outputs*l.batch, 0, l.forgot_delta_gpu, 1);
|
||||||
|
|
||||||
|
s.input_gpu = l.forgot_state_gpu;
|
||||||
|
s.delta_gpu = l.forgot_delta_gpu;
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wh.delta_gpu, 1);
|
|
||||||
s.input = state.input;
|
|
||||||
s.delta = state.delta;
|
|
||||||
backward_connected_layer_gpu(wh, s);
|
backward_connected_layer_gpu(wh, s);
|
||||||
|
if(prev_delta_gpu) mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.r_gpu, prev_delta_gpu);
|
||||||
|
mult_add_into_gpu(l.outputs*l.batch, l.forgot_delta_gpu, l.prev_state_gpu, ur.delta_gpu);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.temp2_gpu, 1);
|
gradient_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, ur.delta_gpu);
|
||||||
mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.temp2_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, ur.delta_gpu, 1, wr.delta_gpu, 1);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uh.delta_gpu, 1);
|
gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, uz.delta_gpu);
|
||||||
fill_ongpu(l.outputs*l.batch, 0, l.temp_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, uz.delta_gpu, 1, wz.delta_gpu, 1);
|
||||||
s.input = l.temp2_gpu;
|
|
||||||
s.delta = l.temp_gpu;
|
|
||||||
backward_connected_layer_gpu(uh, s);
|
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.temp2_gpu, 1);
|
s.input_gpu = l.prev_state_gpu;
|
||||||
mul_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.temp2_gpu, 1);
|
s.delta_gpu = prev_delta_gpu;
|
||||||
gradient_array_ongpu(l.r_gpu, l.outputs*l.batch, LOGISTIC, l.temp2_gpu);
|
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, wr.delta_gpu, 1);
|
|
||||||
s.input = state.input;
|
|
||||||
s.delta = state.delta;
|
|
||||||
backward_connected_layer_gpu(wr, s);
|
backward_connected_layer_gpu(wr, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, ur.delta_gpu, 1);
|
|
||||||
s.input = l.prev_state_gpu;
|
|
||||||
s.delta = l.dh_gpu;
|
|
||||||
backward_connected_layer_gpu(ur, s);
|
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.temp2_gpu, 1);
|
|
||||||
mul_ongpu(l.outputs*l.batch, l.r_gpu, 1, l.temp2_gpu, 1);
|
|
||||||
if (l.dh_gpu) axpy_ongpu(l.outputs*l.batch, 1, l.temp2_gpu, 1, l.dh_gpu, 1);
|
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);
|
|
||||||
mul_ongpu(l.outputs*l.batch, l.z_gpu, 1, l.temp2_gpu, 1);
|
|
||||||
if (l.dh_gpu) axpy_ongpu(l.outputs*l.batch, 1, l.temp2_gpu, 1, l.dh_gpu, 1);
|
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp2_gpu, 1);
|
|
||||||
mul_ongpu(l.outputs*l.batch, l.prev_state_gpu, 1, l.temp3_gpu, 1);
|
|
||||||
mul_ongpu(l.outputs*l.batch, l.hh_gpu, 1, l.temp2_gpu, 1);
|
|
||||||
axpy_ongpu(l.outputs*l.batch, -1, l.temp2_gpu, 1, l.temp3_gpu, 1);
|
|
||||||
gradient_array_ongpu(l.z_gpu, l.outputs*l.batch, LOGISTIC, l.temp3_gpu);
|
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, wz.delta_gpu, 1);
|
|
||||||
s.input = state.input;
|
|
||||||
s.delta = state.delta;
|
|
||||||
backward_connected_layer_gpu(wz, s);
|
backward_connected_layer_gpu(wz, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, uz.delta_gpu, 1);
|
s.input_gpu = net.input_gpu;
|
||||||
s.input = l.prev_state_gpu;
|
s.delta_gpu = net.delta_gpu;
|
||||||
s.delta = l.dh_gpu;
|
|
||||||
|
backward_connected_layer_gpu(uh, s);
|
||||||
|
backward_connected_layer_gpu(ur, s);
|
||||||
backward_connected_layer_gpu(uz, s);
|
backward_connected_layer_gpu(uz, s);
|
||||||
|
|
||||||
state.input -= l.inputs*l.batch;
|
|
||||||
if (state.delta) state.delta -= l.inputs*l.batch;
|
net.input_gpu -= l.inputs*l.batch;
|
||||||
|
if(net.delta_gpu) net.delta_gpu -= l.inputs*l.batch;
|
||||||
l.output_gpu -= l.outputs*l.batch;
|
l.output_gpu -= l.outputs*l.batch;
|
||||||
l.delta_gpu -= l.outputs*l.batch;
|
l.delta_gpu -= l.outputs*l.batch;
|
||||||
|
increment_layer(&uz, -1);
|
||||||
|
increment_layer(&ur, -1);
|
||||||
|
increment_layer(&uh, -1);
|
||||||
|
|
||||||
increment_layer(&wz, -1);
|
increment_layer(&wz, -1);
|
||||||
increment_layer(&wr, -1);
|
increment_layer(&wr, -1);
|
||||||
increment_layer(&wh, -1);
|
increment_layer(&wh, -1);
|
||||||
|
|
||||||
increment_layer(&uz, -1);
|
|
||||||
increment_layer(&ur, -1);
|
|
||||||
increment_layer(&uh, -1);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
337
src/lstm_layer.c
337
src/lstm_layer.c
@ -85,29 +85,57 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n
|
|||||||
l.forward = forward_lstm_layer;
|
l.forward = forward_lstm_layer;
|
||||||
l.update = update_lstm_layer;
|
l.update = update_lstm_layer;
|
||||||
|
|
||||||
|
l.prev_state_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.prev_cell_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.cell_cpu = calloc(batch*outputs*steps, sizeof(float));
|
||||||
|
|
||||||
|
l.f_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.i_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.g_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.o_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.c_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.h_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.temp_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.temp2_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.temp3_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.dc_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
l.dh_cpu = calloc(batch*outputs, sizeof(float));
|
||||||
|
|
||||||
#ifdef GPU
|
#ifdef GPU
|
||||||
l.forward_gpu = forward_lstm_layer_gpu;
|
l.forward_gpu = forward_lstm_layer_gpu;
|
||||||
l.backward_gpu = backward_lstm_layer_gpu;
|
l.backward_gpu = backward_lstm_layer_gpu;
|
||||||
l.update_gpu = update_lstm_layer_gpu;
|
l.update_gpu = update_lstm_layer_gpu;
|
||||||
|
|
||||||
l.prev_state_gpu = cuda_make_array(0, batch*outputs);
|
|
||||||
l.prev_cell_gpu = cuda_make_array(0, batch*outputs);
|
|
||||||
|
|
||||||
l.output_gpu = cuda_make_array(0, batch*outputs*steps);
|
l.output_gpu = cuda_make_array(0, batch*outputs*steps);
|
||||||
l.cell_gpu = cuda_make_array(0, batch*outputs*steps);
|
|
||||||
l.delta_gpu = cuda_make_array(0, batch*l.outputs*steps);
|
l.delta_gpu = cuda_make_array(0, batch*l.outputs*steps);
|
||||||
|
|
||||||
l.f_gpu = cuda_make_array(l.output, batch*outputs);
|
l.prev_state_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.i_gpu = cuda_make_array(l.output, batch*outputs);
|
l.prev_cell_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.g_gpu = cuda_make_array(l.output, batch*outputs);
|
l.cell_gpu = cuda_make_array(0, batch*outputs*steps);
|
||||||
l.o_gpu = cuda_make_array(l.output, batch*outputs);
|
|
||||||
l.c_gpu = cuda_make_array(l.output, batch*outputs);
|
l.f_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.h_gpu = cuda_make_array(l.output, batch*outputs);
|
l.i_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.temp_gpu = cuda_make_array(l.output, batch*outputs);
|
l.g_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.temp2_gpu = cuda_make_array(l.output, batch*outputs);
|
l.o_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.temp3_gpu = cuda_make_array(l.output, batch*outputs);
|
l.c_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.dc_gpu = cuda_make_array(l.output, batch*outputs);
|
l.h_gpu = cuda_make_array(0, batch*outputs);
|
||||||
l.dh_gpu = cuda_make_array(l.output, batch*outputs);
|
l.temp_gpu = cuda_make_array(0, batch*outputs);
|
||||||
|
l.temp2_gpu = cuda_make_array(0, batch*outputs);
|
||||||
|
l.temp3_gpu = cuda_make_array(0, batch*outputs);
|
||||||
|
l.dc_gpu = cuda_make_array(0, batch*outputs);
|
||||||
|
l.dh_gpu = cuda_make_array(0, batch*outputs);
|
||||||
|
#ifdef CUDNN
|
||||||
|
cudnnSetTensor4dDescriptor(l.wf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wf->out_c, l.wf->out_h, l.wf->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.wi->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wi->out_c, l.wi->out_h, l.wi->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.wg->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wg->out_c, l.wg->out_h, l.wg->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.wo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.wo->out_c, l.wo->out_h, l.wo->out_w);
|
||||||
|
|
||||||
|
cudnnSetTensor4dDescriptor(l.uf->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uf->out_c, l.uf->out_h, l.uf->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.ui->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ui->out_c, l.ui->out_h, l.ui->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.ug->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.ug->out_c, l.ug->out_h, l.ug->out_w);
|
||||||
|
cudnnSetTensor4dDescriptor(l.uo->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, batch, l.uo->out_c, l.uo->out_h, l.uo->out_w);
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
return l;
|
return l;
|
||||||
@ -115,10 +143,243 @@ layer make_lstm_layer(int batch, int inputs, int outputs, int steps, int batch_n
|
|||||||
|
|
||||||
void update_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay)
|
void update_lstm_layer(layer l, int batch, float learning_rate, float momentum, float decay)
|
||||||
{
|
{
|
||||||
|
update_connected_layer(*(l.wf), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer(*(l.wi), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer(*(l.wg), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer(*(l.wo), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer(*(l.uf), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer(*(l.ui), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer(*(l.ug), batch, learning_rate, momentum, decay);
|
||||||
|
update_connected_layer(*(l.uo), batch, learning_rate, momentum, decay);
|
||||||
}
|
}
|
||||||
|
|
||||||
void forward_lstm_layer(layer l, network state)
|
void forward_lstm_layer(layer l, network state)
|
||||||
{
|
{
|
||||||
|
network s = { 0 };
|
||||||
|
s.train = state.train;
|
||||||
|
int i;
|
||||||
|
layer wf = *(l.wf);
|
||||||
|
layer wi = *(l.wi);
|
||||||
|
layer wg = *(l.wg);
|
||||||
|
layer wo = *(l.wo);
|
||||||
|
|
||||||
|
layer uf = *(l.uf);
|
||||||
|
layer ui = *(l.ui);
|
||||||
|
layer ug = *(l.ug);
|
||||||
|
layer uo = *(l.uo);
|
||||||
|
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1);
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1);
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1);
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1);
|
||||||
|
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1);
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1);
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1);
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1);
|
||||||
|
if (state.train) {
|
||||||
|
fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < l.steps; ++i) {
|
||||||
|
s.input = l.h_cpu;
|
||||||
|
forward_connected_layer(wf, s);
|
||||||
|
forward_connected_layer(wi, s);
|
||||||
|
forward_connected_layer(wg, s);
|
||||||
|
forward_connected_layer(wo, s);
|
||||||
|
|
||||||
|
s.input = state.input;
|
||||||
|
forward_connected_layer(uf, s);
|
||||||
|
forward_connected_layer(ui, s);
|
||||||
|
forward_connected_layer(ug, s);
|
||||||
|
forward_connected_layer(uo, s);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);
|
||||||
|
|
||||||
|
activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
activate_array(l.g_cpu, l.outputs*l.batch, TANH);
|
||||||
|
activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);
|
||||||
|
mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);
|
||||||
|
mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.c_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.h_cpu, 1);
|
||||||
|
activate_array(l.h_cpu, l.outputs*l.batch, TANH);
|
||||||
|
mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.h_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.cell_cpu, 1);
|
||||||
|
copy_cpu(l.outputs*l.batch, l.h_cpu, 1, l.output, 1);
|
||||||
|
|
||||||
|
state.input += l.inputs*l.batch;
|
||||||
|
l.output += l.outputs*l.batch;
|
||||||
|
l.cell_cpu += l.outputs*l.batch;
|
||||||
|
|
||||||
|
increment_layer(&wf, 1);
|
||||||
|
increment_layer(&wi, 1);
|
||||||
|
increment_layer(&wg, 1);
|
||||||
|
increment_layer(&wo, 1);
|
||||||
|
|
||||||
|
increment_layer(&uf, 1);
|
||||||
|
increment_layer(&ui, 1);
|
||||||
|
increment_layer(&ug, 1);
|
||||||
|
increment_layer(&uo, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void backward_lstm_layer(layer l, network state)
|
||||||
|
{
|
||||||
|
network s = { 0 };
|
||||||
|
s.train = state.train;
|
||||||
|
int i;
|
||||||
|
layer wf = *(l.wf);
|
||||||
|
layer wi = *(l.wi);
|
||||||
|
layer wg = *(l.wg);
|
||||||
|
layer wo = *(l.wo);
|
||||||
|
|
||||||
|
layer uf = *(l.uf);
|
||||||
|
layer ui = *(l.ui);
|
||||||
|
layer ug = *(l.ug);
|
||||||
|
layer uo = *(l.uo);
|
||||||
|
|
||||||
|
increment_layer(&wf, l.steps - 1);
|
||||||
|
increment_layer(&wi, l.steps - 1);
|
||||||
|
increment_layer(&wg, l.steps - 1);
|
||||||
|
increment_layer(&wo, l.steps - 1);
|
||||||
|
|
||||||
|
increment_layer(&uf, l.steps - 1);
|
||||||
|
increment_layer(&ui, l.steps - 1);
|
||||||
|
increment_layer(&ug, l.steps - 1);
|
||||||
|
increment_layer(&uo, l.steps - 1);
|
||||||
|
|
||||||
|
state.input += l.inputs*l.batch*(l.steps - 1);
|
||||||
|
if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
|
||||||
|
|
||||||
|
l.output += l.outputs*l.batch*(l.steps - 1);
|
||||||
|
l.cell_cpu += l.outputs*l.batch*(l.steps - 1);
|
||||||
|
l.delta += l.outputs*l.batch*(l.steps - 1);
|
||||||
|
|
||||||
|
for (i = l.steps - 1; i >= 0; --i) {
|
||||||
|
if (i != 0) copy_cpu(l.outputs*l.batch, l.cell_cpu - l.outputs*l.batch, 1, l.prev_cell_cpu, 1);
|
||||||
|
copy_cpu(l.outputs*l.batch, l.cell_cpu, 1, l.c_cpu, 1);
|
||||||
|
if (i != 0) copy_cpu(l.outputs*l.batch, l.output - l.outputs*l.batch, 1, l.prev_state_cpu, 1);
|
||||||
|
copy_cpu(l.outputs*l.batch, l.output, 1, l.h_cpu, 1);
|
||||||
|
|
||||||
|
l.dh_cpu = (i == 0) ? 0 : l.delta - l.outputs*l.batch;
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, wf.output, 1, l.f_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, uf.output, 1, l.f_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, wi.output, 1, l.i_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, ui.output, 1, l.i_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, wg.output, 1, l.g_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, ug.output, 1, l.g_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, wo.output, 1, l.o_cpu, 1);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, uo.output, 1, l.o_cpu, 1);
|
||||||
|
|
||||||
|
activate_array(l.f_cpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
activate_array(l.i_cpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
activate_array(l.g_cpu, l.outputs*l.batch, TANH);
|
||||||
|
activate_array(l.o_cpu, l.outputs*l.batch, LOGISTIC);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.delta, 1, l.temp3_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);
|
||||||
|
activate_array(l.temp_cpu, l.outputs*l.batch, TANH);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp2_cpu, 1);
|
||||||
|
mul_cpu(l.outputs*l.batch, l.o_cpu, 1, l.temp2_cpu, 1);
|
||||||
|
|
||||||
|
gradient_array(l.temp_cpu, l.outputs*l.batch, TANH, l.temp2_cpu);
|
||||||
|
axpy_cpu(l.outputs*l.batch, 1, l.dc_cpu, 1, l.temp2_cpu, 1);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.c_cpu, 1, l.temp_cpu, 1);
|
||||||
|
activate_array(l.temp_cpu, l.outputs*l.batch, TANH);
|
||||||
|
mul_cpu(l.outputs*l.batch, l.temp3_cpu, 1, l.temp_cpu, 1);
|
||||||
|
gradient_array(l.o_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wo.delta, 1);
|
||||||
|
s.input = l.prev_state_cpu;
|
||||||
|
s.delta = l.dh_cpu;
|
||||||
|
backward_connected_layer(wo, s);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uo.delta, 1);
|
||||||
|
s.input = state.input;
|
||||||
|
s.delta = state.delta;
|
||||||
|
backward_connected_layer(uo, s);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
|
||||||
|
mul_cpu(l.outputs*l.batch, l.i_cpu, 1, l.temp_cpu, 1);
|
||||||
|
gradient_array(l.g_cpu, l.outputs*l.batch, TANH, l.temp_cpu);
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wg.delta, 1);
|
||||||
|
s.input = l.prev_state_cpu;
|
||||||
|
s.delta = l.dh_cpu;
|
||||||
|
backward_connected_layer(wg, s);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ug.delta, 1);
|
||||||
|
s.input = state.input;
|
||||||
|
s.delta = state.delta;
|
||||||
|
backward_connected_layer(ug, s);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
|
||||||
|
mul_cpu(l.outputs*l.batch, l.g_cpu, 1, l.temp_cpu, 1);
|
||||||
|
gradient_array(l.i_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wi.delta, 1);
|
||||||
|
s.input = l.prev_state_cpu;
|
||||||
|
s.delta = l.dh_cpu;
|
||||||
|
backward_connected_layer(wi, s);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, ui.delta, 1);
|
||||||
|
s.input = state.input;
|
||||||
|
s.delta = state.delta;
|
||||||
|
backward_connected_layer(ui, s);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
|
||||||
|
mul_cpu(l.outputs*l.batch, l.prev_cell_cpu, 1, l.temp_cpu, 1);
|
||||||
|
gradient_array(l.f_cpu, l.outputs*l.batch, LOGISTIC, l.temp_cpu);
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, wf.delta, 1);
|
||||||
|
s.input = l.prev_state_cpu;
|
||||||
|
s.delta = l.dh_cpu;
|
||||||
|
backward_connected_layer(wf, s);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, uf.delta, 1);
|
||||||
|
s.input = state.input;
|
||||||
|
s.delta = state.delta;
|
||||||
|
backward_connected_layer(uf, s);
|
||||||
|
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp2_cpu, 1, l.temp_cpu, 1);
|
||||||
|
mul_cpu(l.outputs*l.batch, l.f_cpu, 1, l.temp_cpu, 1);
|
||||||
|
copy_cpu(l.outputs*l.batch, l.temp_cpu, 1, l.dc_cpu, 1);
|
||||||
|
|
||||||
|
state.input -= l.inputs*l.batch;
|
||||||
|
if (state.delta) state.delta -= l.inputs*l.batch;
|
||||||
|
l.output -= l.outputs*l.batch;
|
||||||
|
l.cell_cpu -= l.outputs*l.batch;
|
||||||
|
l.delta -= l.outputs*l.batch;
|
||||||
|
|
||||||
|
increment_layer(&wf, -1);
|
||||||
|
increment_layer(&wi, -1);
|
||||||
|
increment_layer(&wg, -1);
|
||||||
|
increment_layer(&wo, -1);
|
||||||
|
|
||||||
|
increment_layer(&uf, -1);
|
||||||
|
increment_layer(&ui, -1);
|
||||||
|
increment_layer(&ug, -1);
|
||||||
|
increment_layer(&uo, -1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GPU
|
#ifdef GPU
|
||||||
@ -163,13 +424,13 @@ void forward_lstm_layer_gpu(layer l, network state)
|
|||||||
}
|
}
|
||||||
|
|
||||||
for (i = 0; i < l.steps; ++i) {
|
for (i = 0; i < l.steps; ++i) {
|
||||||
s.input = l.h_gpu;
|
s.input_gpu = l.h_gpu;
|
||||||
forward_connected_layer_gpu(wf, s);
|
forward_connected_layer_gpu(wf, s);
|
||||||
forward_connected_layer_gpu(wi, s);
|
forward_connected_layer_gpu(wi, s);
|
||||||
forward_connected_layer_gpu(wg, s);
|
forward_connected_layer_gpu(wg, s);
|
||||||
forward_connected_layer_gpu(wo, s);
|
forward_connected_layer_gpu(wo, s);
|
||||||
|
|
||||||
s.input = state.input;
|
s.input_gpu = state.input_gpu;
|
||||||
forward_connected_layer_gpu(uf, s);
|
forward_connected_layer_gpu(uf, s);
|
||||||
forward_connected_layer_gpu(ui, s);
|
forward_connected_layer_gpu(ui, s);
|
||||||
forward_connected_layer_gpu(ug, s);
|
forward_connected_layer_gpu(ug, s);
|
||||||
@ -204,7 +465,7 @@ void forward_lstm_layer_gpu(layer l, network state)
|
|||||||
copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.c_gpu, 1, l.cell_gpu, 1);
|
||||||
copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.h_gpu, 1, l.output_gpu, 1);
|
||||||
|
|
||||||
state.input += l.inputs*l.batch;
|
state.input_gpu += l.inputs*l.batch;
|
||||||
l.output_gpu += l.outputs*l.batch;
|
l.output_gpu += l.outputs*l.batch;
|
||||||
l.cell_gpu += l.outputs*l.batch;
|
l.cell_gpu += l.outputs*l.batch;
|
||||||
|
|
||||||
@ -245,8 +506,8 @@ void backward_lstm_layer_gpu(layer l, network state)
|
|||||||
increment_layer(&ug, l.steps - 1);
|
increment_layer(&ug, l.steps - 1);
|
||||||
increment_layer(&uo, l.steps - 1);
|
increment_layer(&uo, l.steps - 1);
|
||||||
|
|
||||||
state.input += l.inputs*l.batch*(l.steps - 1);
|
state.input_gpu += l.inputs*l.batch*(l.steps - 1);
|
||||||
if (state.delta) state.delta += l.inputs*l.batch*(l.steps - 1);
|
if (state.delta_gpu) state.delta_gpu += l.inputs*l.batch*(l.steps - 1);
|
||||||
|
|
||||||
l.output_gpu += l.outputs*l.batch*(l.steps - 1);
|
l.output_gpu += l.outputs*l.batch*(l.steps - 1);
|
||||||
l.cell_gpu += l.outputs*l.batch*(l.steps - 1);
|
l.cell_gpu += l.outputs*l.batch*(l.steps - 1);
|
||||||
@ -293,60 +554,60 @@ void backward_lstm_layer_gpu(layer l, network state)
|
|||||||
mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);
|
mul_ongpu(l.outputs*l.batch, l.temp3_gpu, 1, l.temp_gpu, 1);
|
||||||
gradient_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
gradient_array_ongpu(l.o_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wo.delta_gpu, 1);
|
||||||
s.input = l.prev_state_gpu;
|
s.input_gpu = l.prev_state_gpu;
|
||||||
s.delta = l.dh_gpu;
|
s.delta_gpu = l.dh_gpu;
|
||||||
backward_connected_layer_gpu(wo, s);
|
backward_connected_layer_gpu(wo, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uo.delta_gpu, 1);
|
||||||
s.input = state.input;
|
s.input_gpu = state.input_gpu;
|
||||||
s.delta = state.delta;
|
s.delta_gpu = state.delta_gpu;
|
||||||
backward_connected_layer_gpu(uo, s);
|
backward_connected_layer_gpu(uo, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||||
mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
|
mul_ongpu(l.outputs*l.batch, l.i_gpu, 1, l.temp_gpu, 1);
|
||||||
gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
|
gradient_array_ongpu(l.g_gpu, l.outputs*l.batch, TANH, l.temp_gpu);
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wg.delta_gpu, 1);
|
||||||
s.input = l.prev_state_gpu;
|
s.input_gpu = l.prev_state_gpu;
|
||||||
s.delta = l.dh_gpu;
|
s.delta_gpu = l.dh_gpu;
|
||||||
backward_connected_layer_gpu(wg, s);
|
backward_connected_layer_gpu(wg, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ug.delta_gpu, 1);
|
||||||
s.input = state.input;
|
s.input_gpu = state.input_gpu;
|
||||||
s.delta = state.delta;
|
s.delta_gpu = state.delta_gpu;
|
||||||
backward_connected_layer_gpu(ug, s);
|
backward_connected_layer_gpu(ug, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||||
mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
|
mul_ongpu(l.outputs*l.batch, l.g_gpu, 1, l.temp_gpu, 1);
|
||||||
gradient_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
gradient_array_ongpu(l.i_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wi.delta_gpu, 1);
|
||||||
s.input = l.prev_state_gpu;
|
s.input_gpu = l.prev_state_gpu;
|
||||||
s.delta = l.dh_gpu;
|
s.delta_gpu = l.dh_gpu;
|
||||||
backward_connected_layer_gpu(wi, s);
|
backward_connected_layer_gpu(wi, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, ui.delta_gpu, 1);
|
||||||
s.input = state.input;
|
s.input_gpu = state.input_gpu;
|
||||||
s.delta = state.delta;
|
s.delta_gpu = state.delta_gpu;
|
||||||
backward_connected_layer_gpu(ui, s);
|
backward_connected_layer_gpu(ui, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||||
mul_ongpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
|
mul_ongpu(l.outputs*l.batch, l.prev_cell_gpu, 1, l.temp_gpu, 1);
|
||||||
gradient_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
gradient_array_ongpu(l.f_gpu, l.outputs*l.batch, LOGISTIC, l.temp_gpu);
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, wf.delta_gpu, 1);
|
||||||
s.input = l.prev_state_gpu;
|
s.input_gpu = l.prev_state_gpu;
|
||||||
s.delta = l.dh_gpu;
|
s.delta_gpu = l.dh_gpu;
|
||||||
backward_connected_layer_gpu(wf, s);
|
backward_connected_layer_gpu(wf, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, uf.delta_gpu, 1);
|
||||||
s.input = state.input;
|
s.input_gpu = state.input_gpu;
|
||||||
s.delta = state.delta;
|
s.delta_gpu = state.delta_gpu;
|
||||||
backward_connected_layer_gpu(uf, s);
|
backward_connected_layer_gpu(uf, s);
|
||||||
|
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp2_gpu, 1, l.temp_gpu, 1);
|
||||||
mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
|
mul_ongpu(l.outputs*l.batch, l.f_gpu, 1, l.temp_gpu, 1);
|
||||||
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);
|
copy_ongpu(l.outputs*l.batch, l.temp_gpu, 1, l.dc_gpu, 1);
|
||||||
|
|
||||||
state.input -= l.inputs*l.batch;
|
state.input_gpu -= l.inputs*l.batch;
|
||||||
if (state.delta) state.delta -= l.inputs*l.batch;
|
if (state.delta_gpu) state.delta_gpu -= l.inputs*l.batch;
|
||||||
l.output_gpu -= l.outputs*l.batch;
|
l.output_gpu -= l.outputs*l.batch;
|
||||||
l.cell_gpu -= l.outputs*l.batch;
|
l.cell_gpu -= l.outputs*l.batch;
|
||||||
l.delta_gpu -= l.outputs*l.batch;
|
l.delta_gpu -= l.outputs*l.batch;
|
||||||
|
@ -237,6 +237,7 @@ layer parse_gru(list *options, size_params params)
|
|||||||
int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
|
int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0);
|
||||||
|
|
||||||
layer l = make_gru_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize);
|
layer l = make_gru_layer(params.batch, params.inputs, output, params.time_steps, batch_normalize);
|
||||||
|
l.tanh = option_find_int_quiet(options, "tanh", 0);
|
||||||
|
|
||||||
return l;
|
return l;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user