mirror of
https://github.com/pjreddie/darknet.git
synced 2023-08-10 21:13:14 +03:00
Fixed NaN issue for training with CUDNN_HALF=1 by using Tensor Cores
This commit is contained in:
@ -91,8 +91,9 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
||||
|
||||
int init_w = net.w;
|
||||
int init_h = net.h;
|
||||
int iter_save;
|
||||
int iter_save, iter_save_last;
|
||||
iter_save = get_current_batch(net);
|
||||
iter_save_last = get_current_batch(net);
|
||||
|
||||
load_args args = {0};
|
||||
args.w = net.w;
|
||||
@ -210,7 +211,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
||||
|
||||
//if (i % 1000 == 0 || (i < 1000 && i % 100 == 0)) {
|
||||
//if (i % 100 == 0) {
|
||||
if(i >= (iter_save + 100)) {
|
||||
if(i >= (iter_save + 1000)) {
|
||||
iter_save = i;
|
||||
#ifdef GPU
|
||||
if (ngpus != 1) sync_nets(nets, ngpus, 0);
|
||||
@ -219,6 +220,16 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i
|
||||
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
|
||||
if (i >= (iter_save_last + 100)) {
|
||||
iter_save_last = i;
|
||||
#ifdef GPU
|
||||
if (ngpus != 1) sync_nets(nets, ngpus, 0);
|
||||
#endif
|
||||
char buff[256];
|
||||
sprintf(buff, "%s/%s_last.weights", backup_directory, base, i);
|
||||
save_weights(net, buff);
|
||||
}
|
||||
free_data(train);
|
||||
}
|
||||
#ifdef GPU
|
||||
|
Reference in New Issue
Block a user