From 60e952ba694e3e0811db5868d70ad7ebfe676836 Mon Sep 17 00:00:00 2001 From: Joseph Redmon Date: Sun, 26 Mar 2017 23:42:30 -0700 Subject: [PATCH] :eyeofthetiger::noseofthetiger::eyeofthetiger2: --- Makefile | 6 +- cfg/coco.data | 4 +- cfg/go.cfg | 135 +++++ cfg/go.test.cfg | 35 +- cfg/tiny-yolo-voc.cfg | 2 +- cfg/yolo-voc.2.0.cfg | 244 +++++++++ cfg/yolo-voc.cfg | 26 +- cfg/yolo.2.0.cfg | 244 +++++++++ cfg/yolo.cfg | 32 +- scripts/voc_label.py | 13 +- src/batchnorm_layer.c | 148 ++++-- src/blas.c | 33 +- src/blas.h | 8 +- src/blas_kernels.cu | 77 ++- src/box.c | 8 +- src/box.h | 2 +- src/classifier.c | 12 +- src/convolutional_kernels.cu | 52 +- src/convolutional_layer.c | 37 +- src/cost_layer.c | 15 + src/cuda.c | 9 + src/cuda.h | 1 + src/darknet.c | 24 +- src/data.c | 100 +++- src/data.h | 4 +- src/deconvolutional_kernels.cu | 129 +++-- src/deconvolutional_layer.c | 203 +++++--- src/deconvolutional_layer.h | 29 +- src/detection_layer.c | 8 +- src/detector.c | 202 ++++++- src/go.c | 326 +++++++----- src/image.c | 108 +++- src/image.h | 7 + src/layer.c | 2 - src/layer.h | 13 +- src/local_layer.c | 28 +- src/lsd.c | 924 +++++++++++++++++++++++++++++++++ src/matrix.c | 15 + src/matrix.h | 1 + src/network.c | 39 +- src/network.h | 4 + src/network_kernels.cu | 18 +- src/nightmare.c | 114 +++- src/parser.c | 51 +- src/parser.h | 2 +- src/region_layer.c | 248 +++++---- src/regressor.c | 261 ++++++++++ src/reorg_layer.c | 76 ++- src/reorg_layer.h | 2 +- src/softmax_layer.c | 40 +- src/super.c | 6 +- src/tree.c | 14 +- src/tree.h | 6 +- 53 files changed, 3486 insertions(+), 661 deletions(-) create mode 100644 cfg/go.cfg create mode 100644 cfg/yolo-voc.2.0.cfg create mode 100644 cfg/yolo.2.0.cfg create mode 100644 src/lsd.c create mode 100644 src/regressor.c diff --git a/Makefile b/Makefile index 3d3d5e43..f0de805f 100644 --- a/Makefile +++ b/Makefile @@ -10,7 +10,7 @@ ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \ -gencode arch=compute_52,code=[sm_52,compute_52] # This 
is what I use, uncomment if you know your arch and want to specify -# ARCH= -gencode arch=compute_52,code=compute_52 +ARCH= -gencode arch=compute_52,code=compute_52 VPATH=./src/ EXEC=darknet @@ -48,10 +48,10 @@ CFLAGS+= -DCUDNN LDFLAGS+= -lcudnn endif -OBJ=gemm.o utils.o cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o super.o voxel.o tree.o +OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o regressor.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o lsd.o super.o voxel.o tree.o ifeq ($(GPU), 1) LDFLAGS+= -lstdc++ -OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o +OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o 
maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o endif OBJS = $(addprefix $(OBJDIR), $(OBJ)) diff --git a/cfg/coco.data b/cfg/coco.data index 610151dc..30038417 100644 --- a/cfg/coco.data +++ b/cfg/coco.data @@ -1,7 +1,7 @@ classes= 80 train = /home/pjreddie/data/coco/trainvalno5k.txt -#valid = coco_testdev -valid = data/coco_val_5k.list +valid = coco_testdev +#valid = data/coco_val_5k.list names = data/coco.names backup = /home/pjreddie/backup/ eval=coco diff --git a/cfg/go.cfg b/cfg/go.cfg new file mode 100644 index 00000000..77d20c4d --- /dev/null +++ b/cfg/go.cfg @@ -0,0 +1,135 @@ +[net] +batch=512 +subdivisions=1 +height=19 +width=19 +channels=1 +momentum=0.9 +decay=0.0005 + +burn_in=1000 +learning_rate=0.1 +policy=poly +power=4 +max_batches=10000000 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + +[convolutional] +filters=256 +size=3 +stride=1 +pad=1 +activation=relu +batch_normalize=1 + 
+[convolutional] +filters=1 +size=1 +stride=1 +pad=1 +activation=linear + +[reorg] +extra=1 +stride=1 + +[softmax] + +[cost] +type=sse + diff --git a/cfg/go.test.cfg b/cfg/go.test.cfg index 6b92d335..ba15c2e5 100644 --- a/cfg/go.test.cfg +++ b/cfg/go.test.cfg @@ -7,13 +7,13 @@ channels=1 momentum=0.9 decay=0.0005 -learning_rate=0.1 +learning_rate=0.01 policy=poly power=4 -max_batches=400000 +max_batches=100000 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -21,7 +21,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -29,7 +29,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -37,7 +37,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -45,7 +45,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -53,7 +53,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -61,7 +61,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -69,7 +69,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -77,7 +77,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -85,7 +85,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -93,7 +93,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -101,7 +101,7 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 @@ -109,14 +109,13 @@ activation=relu batch_normalize=1 [convolutional] -filters=192 +filters=256 size=3 stride=1 pad=1 activation=relu batch_normalize=1 - [convolutional] filters=1 size=1 @@ -124,6 +123,10 @@ stride=1 pad=1 activation=linear 
+[reorg] +extra=1 +stride=1 + [softmax] [cost] diff --git a/cfg/tiny-yolo-voc.cfg b/cfg/tiny-yolo-voc.cfg index 1f33c35b..ab2c066a 100644 --- a/cfg/tiny-yolo-voc.cfg +++ b/cfg/tiny-yolo-voc.cfg @@ -12,7 +12,7 @@ exposure = 1.5 hue=.1 learning_rate=0.001 -max_batches = 40100 +max_batches = 40200 policy=steps steps=-1,100,20000,30000 scales=.1,10,.1,.1 diff --git a/cfg/yolo-voc.2.0.cfg b/cfg/yolo-voc.2.0.cfg new file mode 100644 index 00000000..ceb3f2ac --- /dev/null +++ b/cfg/yolo-voc.2.0.cfg @@ -0,0 +1,244 @@ +[net] +batch=64 +subdivisions=8 +height=416 +width=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.0001 +max_batches = 45000 +policy=steps +steps=100,25000,35000 +scales=10,.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 
+activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + + +####### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[route] +layers=-9 + +[reorg] +stride=2 + +[route] +layers=-1,-3 + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=125 +activation=linear + +[region] +anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 +bias_match=1 +classes=20 +coords=4 +num=5 +softmax=1 +jitter=.2 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=0 diff --git a/cfg/yolo-voc.cfg b/cfg/yolo-voc.cfg index ceb3f2ac..9f3e0d51 100644 --- a/cfg/yolo-voc.cfg +++ b/cfg/yolo-voc.cfg @@ -11,11 +11,12 @@ saturation = 1.5 exposure = 1.5 hue=.1 -learning_rate=0.0001 -max_batches = 45000 +learning_rate=0.001 +burn_in=1000 +max_batches = 80200 policy=steps -steps=100,25000,35000 -scales=10,.1,.1 +steps=40000,60000 +scales=.1,.1 [convolutional] batch_normalize=1 @@ -203,11 +204,19 @@ activation=leaky [route] layers=-9 +[convolutional] +batch_normalize=1 +size=1 +stride=1 +pad=1 +filters=64 +activation=leaky + [reorg] stride=2 [route] -layers=-1,-3 +layers=-1,-4 [convolutional] batch_normalize=1 @@ -224,14 +233,15 @@ pad=1 filters=125 
activation=linear + [region] -anchors = 1.08,1.19, 3.42,4.41, 6.63,11.38, 9.42,5.11, 16.62,10.52 +anchors = 1.3221, 1.73145, 3.19275, 4.00944, 5.05587, 8.09892, 9.47112, 4.84053, 11.2364, 10.0071 bias_match=1 classes=20 coords=4 num=5 softmax=1 -jitter=.2 +jitter=.3 rescore=1 object_scale=5 @@ -241,4 +251,4 @@ coord_scale=1 absolute=1 thresh = .6 -random=0 +random=1 diff --git a/cfg/yolo.2.0.cfg b/cfg/yolo.2.0.cfg new file mode 100644 index 00000000..fda339a2 --- /dev/null +++ b/cfg/yolo.2.0.cfg @@ -0,0 +1,244 @@ +[net] +batch=1 +subdivisions=1 +width=416 +height=416 +channels=3 +momentum=0.9 +decay=0.0005 +angle=0 +saturation = 1.5 +exposure = 1.5 +hue=.1 + +learning_rate=0.001 +max_batches = 120000 +policy=steps +steps=-1,100,80000,100000 +scales=.1,10,.1,.1 + +[convolutional] +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=64 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=128 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=256 +size=1 +stride=1 
+pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + + +####### + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[route] +layers=-9 + +[reorg] +stride=2 + +[route] +layers=-1,-3 + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=1024 +activation=leaky + +[convolutional] +size=1 +stride=1 +pad=1 +filters=425 +activation=linear + +[region] +anchors = 0.738768,0.874946, 2.42204,2.65704, 4.30971,7.04493, 10.246,4.59428, 12.6868,11.8741 +bias_match=1 +classes=80 +coords=4 +num=5 +softmax=1 +jitter=.2 +rescore=1 + +object_scale=5 +noobject_scale=1 +class_scale=1 +coord_scale=1 + +absolute=1 +thresh = .6 +random=0 diff --git a/cfg/yolo.cfg b/cfg/yolo.cfg index fda339a2..2d31027d 100644 --- a/cfg/yolo.cfg +++ b/cfg/yolo.cfg @@ -1,8 +1,8 @@ [net] -batch=1 -subdivisions=1 -width=416 -height=416 +batch=64 +subdivisions=8 +height=608 +width=608 channels=3 momentum=0.9 decay=0.0005 @@ -12,10 +12,11 @@ exposure = 1.5 hue=.1 learning_rate=0.001 -max_batches = 120000 +burn_in=1000 +max_batches = 500200 policy=steps -steps=-1,100,80000,100000 -scales=.1,10,.1,.1 +steps=400000,450000 +scales=.1,.1 [convolutional] batch_normalize=1 @@ -203,11 +204,19 @@ activation=leaky [route] layers=-9 +[convolutional] +batch_normalize=1 +size=1 +stride=1 +pad=1 +filters=64 
+activation=leaky + [reorg] stride=2 [route] -layers=-1,-3 +layers=-1,-4 [convolutional] batch_normalize=1 @@ -224,14 +233,15 @@ pad=1 filters=425 activation=linear + [region] -anchors = 0.738768,0.874946, 2.42204,2.65704, 4.30971,7.04493, 10.246,4.59428, 12.6868,11.8741 +anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828 bias_match=1 classes=80 coords=4 num=5 softmax=1 -jitter=.2 +jitter=.3 rescore=1 object_scale=5 @@ -241,4 +251,4 @@ coord_scale=1 absolute=1 thresh = .6 -random=0 +random=1 diff --git a/scripts/voc_label.py b/scripts/voc_label.py index d1e88236..679fc366 100644 --- a/scripts/voc_label.py +++ b/scripts/voc_label.py @@ -10,10 +10,10 @@ classes = ["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat" def convert(size, box): - dw = 1./size[0] - dh = 1./size[1] - x = (box[0] + box[1])/2.0 - y = (box[2] + box[3])/2.0 + dw = 1./(size[0]) + dh = 1./(size[1]) + x = (box[0] + box[1])/2.0 - 1 + y = (box[2] + box[3])/2.0 - 1 w = box[1] - box[0] h = box[3] - box[2] x = x*dw @@ -34,7 +34,7 @@ def convert_annotation(year, image_id): for obj in root.iter('object'): difficult = obj.find('difficult').text cls = obj.find('name').text - if cls not in classes or int(difficult) == 1: + if cls not in classes or int(difficult)==1: continue cls_id = classes.index(cls) xmlbox = obj.find('bndbox') @@ -54,3 +54,6 @@ for year, image_set in sets: convert_annotation(year, image_id) list_file.close() +os.system("cat 2007_train.txt 2007_val.txt 2012_train.txt 2012_val.txt > train.txt") +os.system("cat 2007_train.txt 2007_val.txt 2007_test.txt 2012_train.txt 2012_val.txt > train.all.txt") + diff --git a/src/batchnorm_layer.c b/src/batchnorm_layer.c index b53548bd..1be70aa0 100644 --- a/src/batchnorm_layer.c +++ b/src/batchnorm_layer.c @@ -1,3 +1,4 @@ +#include "convolutional_layer.h" #include "batchnorm_layer.h" #include "blas.h" #include @@ -5,55 +6,67 @@ layer make_batchnorm_layer(int batch, int w, int h, int c) { 
fprintf(stderr, "Batch Normalization Layer: %d x %d x %d image\n", w,h,c); - layer layer = {0}; - layer.type = BATCHNORM; - layer.batch = batch; - layer.h = layer.out_h = h; - layer.w = layer.out_w = w; - layer.c = layer.out_c = c; - layer.output = calloc(h * w * c * batch, sizeof(float)); - layer.delta = calloc(h * w * c * batch, sizeof(float)); - layer.inputs = w*h*c; - layer.outputs = layer.inputs; + layer l = {0}; + l.type = BATCHNORM; + l.batch = batch; + l.h = l.out_h = h; + l.w = l.out_w = w; + l.c = l.out_c = c; + l.output = calloc(h * w * c * batch, sizeof(float)); + l.delta = calloc(h * w * c * batch, sizeof(float)); + l.inputs = w*h*c; + l.outputs = l.inputs; - layer.scales = calloc(c, sizeof(float)); - layer.scale_updates = calloc(c, sizeof(float)); + l.scales = calloc(c, sizeof(float)); + l.scale_updates = calloc(c, sizeof(float)); + l.biases = calloc(c, sizeof(float)); + l.bias_updates = calloc(c, sizeof(float)); int i; for(i = 0; i < c; ++i){ - layer.scales[i] = 1; + l.scales[i] = 1; } - layer.mean = calloc(c, sizeof(float)); - layer.variance = calloc(c, sizeof(float)); + l.mean = calloc(c, sizeof(float)); + l.variance = calloc(c, sizeof(float)); - layer.rolling_mean = calloc(c, sizeof(float)); - layer.rolling_variance = calloc(c, sizeof(float)); + l.rolling_mean = calloc(c, sizeof(float)); + l.rolling_variance = calloc(c, sizeof(float)); - layer.forward = forward_batchnorm_layer; - layer.backward = backward_batchnorm_layer; + l.forward = forward_batchnorm_layer; + l.backward = backward_batchnorm_layer; #ifdef GPU - layer.forward_gpu = forward_batchnorm_layer_gpu; - layer.backward_gpu = backward_batchnorm_layer_gpu; + l.forward_gpu = forward_batchnorm_layer_gpu; + l.backward_gpu = backward_batchnorm_layer_gpu; - layer.output_gpu = cuda_make_array(layer.output, h * w * c * batch); - layer.delta_gpu = cuda_make_array(layer.delta, h * w * c * batch); + l.output_gpu = cuda_make_array(l.output, h * w * c * batch); + l.delta_gpu = cuda_make_array(l.delta, 
h * w * c * batch); - layer.scales_gpu = cuda_make_array(layer.scales, c); - layer.scale_updates_gpu = cuda_make_array(layer.scale_updates, c); + l.biases_gpu = cuda_make_array(l.biases, c); + l.bias_updates_gpu = cuda_make_array(l.bias_updates, c); - layer.mean_gpu = cuda_make_array(layer.mean, c); - layer.variance_gpu = cuda_make_array(layer.variance, c); + l.scales_gpu = cuda_make_array(l.scales, c); + l.scale_updates_gpu = cuda_make_array(l.scale_updates, c); - layer.rolling_mean_gpu = cuda_make_array(layer.mean, c); - layer.rolling_variance_gpu = cuda_make_array(layer.variance, c); + l.mean_gpu = cuda_make_array(l.mean, c); + l.variance_gpu = cuda_make_array(l.variance, c); - layer.mean_delta_gpu = cuda_make_array(layer.mean, c); - layer.variance_delta_gpu = cuda_make_array(layer.variance, c); + l.rolling_mean_gpu = cuda_make_array(l.mean, c); + l.rolling_variance_gpu = cuda_make_array(l.variance, c); - layer.x_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs); - layer.x_norm_gpu = cuda_make_array(layer.output, layer.batch*layer.outputs); + l.mean_delta_gpu = cuda_make_array(l.mean, c); + l.variance_delta_gpu = cuda_make_array(l.variance, c); + + l.x_gpu = cuda_make_array(l.output, l.batch*l.outputs); + l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.outputs); + #ifdef CUDNN + cudnnCreateTensorDescriptor(&l.normTensorDesc); + cudnnCreateTensorDescriptor(&l.dstTensorDesc); + cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); + cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); + + #endif #endif - return layer; + return l; } void backward_scale_cpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates) @@ -108,7 +121,7 @@ void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_del for(f = 0; f < filters; ++f){ for(k = 0; k < spatial; ++k){ int index = j*filters*spatial + f*spatial + k; - 
delta[index] = delta[index] * 1./(sqrt(variance[f]) + .00001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch); + delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch); } } } @@ -130,10 +143,10 @@ void forward_batchnorm_layer(layer l, network_state state) mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean); variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance); - scal_cpu(l.out_c, .9, l.rolling_mean, 1); - axpy_cpu(l.out_c, .1, l.mean, 1, l.rolling_mean, 1); - scal_cpu(l.out_c, .9, l.rolling_variance, 1); - axpy_cpu(l.out_c, .1, l.variance, 1, l.rolling_variance, 1); + scal_cpu(l.out_c, .99, l.rolling_mean, 1); + axpy_cpu(l.out_c, .01, l.mean, 1, l.rolling_mean, 1); + scal_cpu(l.out_c, .99, l.rolling_variance, 1); + axpy_cpu(l.out_c, .01, l.variance, 1, l.rolling_variance, 1); copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1); normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w); @@ -142,10 +155,12 @@ void forward_batchnorm_layer(layer l, network_state state) normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w); } scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w); + add_bias(l.output, l.biases, l.batch, l.out_c, l.out_h*l.out_w); } void backward_batchnorm_layer(const layer l, network_state state) { + backward_bias(l.bias_updates, l.delta, l.batch, l.out_c, l.out_w*l.out_h); backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates); scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w); @@ -179,6 +194,28 @@ void forward_batchnorm_layer_gpu(layer l, network_state state) l.out_h = l.out_w = 1; } if (state.train) { +#ifdef CUDNN + copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); + float one = 1; + float zero = 0; + 
cudnnBatchNormalizationForwardTraining(cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + l.dstTensorDesc, + l.x_gpu, + l.dstTensorDesc, + l.output_gpu, + l.normTensorDesc, + l.scales_gpu, + l.biases_gpu, + .01, + l.rolling_mean_gpu, + l.rolling_variance_gpu, + .00001, + l.mean_gpu, + l.variance_gpu); +#else fast_mean_gpu(l.output_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.mean_gpu); fast_variance_gpu(l.output_gpu, l.mean_gpu, l.batch, l.out_c, l.out_h*l.out_w, l.variance_gpu); @@ -190,15 +227,45 @@ void forward_batchnorm_layer_gpu(layer l, network_state state) copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_gpu, 1); normalize_gpu(l.output_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_h*l.out_w); copy_ongpu(l.outputs*l.batch, l.output_gpu, 1, l.x_norm_gpu, 1); + + scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h); +#endif } else { normalize_gpu(l.output_gpu, l.rolling_mean_gpu, l.rolling_variance_gpu, l.batch, l.out_c, l.out_h*l.out_w); + scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.out_c, l.out_w*l.out_h); } - scale_bias_gpu(l.output_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); } void backward_batchnorm_layer_gpu(const layer l, network_state state) { +#ifdef CUDNN + float one = 1; + float zero = 0; + cudnnBatchNormalizationBackward(cudnn_handle(), + CUDNN_BATCHNORM_SPATIAL, + &one, + &zero, + &one, + &one, + l.dstTensorDesc, + l.x_gpu, + l.dstTensorDesc, + l.delta_gpu, + l.dstTensorDesc, + l.x_norm_gpu, + l.normTensorDesc, + l.scales_gpu, + l.scale_updates_gpu, + l.bias_updates_gpu, + .00001, + l.mean_gpu, + l.variance_gpu); + copy_ongpu(l.outputs*l.batch, l.x_norm_gpu, 1, l.delta_gpu, 1); +#else + backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.out_c, l.out_w*l.out_h); backward_scale_gpu(l.x_norm_gpu, l.delta_gpu, l.batch, 
l.out_c, l.out_w*l.out_h, l.scale_updates_gpu); scale_bias_gpu(l.delta_gpu, l.scales_gpu, l.batch, l.out_c, l.out_h*l.out_w); @@ -206,6 +273,7 @@ void backward_batchnorm_layer_gpu(const layer l, network_state state) fast_mean_delta_gpu(l.delta_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta_gpu); fast_variance_delta_gpu(l.x_gpu, l.delta_gpu, l.mean_gpu, l.variance_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta_gpu); normalize_delta_gpu(l.x_gpu, l.mean_gpu, l.variance_gpu, l.mean_delta_gpu, l.variance_delta_gpu, l.batch, l.out_c, l.out_w*l.out_h, l.delta_gpu); +#endif if(l.type == BATCHNORM) copy_ongpu(l.outputs*l.batch, l.delta_gpu, 1, state.delta, 1); } #endif diff --git a/src/blas.c b/src/blas.c index 31bd86b2..1582f5fe 100644 --- a/src/blas.c +++ b/src/blas.c @@ -179,11 +179,21 @@ void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error) } else { error[i] = 2*abs_val - 1; - delta[i] = (diff < 0) ? -1 : 1; + delta[i] = (diff < 0) ? 1 : -1; } } } +void l1_cpu(int n, float *pred, float *truth, float *delta, float *error) +{ + int i; + for(i = 0; i < n; ++i){ + float diff = truth[i] - pred[i]; + error[i] = fabs(diff); + delta[i] = diff > 0 ? 
1 : -1; + } +} + void l2_cpu(int n, float *pred, float *truth, float *delta, float *error) { int i; @@ -202,21 +212,32 @@ float dot_cpu(int N, float *X, int INCX, float *Y, int INCY) return dot; } -void softmax(float *input, int n, float temp, float *output) +void softmax(float *input, int n, float temp, int stride, float *output) { int i; float sum = 0; float largest = -FLT_MAX; for(i = 0; i < n; ++i){ - if(input[i] > largest) largest = input[i]; + if(input[i*stride] > largest) largest = input[i*stride]; } for(i = 0; i < n; ++i){ - float e = exp(input[i]/temp - largest/temp); + float e = exp(input[i*stride]/temp - largest/temp); sum += e; - output[i] = e; + output[i*stride] = e; } for(i = 0; i < n; ++i){ - output[i] /= sum; + output[i*stride] /= sum; + } +} + + +void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output) +{ + int g, b; + for(b = 0; b < batch; ++b){ + for(g = 0; g < groups; ++g){ + softmax(input + b*batch_offset + g*group_offset, n, temp, stride, output + b*batch_offset + g*group_offset); + } } } diff --git a/src/blas.h b/src/blas.h index 3d6ee7d3..968bb955 100644 --- a/src/blas.h +++ b/src/blas.h @@ -33,9 +33,11 @@ void normalize_delta_cpu(float *x, float *mean, float *variance, float *mean_del void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error); void l2_cpu(int n, float *pred, float *truth, float *delta, float *error); +void l1_cpu(int n, float *pred, float *truth, float *delta, float *error); void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c); -void softmax(float *input, int n, float temp, float *output); +void softmax(float *input, int n, float temp, int stride, float *output); +void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output); #ifdef GPU #include "cuda.h" @@ -45,6 +47,7 @@ void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, 
int INCX, float void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY); void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); void scal_ongpu(int N, float ALPHA, float * X, int INCX); +void add_ongpu(int N, float ALPHA, float * X, int INCX); void supp_ongpu(int N, float ALPHA, float * X, int INCX); void mask_ongpu(int N, float * X, float mask_num, float * mask); void const_ongpu(int N, float ALPHA, float *X, int INCX); @@ -72,13 +75,14 @@ void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error); void l2_gpu(int n, float *pred, float *truth, float *delta, float *error); +void l1_gpu(int n, float *pred, float *truth, float *delta, float *error); void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc); void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c); void mult_add_into_gpu(int num, float *a, float *b, float *c); void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out); -void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output); +void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output); void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t); void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out); diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index d9401766..a833adbb 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -161,7 +161,7 @@ __global__ void normalize_kernel(int N, float *x, float *mean, float *variance, if (index >= N) return; int f = (index/spatial)%filters; - x[index] = (x[index] - mean[f])/(sqrt(variance[f]) + .000001f); + x[index] = (x[index] - mean[f])/(sqrt(variance[f] + .00001f)); } 
__global__ void normalize_delta_kernel(int N, float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta) @@ -170,7 +170,7 @@ __global__ void normalize_delta_kernel(int N, float *x, float *mean, float *vari if (index >= N) return; int f = (index/spatial)%filters; - delta[index] = delta[index] * 1./(sqrt(variance[f]) + .000001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch); + delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch); } extern "C" void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta) @@ -192,7 +192,7 @@ __global__ void variance_delta_kernel(float *x, float *delta, float *mean, floa variance_delta[i] += delta[index]*(x[index] - mean[i]); } } - variance_delta[i] *= -.5 * pow(variance[i] + .000001f, (float)(-3./2.)); + variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.)); } __global__ void accumulate_kernel(float *x, int n, int groups, float *sum) @@ -224,12 +224,14 @@ __global__ void fast_mean_delta_kernel(float *delta, float *variance, int batch, } } + __syncthreads(); + if(id == 0){ mean_delta[filter] = 0; for(i = 0; i < threads; ++i){ mean_delta[filter] += local[i]; } - mean_delta[filter] *= (-1./sqrt(variance[filter] + .000001f)); + mean_delta[filter] *= (-1./sqrt(variance[filter] + .00001f)); } } @@ -252,12 +254,14 @@ __global__ void fast_variance_delta_kernel(float *x, float *delta, float *mean, } } + __syncthreads(); + if(id == 0){ variance_delta[filter] = 0; for(i = 0; i < threads; ++i){ variance_delta[filter] += local[i]; } - variance_delta[filter] *= -.5 * pow(variance[filter] + .000001f, (float)(-3./2.)); + variance_delta[filter] *= -.5 * pow(variance[filter] + .00001f, (float)(-3./2.)); } } 
@@ -274,7 +278,7 @@ __global__ void mean_delta_kernel(float *delta, float *variance, int batch, int mean_delta[i] += delta[index]; } } - mean_delta[i] *= (-1./sqrt(variance[i] + .000001f)); + mean_delta[i] *= (-1./sqrt(variance[i] + .00001f)); } extern "C" void mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) @@ -391,6 +395,12 @@ __global__ void supp_kernel(int N, float ALPHA, float *X, int INCX) } } +__global__ void add_kernel(int N, float ALPHA, float *X, int INCX) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < N) X[i*INCX] += ALPHA; +} + __global__ void scal_kernel(int N, float ALPHA, float *X, int INCX) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; @@ -447,6 +457,8 @@ __global__ void fast_mean_kernel(float *x, int batch, int filters, int spatial, } } + __syncthreads(); + if(id == 0){ mean[filter] = 0; for(i = 0; i < threads; ++i){ @@ -475,6 +487,8 @@ __global__ void fast_variance_kernel(float *x, float *mean, int batch, int filt } } + __syncthreads(); + if(id == 0){ variance[filter] = 0; for(i = 0; i < threads; ++i){ @@ -593,6 +607,12 @@ extern "C" void constrain_ongpu(int N, float ALPHA, float * X, int INCX) } +extern "C" void add_ongpu(int N, float ALPHA, float * X, int INCX) +{ + add_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX); + check_error(cudaPeekAtLastError()); +} + extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX) { scal_kernel<<<cuda_gridsize(N), BLOCK>>>(N, ALPHA, X, INCX); @@ -658,7 +678,7 @@ __global__ void smooth_l1_kernel(int n, float *pred, float *truth, float *delta, } else { error[i] = 2*abs_val - 1; - delta[i] = (diff < 0) ? -1 : 1; + delta[i] = (diff > 0) ? 
1 : -1; } } } @@ -685,6 +705,23 @@ extern "C" void l2_gpu(int n, float *pred, float *truth, float *delta, float *er check_error(cudaPeekAtLastError()); } +__global__ void l1_kernel(int n, float *pred, float *truth, float *delta, float *error) +{ + int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(i < n){ + float diff = truth[i] - pred[i]; + error[i] = abs(diff); + delta[i] = (diff > 0) ? 1 : -1; + } +} + +extern "C" void l1_gpu(int n, float *pred, float *truth, float *delta, float *error) +{ + l1_kernel<<<cuda_gridsize(n), BLOCK>>>(n, pred, truth, delta, error); + check_error(cudaPeekAtLastError()); +} + + __global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float *c) @@ -732,36 +769,36 @@ extern "C" void mult_add_into_gpu(int num, float *a, float *b, float *c) } -__device__ void softmax_device(int n, float *input, float temp, float *output) +__device__ void softmax_device(float *input, int n, float temp, int stride, float *output) { int i; float sum = 0; float largest = -INFINITY; for(i = 0; i < n; ++i){ - int val = input[i]; + int val = input[i*stride]; largest = (val>largest) ? 
val : largest; } for(i = 0; i < n; ++i){ - float e = exp(input[i]/temp - largest/temp); + float e = exp(input[i*stride]/temp - largest/temp); sum += e; - output[i] = e; + output[i*stride] = e; } for(i = 0; i < n; ++i){ - output[i] /= sum; + output[i*stride] /= sum; } } -__global__ void softmax_kernel(int n, int offset, int batch, float *input, float temp, float *output) +__global__ void softmax_kernel(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output) { - int b = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; - if(b >= batch) return; - softmax_device(n, input + b*offset, temp, output + b*offset); + int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if (id >= batch*groups) return; + int b = id / groups; + int g = id % groups; + softmax_device(input + b*batch_offset + g*group_offset, n, temp, stride, output + b*batch_offset + g*group_offset); } -extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output) +extern "C" void softmax_gpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output) { - int inputs = n; - int batch = groups; - softmax_kernel<<<cuda_gridsize(batch), BLOCK>>>(inputs, offset, batch, input, temp, output); + softmax_kernel<<<cuda_gridsize(batch*groups), BLOCK>>>(input, n, batch, batch_offset, groups, group_offset, stride, temp, output); check_error(cudaPeekAtLastError()); } diff --git a/src/box.c b/src/box.c index 39dea067..88ca71ac 100644 --- a/src/box.c +++ b/src/box.c @@ -3,13 +3,13 @@ #include #include -box float_to_box(float *f) +box float_to_box(float *f, int stride) { box b; b.x = f[0]; - b.y = f[1]; - b.w = f[2]; - b.h = f[3]; + b.y = f[1*stride]; + b.w = f[2*stride]; + b.h = f[3*stride]; return b; } diff --git a/src/box.h b/src/box.h index c65589b9..49585ed4 100644 --- a/src/box.h +++ b/src/box.h @@ -9,7 +9,7 @@ typedef struct{ float dx, dy, dw, dh; } dbox; -box float_to_box(float *f); +box 
float_to_box(float *f, int stride); float box_iou(box a, box b); float box_rmse(box a, box b); dbox diou(box a, box b); diff --git a/src/classifier.c b/src/classifier.c index 586530aa..2ac0b756 100644 --- a/src/classifier.c +++ b/src/classifier.c @@ -379,7 +379,7 @@ void validate_classifier_10(char *datacfg, char *filename, char *weightfile) float *pred = calloc(classes, sizeof(float)); for(j = 0; j < 10; ++j){ float *p = network_predict(net, images[j].data); - if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1); + if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1, 1); axpy_cpu(classes, 1, p, 1, pred, 1); free_image(images[j]); } @@ -440,7 +440,7 @@ void validate_classifier_full(char *datacfg, char *filename, char *weightfile) //show_image(crop, "cropped"); //cvWaitKey(0); float *pred = network_predict(net, resized.data); - if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1); + if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1, 1); free_image(im); free_image(resized); @@ -502,7 +502,7 @@ void validate_classifier_single(char *datacfg, char *filename, char *weightfile) //show_image(crop, "cropped"); //cvWaitKey(0); float *pred = network_predict(net, crop.data); - if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1); + if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy, 1, 1); if(resized.data != im.data) free_image(resized); free_image(im); @@ -563,7 +563,7 @@ void validate_classifier_multi(char *datacfg, char *filename, char *weightfile) image r = resize_min(im, scales[j]); resize_network(&net, r.w, r.h); float *p = network_predict(net, r.data); - if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1); + if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy, 1 , 1); axpy_cpu(classes, 1, p, 1, pred, 1); flip_image(r); p = network_predict(net, r.data); @@ -703,7 +703,7 @@ void 
predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi float *X = r.data; time=clock(); float *predictions = network_predict(net, X); - if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 0); + if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 0, 1); top_k(predictions, net.outputs, top, indexes); printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); for(i = 0; i < top; ++i){ @@ -1084,7 +1084,7 @@ void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_ind show_image(in, "Classifier"); float *predictions = network_predict(net, in_s.data); - if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 1); + if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy, 1, 1); top_predictions(net, top, indexes); printf("\033[2J"); diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index fcaea031..9eb058ce 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -117,26 +117,70 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) if (l.batch_normalize) { forward_batchnorm_layer_gpu(l, state); + } else { + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); } - add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation); //if(l.dot > 0) dot_error_gpu(l); if(l.binary || l.xnor) swap_binary(&l); } +__global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, float rate, float *delta) +{ + int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; + if(id >= n) return; + + int j = id % w; + id /= w; + int i = id % h; + id /= h; + int k = id % c; + id /= c; + int b = id; + + int w_offset = -(size/2.); + int h_offset = -(size/2.); + + int out_index = j + w*(i + h*(k + c*b)); + int l, m; + for(l = 0; l < size; ++l){ + 
for(m = 0; m < size; ++m){ + int cur_h = h_offset + i + l; + int cur_w = w_offset + j + m; + int index = cur_w + w*(cur_h + h*(k + b*c)); + int valid = (cur_h >= 0 && cur_h < h && + cur_w >= 0 && cur_w < w); + delta[out_index] += valid ? rate*(x[index] - x[out_index]) : 0; + } + } +} + +extern "C" void smooth_layer(layer l, int size, float rate) +{ + int h = l.out_h; + int w = l.out_w; + int c = l.out_c; + + size_t n = h*w*c*l.batch; + + smooth_kernel<<>>(l.output_gpu, n, l.w, l.h, l.c, size, rate, l.delta_gpu); + check_error(cudaPeekAtLastError()); +} + void backward_convolutional_layer_gpu(convolutional_layer l, network_state state) { + if(l.smooth){ + smooth_layer(l, 5, l.smooth); + } //constrain_ongpu(l.outputs*l.batch, 1, l.delta_gpu, 1); gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); - backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); if(l.batch_normalize){ backward_batchnorm_layer_gpu(l, state); - //axpy_ongpu(l.outputs*l.batch, -state.net.decay, l.x_gpu, 1, l.delta_gpu, 1); } else { - //axpy_ongpu(l.outputs*l.batch, -state.net.decay, l.output_gpu, 1, l.delta_gpu, 1); + backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); } float *original_input = state.input; diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 37211ab7..04d21310 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -23,11 +23,11 @@ void swap_binary(convolutional_layer *l) l->weights = l->binary_weights; l->binary_weights = swap; - #ifdef GPU +#ifdef GPU swap = l->weights_gpu; l->weights_gpu = l->binary_weights_gpu; l->binary_weights_gpu = swap; - #endif +#endif } void binarize_weights(float *weights, int n, int size, float *binary) @@ -80,23 +80,15 @@ int convolutional_out_width(convolutional_layer l) image get_convolutional_image(convolutional_layer l) { - int h,w,c; - h = convolutional_out_height(l); - w = convolutional_out_width(l); - c = l.n; - 
return float_to_image(w,h,c,l.output); + return float_to_image(l.out_w,l.out_h,l.out_c,l.output); } image get_convolutional_delta(convolutional_layer l) { - int h,w,c; - h = convolutional_out_height(l); - w = convolutional_out_width(l); - c = l.n; - return float_to_image(w,h,c,l.delta); + return float_to_image(l.out_w,l.out_h,l.out_c,l.delta); } -size_t get_workspace_size(layer l){ +static size_t get_workspace_size(layer l){ #ifdef CUDNN if(gpu_index >= 0){ size_t most = 0; @@ -127,7 +119,7 @@ size_t get_workspace_size(layer l){ if (s > most) most = s; return most; } - #endif +#endif return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float); } @@ -141,6 +133,7 @@ void cudnn_convolutional_setup(layer *l) cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); + cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); cudnnSetFilter4dDescriptor(l->weightDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); cudnnSetConvolution2dDescriptor(l->convDesc, l->pad, l->pad, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION); cudnnGetConvolutionForwardAlgorithm(cudnn_handle(), @@ -198,8 +191,8 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int // float scale = 1./sqrt(size*size*c); float scale = sqrt(2./(size*size*c)); for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1, 1); - int out_h = convolutional_out_height(l); int out_w = convolutional_out_width(l); + int out_h = convolutional_out_height(l); l.out_h = out_h; l.out_w = out_w; l.out_c = n; @@ -291,6 +284,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.x_norm_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); } #ifdef CUDNN + cudnnCreateTensorDescriptor(&l.normTensorDesc); 
cudnnCreateTensorDescriptor(&l.srcTensorDesc); cudnnCreateTensorDescriptor(&l.dstTensorDesc); cudnnCreateFilterDescriptor(&l.weightDesc); @@ -426,8 +420,8 @@ void backward_bias(float *bias_updates, float *delta, int batch, int n, int size void forward_convolutional_layer(convolutional_layer l, network_state state) { - int out_h = convolutional_out_height(l); - int out_w = convolutional_out_width(l); + int out_h = l.out_h; + int out_w = l.out_w; int i; fill_cpu(l.outputs*l.batch, 0, l.output, 1); @@ -458,8 +452,9 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) if(l.batch_normalize){ forward_batchnorm_layer(l, state); + } else { + add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); } - add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); activate_array(l.output, m*n*l.batch, l.activation); if(l.binary || l.xnor) swap_binary(&l); @@ -470,14 +465,14 @@ void backward_convolutional_layer(convolutional_layer l, network_state state) int i; int m = l.n; int n = l.size*l.size*l.c; - int k = convolutional_out_height(l)* - convolutional_out_width(l); + int k = l.out_w*l.out_h; gradient_array(l.output, m*k*l.batch, l.activation, l.delta); - backward_bias(l.bias_updates, l.delta, l.batch, l.n, k); if(l.batch_normalize){ backward_batchnorm_layer(l, state); + } else { + backward_bias(l.bias_updates, l.delta, l.batch, l.n, k); } for(i = 0; i < l.batch; ++i){ diff --git a/src/cost_layer.c b/src/cost_layer.c index 39d2398b..320f7fe5 100644 --- a/src/cost_layer.c +++ b/src/cost_layer.c @@ -12,6 +12,7 @@ COST_TYPE get_cost_type(char *s) if (strcmp(s, "sse")==0) return SSE; if (strcmp(s, "masked")==0) return MASKED; if (strcmp(s, "smooth")==0) return SMOOTH; + if (strcmp(s, "L1")==0) return L1; fprintf(stderr, "Couldn't find cost type %s, going with SSE\n", s); return SSE; } @@ -25,6 +26,8 @@ char *get_cost_string(COST_TYPE a) return "masked"; case SMOOTH: return "smooth"; + case L1: + return "L1"; } return "sse"; } @@ -81,6 +84,8 @@ void 
forward_cost_layer(cost_layer l, network_state state) } if(l.cost_type == SMOOTH){ smooth_l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output); + }else if(l.cost_type == L1){ + l1_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output); } else { l2_cpu(l.batch*l.inputs, state.input, state.truth, l.delta, l.output); } @@ -116,12 +121,18 @@ int float_abs_compare (const void * a, const void * b) void forward_cost_layer_gpu(cost_layer l, network_state state) { if (!state.truth) return; + if(l.smooth){ + scal_ongpu(l.batch*l.inputs, (1-l.smooth), state.truth, 1); + add_ongpu(l.batch*l.inputs, l.smooth * 1./l.inputs, state.truth, 1); + } if (l.cost_type == MASKED) { mask_ongpu(l.batch*l.inputs, state.input, SECRET_NUM, state.truth); } if(l.cost_type == SMOOTH){ smooth_l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu); + } else if (l.cost_type == L1){ + l1_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu); } else { l2_gpu(l.batch*l.inputs, state.input, state.truth, l.delta_gpu, l.output_gpu); } @@ -136,6 +147,10 @@ void forward_cost_layer_gpu(cost_layer l, network_state state) supp_ongpu(l.batch*l.inputs, thresh, l.delta_gpu, 1); } + if(l.thresh){ + supp_ongpu(l.batch*l.inputs, l.thresh*1./l.inputs, l.delta_gpu, 1); + } + cuda_pull_array(l.output_gpu, l.output, l.batch*l.inputs); l.cost[0] = sum_array(l.output, l.batch*l.inputs); } diff --git a/src/cuda.c b/src/cuda.c index 1b51271f..af3d412b 100644 --- a/src/cuda.c +++ b/src/cuda.c @@ -157,4 +157,13 @@ void cuda_pull_array(float *x_gpu, float *x, size_t n) check_error(status); } +float cuda_mag_array(float *x_gpu, size_t n) +{ + float *temp = calloc(n, sizeof(float)); + cuda_pull_array(x_gpu, temp, n); + float m = mag_array(temp, n); + free(temp); + return m; +} + #endif diff --git a/src/cuda.h b/src/cuda.h index 29b1eefc..a825ded5 100644 --- a/src/cuda.h +++ b/src/cuda.h @@ -26,6 +26,7 @@ void cuda_free(float *x_gpu); void cuda_random(float 
*x_gpu, size_t n); float cuda_compare(float *x_gpu, float *x, size_t n, char *s); dim3 cuda_gridsize(size_t n); +float cuda_mag_array(float *x_gpu, size_t n); #ifdef CUDNN cudnnHandle_t cudnn_handle(); diff --git a/src/darknet.c b/src/darknet.c index 6e560728..f7b4c225 100644 --- a/src/darknet.c +++ b/src/darknet.c @@ -24,6 +24,7 @@ extern void run_nightmare(int argc, char **argv); extern void run_dice(int argc, char **argv); extern void run_compare(int argc, char **argv); extern void run_classifier(int argc, char **argv); +extern void run_regressor(int argc, char **argv); extern void run_char_rnn(int argc, char **argv); extern void run_vid_rnn(int argc, char **argv); extern void run_tag(int argc, char **argv); @@ -31,6 +32,7 @@ extern void run_cifar(int argc, char **argv); extern void run_go(int argc, char **argv); extern void run_art(int argc, char **argv); extern void run_super(int argc, char **argv); +extern void run_lsd(int argc, char **argv); void average(int argc, char *argv[]) { @@ -95,7 +97,7 @@ void speed(char *cfgfile, int tics) set_batch_network(&net, 1); int i; time_t start = time(0); - image im = make_image(net.w, net.h, net.c); + image im = make_image(net.w, net.h, net.c*net.batch); for(i = 0; i < tics; ++i){ network_predict(net, im.data); } @@ -150,12 +152,24 @@ void oneoff(char *cfgfile, char *weightfile, char *outfile) save_weights(net, outfile); } +void oneoff2(char *cfgfile, char *weightfile, char *outfile, int l) +{ + gpu_index = -1; + network net = parse_network_cfg(cfgfile); + if(weightfile){ + load_weights_upto(&net, weightfile, 0, net.n); + load_weights_upto(&net, weightfile, l, net.n); + } + *net.seen = 0; + save_weights_upto(net, outfile, net.n); +} + void partial(char *cfgfile, char *weightfile, char *outfile, int max) { gpu_index = -1; network net = parse_network_cfg(cfgfile); if(weightfile){ - load_weights_upto(&net, weightfile, max); + load_weights_upto(&net, weightfile, 0, max); } *net.seen = 0; save_weights_upto(net, outfile, max); 
@@ -380,6 +394,8 @@ int main(int argc, char **argv) run_voxel(argc, argv); } else if (0 == strcmp(argv[1], "super")){ run_super(argc, argv); + } else if (0 == strcmp(argv[1], "lsd")){ + run_lsd(argc, argv); } else if (0 == strcmp(argv[1], "detector")){ run_detector(argc, argv); } else if (0 == strcmp(argv[1], "detect")){ @@ -400,6 +416,8 @@ int main(int argc, char **argv) predict_classifier("cfg/imagenet1k.data", argv[2], argv[3], argv[4], 5); } else if (0 == strcmp(argv[1], "classifier")){ run_classifier(argc, argv); + } else if (0 == strcmp(argv[1], "regressor")){ + run_regressor(argc, argv); } else if (0 == strcmp(argv[1], "art")){ run_art(argc, argv); } else if (0 == strcmp(argv[1], "tag")){ @@ -436,6 +454,8 @@ int main(int argc, char **argv) speed(argv[2], (argc > 3 && argv[3]) ? atoi(argv[3]) : 0); } else if (0 == strcmp(argv[1], "oneoff")){ oneoff(argv[2], argv[3], argv[4]); + } else if (0 == strcmp(argv[1], "oneoff2")){ + oneoff2(argv[2], argv[3], argv[4], atoi(argv[5])); } else if (0 == strcmp(argv[1], "partial")){ partial(argv[2], argv[3], argv[4], atoi(argv[5])); } else if (0 == strcmp(argv[1], "average")){ diff --git a/src/data.c b/src/data.c index 05e5a91b..f17bd73f 100644 --- a/src/data.c +++ b/src/data.c @@ -317,7 +317,7 @@ void fill_truth_detection(char *path, int num_boxes, float *truth, int classes, h = boxes[i].h; id = boxes[i].id; - if ((w < .005 || h < .005)) continue; + if ((w < .001 || h < .001)) continue; truth[i*5+0] = x; truth[i*5+1] = y; @@ -393,7 +393,7 @@ void fill_truth(char *path, char **labels, int k, float *truth) ++count; } } - if(count != 1) printf("Too many or too few labels: %d, %s\n", count, path); + if(count != 1 && (k != 1 || count != 0)) printf("Too many or too few labels: %d, %s\n", count, path); } void fill_hierarchy(float *truth, int k, tree *hierarchy) @@ -428,6 +428,24 @@ void fill_hierarchy(float *truth, int k, tree *hierarchy) } } +matrix load_regression_labels_paths(char **paths, int n) +{ + matrix y = make_matrix(n, 
1); + int i; + for(i = 0; i < n; ++i){ + char labelpath[4096]; + find_replace(paths[i], "images", "targets", labelpath); + find_replace(labelpath, "JPEGImages", "targets", labelpath); + find_replace(labelpath, ".jpg", ".txt", labelpath); + find_replace(labelpath, ".png", ".txt", labelpath); + + FILE *file = fopen(labelpath, "r"); + fscanf(file, "%f", &(y.vals[i][0])); + fclose(file); + } + return y; +} + matrix load_labels_paths(char **paths, int n, char **labels, int k, tree *hierarchy) { matrix y = make_matrix(n, k); @@ -673,45 +691,44 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, in d.y = make_matrix(n, 5*boxes); for(i = 0; i < n; ++i){ image orig = load_image_color(random_paths[i], 0, 0); + image sized = make_image(w, h, orig.c); + fill_image(sized, .5); + + float dw = jitter * orig.w; + float dh = jitter * orig.h; - int oh = orig.h; - int ow = orig.w; + float new_ar = (orig.w + rand_uniform(-dw, dw)) / (orig.h + rand_uniform(-dh, dh)); + float scale = rand_uniform(.25, 2); - int dw = (ow*jitter); - int dh = (oh*jitter); + float nw, nh; + + if(new_ar < 1){ + nh = scale * h; + nw = nh * new_ar; + } else { + nw = scale * w; + nh = nw / new_ar; + } - int pleft = rand_uniform(-dw, dw); - int pright = rand_uniform(-dw, dw); - int ptop = rand_uniform(-dh, dh); - int pbot = rand_uniform(-dh, dh); + float dx = rand_uniform(0, w - nw); + float dy = rand_uniform(0, h - nh); - int swidth = ow - pleft - pright; - int sheight = oh - ptop - pbot; + place_image(orig, nw, nh, dx, dy, sized); - float sx = (float)swidth / ow; - float sy = (float)sheight / oh; - - int flip = rand()%2; - image cropped = crop_image(orig, pleft, ptop, swidth, sheight); - - float dx = ((float)pleft/ow)/sx; - float dy = ((float)ptop /oh)/sy; - - image sized = resize_image(cropped, w, h); - if(flip) flip_image(sized); random_distort_image(sized, hue, saturation, exposure); + int flip = rand()%2; + if(flip) flip_image(sized); d.X.vals[i] = sized.data; - 
fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, dx, dy, 1./sx, 1./sy); + + fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, -dx/w, -dy/h, nw/w, nh/h); free_image(orig); - free_image(cropped); } free(random_paths); return d; } - void *load_thread(void *ptr) { //printf("Loading data: %d\n", rand()); @@ -722,6 +739,8 @@ void *load_thread(void *ptr) if (a.type == OLD_CLASSIFICATION_DATA){ *a.d = load_data_old(a.paths, a.n, a.m, a.labels, a.classes, a.w, a.h); + } else if (a.type == REGRESSION_DATA){ + *a.d = load_data_regression(a.paths, a.n, a.m, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure); } else if (a.type == CLASSIFICATION_DATA){ *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure); } else if (a.type == SUPER_DATA){ @@ -739,6 +758,9 @@ void *load_thread(void *ptr) } else if (a.type == IMAGE_DATA){ *(a.im) = load_image_color(a.path, 0, 0); *(a.resized) = resize_image(*(a.im), a.w, a.h); + } else if (a.type == LETTERBOX_DATA){ + *(a.im) = load_image_color(a.path, 0, 0); + *(a.resized) = letterbox_image(*(a.im), a.w, a.h); } else if (a.type == TAG_DATA){ *a.d = load_data_tag(a.paths, a.n, a.m, a.classes, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure); } @@ -863,6 +885,17 @@ data load_data_super(char **paths, int n, int m, int w, int h, int scale) return d; } +data load_data_regression(char **paths, int n, int m, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure) +{ + if(m) paths = get_random_paths(paths, n, m); + data d = {0}; + d.shallow = 0; + d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure); + d.y = load_regression_labels_paths(paths, n); + if(m) free(paths); + return d; +} + data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, 
int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure) { if(m) paths = get_random_paths(paths, n, m); @@ -962,7 +995,6 @@ data load_cifar10_data(char *filename) X.vals[i][j] = (double)bytes[j+1]; } } - //translate_data_rows(d, -128); scale_data_rows(d, 1./255); //normalize_data_rows(d); fclose(fp); @@ -1029,7 +1061,6 @@ data load_all_cifar10() fclose(fp); } //normalize_data_rows(d); - //translate_data_rows(d, -128); scale_data_rows(d, 1./255); smooth_data(d); return d; @@ -1113,6 +1144,19 @@ void translate_data_rows(data d, float s) } } +data copy_data(data d) +{ + data c = {0}; + c.w = d.w; + c.h = d.h; + c.shallow = 0; + c.num_boxes = d.num_boxes; + c.boxes = d.boxes; + c.X = copy_matrix(d.X); + c.y = copy_matrix(d.y); + return c; +} + void normalize_data_rows(data d) { int i; diff --git a/src/data.h b/src/data.h index 3f6ef610..30e025c7 100644 --- a/src/data.h +++ b/src/data.h @@ -28,7 +28,7 @@ typedef struct{ } data; typedef enum { - CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA + CLASSIFICATION_DATA, DETECTION_DATA, CAPTCHA_DATA, REGION_DATA, IMAGE_DATA, COMPARE_DATA, WRITING_DATA, SWAG_DATA, TAG_DATA, OLD_CLASSIFICATION_DATA, STUDY_DATA, DET_DATA, SUPER_DATA, LETTERBOX_DATA, REGRESSION_DATA } data_type; typedef struct load_args{ @@ -83,6 +83,7 @@ data load_data_tag(char **paths, int n, int m, int k, int min, int max, int size matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure); data load_data_super(char **paths, int n, int m, int w, int h, int scale); data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure); +data load_data_regression(char **paths, 
int n, int m, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure); data load_go(char *filename); box_label *read_boxes(char *filename, int *n); @@ -106,5 +107,6 @@ data *split_data(data d, int part, int total); data concat_data(data d1, data d2); data concat_datas(data *d, int n); void fill_truth(char *path, char **labels, int k, float *truth); +data copy_data(data d); #endif diff --git a/src/deconvolutional_kernels.cu b/src/deconvolutional_kernels.cu index d6259fb3..381be23b 100644 --- a/src/deconvolutional_kernels.cu +++ b/src/deconvolutional_kernels.cu @@ -5,6 +5,7 @@ extern "C" { #include "convolutional_layer.h" #include "deconvolutional_layer.h" +#include "batchnorm_layer.h" #include "gemm.h" #include "blas.h" #include "im2col.h" @@ -13,97 +14,119 @@ extern "C" { #include "cuda.h" } -extern "C" void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state) +extern "C" void forward_deconvolutional_layer_gpu(layer l, network_state state) { int i; - int out_h = deconvolutional_out_height(layer); - int out_w = deconvolutional_out_width(layer); + int out_h = l.out_h; + int out_w = l.out_w; int size = out_h*out_w; - int m = layer.size*layer.size*layer.n; - int n = layer.h*layer.w; - int k = layer.c; + int m = l.size*l.size*l.n; + int n = l.h*l.w; + int k = l.c; - fill_ongpu(layer.outputs*layer.batch, 0, layer.output_gpu, 1); + fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); - for(i = 0; i < layer.batch; ++i){ - float *a = layer.weights_gpu; - float *b = state.input + i*layer.c*layer.h*layer.w; - float *c = layer.col_image_gpu; + for(i = 0; i < l.batch; ++i){ + float *a = l.weights_gpu; + float *b = state.input + i*l.c*l.h*l.w; + float *c = state.workspace; gemm_ongpu(1,0,m,n,k,1,a,m,b,n,0,c,n); - col2im_ongpu(c, layer.n, out_h, out_w, layer.size, layer.stride, 0, layer.output_gpu+i*layer.n*size); + col2im_ongpu(c, l.n, out_h, out_w, l.size, l.stride, l.pad, l.output_gpu+i*l.n*size); } - 
add_bias_gpu(layer.output_gpu, layer.biases_gpu, layer.batch, layer.n, size); - activate_array(layer.output_gpu, layer.batch*layer.n*size, layer.activation); + if (l.batch_normalize) { + forward_batchnorm_layer_gpu(l, state); + } else { + add_bias_gpu(l.output_gpu, l.biases_gpu, l.batch, l.n, l.out_w*l.out_h); + } + activate_array_ongpu(l.output_gpu, l.batch*l.n*size, l.activation); } -extern "C" void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state) +extern "C" void backward_deconvolutional_layer_gpu(layer l, network_state state) { - float alpha = 1./layer.batch; - int out_h = deconvolutional_out_height(layer); - int out_w = deconvolutional_out_width(layer); + int out_h = l.out_h; + int out_w = l.out_w; int size = out_h*out_w; int i; - gradient_array(layer.output_gpu, size*layer.n*layer.batch, layer.activation, layer.delta_gpu); - backward_bias(layer.bias_updates_gpu, layer.delta, layer.batch, layer.n, size); + gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu); - if(state.delta) memset(state.delta, 0, layer.batch*layer.h*layer.w*layer.c*sizeof(float)); + if(l.batch_normalize){ + backward_batchnorm_layer_gpu(l, state); + } else { + backward_bias_gpu(l.bias_updates_gpu, l.delta_gpu, l.batch, l.n, l.out_w*l.out_h); + } - for(i = 0; i < layer.batch; ++i){ - int m = layer.c; - int n = layer.size*layer.size*layer.n; - int k = layer.h*layer.w; + //if(state.delta) memset(state.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float)); + + for(i = 0; i < l.batch; ++i){ + int m = l.c; + int n = l.size*l.size*l.n; + int k = l.h*l.w; float *a = state.input + i*m*n; - float *b = layer.col_image_gpu; - float *c = layer.weight_updates_gpu; + float *b = state.workspace; + float *c = l.weight_updates_gpu; - im2col_ongpu(layer.delta_gpu + i*layer.n*size, layer.n, out_h, out_w, - layer.size, layer.stride, 0, b); - gemm_ongpu(0,1,m,n,k,alpha,a,k,b,k,1,c,n); + im2col_ongpu(l.delta_gpu + i*l.n*size, l.n, out_h, out_w, + l.size, 
l.stride, l.pad, b); + gemm_ongpu(0,1,m,n,k,1,a,k,b,k,1,c,n); if(state.delta){ - int m = layer.c; - int n = layer.h*layer.w; - int k = layer.size*layer.size*layer.n; + int m = l.c; + int n = l.h*l.w; + int k = l.size*l.size*l.n; - float *a = layer.weights_gpu; - float *b = layer.col_image_gpu; + float *a = l.weights_gpu; + float *b = state.workspace; float *c = state.delta + i*n*m; - gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); + gemm_ongpu(0,0,m,n,k,1,a,k,b,n,1,c,n); } } } -extern "C" void pull_deconvolutional_layer(deconvolutional_layer layer) +extern "C" void pull_deconvolutional_layer(layer l) { - cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size); - cuda_pull_array(layer.biases_gpu, layer.biases, layer.n); - cuda_pull_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size); - cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n); + cuda_pull_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size); + cuda_pull_array(l.biases_gpu, l.biases, l.n); + cuda_pull_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size); + cuda_pull_array(l.bias_updates_gpu, l.bias_updates, l.n); + if (l.batch_normalize){ + cuda_pull_array(l.scales_gpu, l.scales, l.n); + cuda_pull_array(l.rolling_mean_gpu, l.rolling_mean, l.n); + cuda_pull_array(l.rolling_variance_gpu, l.rolling_variance, l.n); + } } -extern "C" void push_deconvolutional_layer(deconvolutional_layer layer) +extern "C" void push_deconvolutional_layer(layer l) { - cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size); - cuda_push_array(layer.biases_gpu, layer.biases, layer.n); - cuda_push_array(layer.weight_updates_gpu, layer.weight_updates, layer.c*layer.n*layer.size*layer.size); - cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n); + cuda_push_array(l.weights_gpu, l.weights, l.c*l.n*l.size*l.size); + cuda_push_array(l.biases_gpu, l.biases, l.n); + 
cuda_push_array(l.weight_updates_gpu, l.weight_updates, l.c*l.n*l.size*l.size); + cuda_push_array(l.bias_updates_gpu, l.bias_updates, l.n); + if (l.batch_normalize){ + cuda_push_array(l.scales_gpu, l.scales, l.n); + cuda_push_array(l.rolling_mean_gpu, l.rolling_mean, l.n); + cuda_push_array(l.rolling_variance_gpu, l.rolling_variance, l.n); + } } -extern "C" void update_deconvolutional_layer_gpu(deconvolutional_layer layer, float learning_rate, float momentum, float decay) +void update_deconvolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay) { - int size = layer.size*layer.size*layer.c*layer.n; + int size = l.size*l.size*l.c*l.n; + axpy_ongpu(l.n, learning_rate/batch, l.bias_updates_gpu, 1, l.biases_gpu, 1); + scal_ongpu(l.n, momentum, l.bias_updates_gpu, 1); - axpy_ongpu(layer.n, learning_rate, layer.bias_updates_gpu, 1, layer.biases_gpu, 1); - scal_ongpu(layer.n, momentum, layer.bias_updates_gpu, 1); + if(l.scales_gpu){ + axpy_ongpu(l.n, learning_rate/batch, l.scale_updates_gpu, 1, l.scales_gpu, 1); + scal_ongpu(l.n, momentum, l.scale_updates_gpu, 1); + } - axpy_ongpu(size, -decay, layer.weights_gpu, 1, layer.weight_updates_gpu, 1); - axpy_ongpu(size, learning_rate, layer.weight_updates_gpu, 1, layer.weights_gpu, 1); - scal_ongpu(size, momentum, layer.weight_updates_gpu, 1); + axpy_ongpu(size, -decay*batch, l.weights_gpu, 1, l.weight_updates_gpu, 1); + axpy_ongpu(size, learning_rate/batch, l.weight_updates_gpu, 1, l.weights_gpu, 1); + scal_ongpu(size, momentum, l.weight_updates_gpu, 1); } diff --git a/src/deconvolutional_layer.c b/src/deconvolutional_layer.c index fbef9d58..7170975c 100644 --- a/src/deconvolutional_layer.c +++ b/src/deconvolutional_layer.c @@ -1,5 +1,6 @@ #include "deconvolutional_layer.h" #include "convolutional_layer.h" +#include "batchnorm_layer.h" #include "utils.h" #include "im2col.h" #include "col2im.h" @@ -8,45 +9,25 @@ #include #include -int deconvolutional_out_height(deconvolutional_layer l) -{ - int 
h = l.stride*(l.h - 1) + l.size; - return h; + +static size_t get_workspace_size(layer l){ + return (size_t)l.h*l.w*l.size*l.size*l.c*sizeof(float); } -int deconvolutional_out_width(deconvolutional_layer l) +int deconvolutional_out_height(layer l) { - int w = l.stride*(l.w - 1) + l.size; - return w; + return (l.h) * l.stride + l.size/2 - l.pad; } -int deconvolutional_out_size(deconvolutional_layer l) +int deconvolutional_out_width(layer l) { - return deconvolutional_out_height(l) * deconvolutional_out_width(l); + return (l.w) * l.stride + l.size/2 - l.pad; } -image get_deconvolutional_image(deconvolutional_layer l) -{ - int h,w,c; - h = deconvolutional_out_height(l); - w = deconvolutional_out_width(l); - c = l.n; - return float_to_image(w,h,c,l.output); -} - -image get_deconvolutional_delta(deconvolutional_layer l) -{ - int h,w,c; - h = deconvolutional_out_height(l); - w = deconvolutional_out_width(l); - c = l.n; - return float_to_image(w,h,c,l.delta); -} - -deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation) +layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation, int batch_normalize) { int i; - deconvolutional_layer l = {0}; + layer l = {0}; l.type = DECONVOLUTIONAL; l.h = h; @@ -67,71 +48,135 @@ deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, for(i = 0; i < n; ++i){ l.biases[i] = scale; } - int out_h = deconvolutional_out_height(l); - int out_w = deconvolutional_out_width(l); + l.pad = l.size/2; - l.out_h = out_h; - l.out_w = out_w; + l.out_h = (l.h) * l.stride + l.size/2 - l.pad; + l.out_w = (l.w) * l.stride + l.size/2 - l.pad; l.out_c = n; l.outputs = l.out_w * l.out_h * l.out_c; l.inputs = l.w * l.h * l.c; - l.col_image = calloc(h*w*size*size*n, sizeof(float)); - l.output = calloc(l.batch*out_h * out_w * n, sizeof(float)); - l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float)); + 
l.output = calloc(l.batch*l.out_h * l.out_w * n, sizeof(float)); + l.delta = calloc(l.batch*l.out_h * l.out_w * n, sizeof(float)); l.forward = forward_deconvolutional_layer; l.backward = backward_deconvolutional_layer; l.update = update_deconvolutional_layer; - #ifdef GPU - l.weights_gpu = cuda_make_array(l.weights, c*n*size*size); - l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size); + l.batch_normalize = batch_normalize; - l.biases_gpu = cuda_make_array(l.biases, n); - l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); + if(batch_normalize){ + l.scales = calloc(n, sizeof(float)); + l.scale_updates = calloc(n, sizeof(float)); + for(i = 0; i < n; ++i){ + l.scales[i] = 1; + } - l.col_image_gpu = cuda_make_array(l.col_image, h*w*size*size*n); - l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n); - l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); + l.mean = calloc(n, sizeof(float)); + l.variance = calloc(n, sizeof(float)); + + l.mean_delta = calloc(n, sizeof(float)); + l.variance_delta = calloc(n, sizeof(float)); + + l.rolling_mean = calloc(n, sizeof(float)); + l.rolling_variance = calloc(n, sizeof(float)); + l.x = calloc(l.batch*l.outputs, sizeof(float)); + l.x_norm = calloc(l.batch*l.outputs, sizeof(float)); + } + +#ifdef GPU + l.forward_gpu = forward_deconvolutional_layer_gpu; + l.backward_gpu = backward_deconvolutional_layer_gpu; + l.update_gpu = update_deconvolutional_layer_gpu; + + if(gpu_index >= 0){ + + l.weights_gpu = cuda_make_array(l.weights, c*n*size*size); + l.weight_updates_gpu = cuda_make_array(l.weight_updates, c*n*size*size); + + l.biases_gpu = cuda_make_array(l.biases, n); + l.bias_updates_gpu = cuda_make_array(l.bias_updates, n); + + l.delta_gpu = cuda_make_array(l.delta, l.batch*l.out_h*l.out_w*n); + l.output_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n); + + if(batch_normalize){ + l.mean_gpu = cuda_make_array(l.mean, n); + l.variance_gpu = cuda_make_array(l.variance, n); + + 
l.rolling_mean_gpu = cuda_make_array(l.mean, n); + l.rolling_variance_gpu = cuda_make_array(l.variance, n); + + l.mean_delta_gpu = cuda_make_array(l.mean, n); + l.variance_delta_gpu = cuda_make_array(l.variance, n); + + l.scales_gpu = cuda_make_array(l.scales, n); + l.scale_updates_gpu = cuda_make_array(l.scale_updates, n); + + l.x_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n); + l.x_norm_gpu = cuda_make_array(l.output, l.batch*l.out_h*l.out_w*n); + } + } + #ifdef CUDNN + cudnnCreateTensorDescriptor(&l.dstTensorDesc); + cudnnCreateTensorDescriptor(&l.normTensorDesc); + cudnnSetTensor4dDescriptor(l.dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l.batch, l.out_c, l.out_h, l.out_w); + cudnnSetTensor4dDescriptor(l.normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l.out_c, 1, 1); #endif +#endif l.activation = activation; + l.workspace_size = get_workspace_size(l); - fprintf(stderr, "Deconvolutional Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n); + fprintf(stderr, "deconv%5d %2d x%2d /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", n, size, size, stride, w, h, c, l.out_w, l.out_h, l.out_c); return l; } -void resize_deconvolutional_layer(deconvolutional_layer *l, int h, int w) +void resize_deconvolutional_layer(layer *l, int h, int w) { l->h = h; l->w = w; - int out_h = deconvolutional_out_height(*l); - int out_w = deconvolutional_out_width(*l); + l->out_h = (l->h) * l->stride + l->size/2 - l->pad; + l->out_w = (l->w) * l->stride + l->size/2 - l->pad; - l->col_image = realloc(l->col_image, - out_h*out_w*l->size*l->size*l->c*sizeof(float)); - l->output = realloc(l->output, - l->batch*out_h * out_w * l->n*sizeof(float)); - l->delta = realloc(l->delta, - l->batch*out_h * out_w * l->n*sizeof(float)); - #ifdef GPU - cuda_free(l->col_image_gpu); + l->outputs = l->out_h * l->out_w * l->out_c; + l->inputs = l->w * l->h * l->c; + + l->output = realloc(l->output, l->batch*l->outputs*sizeof(float)); + l->delta = 
realloc(l->delta, l->batch*l->outputs*sizeof(float)); + if(l->batch_normalize){ + l->x = realloc(l->x, l->batch*l->outputs*sizeof(float)); + l->x_norm = realloc(l->x_norm, l->batch*l->outputs*sizeof(float)); + } + +#ifdef GPU cuda_free(l->delta_gpu); cuda_free(l->output_gpu); - l->col_image_gpu = cuda_make_array(l->col_image, out_h*out_w*l->size*l->size*l->c); - l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n); - l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n); + l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs); + l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs); + + if(l->batch_normalize){ + cuda_free(l->x_gpu); + cuda_free(l->x_norm_gpu); + + l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs); + l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs); + } + #ifdef CUDNN + cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); + cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1); #endif +#endif + l->workspace_size = get_workspace_size(*l); } -void forward_deconvolutional_layer(const deconvolutional_layer l, network_state state) +void forward_deconvolutional_layer(const layer l, network_state state) { int i; - int out_h = deconvolutional_out_height(l); - int out_w = deconvolutional_out_width(l); + int out_h = l.out_h; + int out_w = l.out_w; int size = out_h*out_w; int m = l.size*l.size*l.n; @@ -143,17 +188,22 @@ void forward_deconvolutional_layer(const deconvolutional_layer l, network_state for(i = 0; i < l.batch; ++i){ float *a = l.weights; float *b = state.input + i*l.c*l.h*l.w; - float *c = l.col_image; + float *c = state.workspace; gemm(1,0,m,n,k,1,a,m,b,n,0,c,n); col2im_cpu(c, l.n, out_h, out_w, l.size, l.stride, 0, l.output+i*l.n*size); } - add_bias(l.output, l.biases, l.batch, l.n, size); + + if(l.batch_normalize){ + forward_batchnorm_layer(l, state); + } 
else { + add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w); + } activate_array(l.output, l.batch*l.n*size, l.activation); } -void backward_deconvolutional_layer(deconvolutional_layer l, network_state state) +void backward_deconvolutional_layer(layer l, network_state state) { float alpha = 1./l.batch; int out_h = deconvolutional_out_height(l); @@ -162,7 +212,11 @@ void backward_deconvolutional_layer(deconvolutional_layer l, network_state state int i; gradient_array(l.output, size*l.n*l.batch, l.activation, l.delta); - backward_bias(l.bias_updates, l.delta, l.batch, l.n, size); + if(l.batch_normalize){ + backward_batchnorm_layer(l, state); + } else { + backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h); + } for(i = 0; i < l.batch; ++i){ int m = l.c; @@ -170,7 +224,7 @@ void backward_deconvolutional_layer(deconvolutional_layer l, network_state state int k = l.h*l.w; float *a = state.input + i*m*n; - float *b = l.col_image; + float *b = state.workspace; float *c = l.weight_updates; im2col_cpu(l.delta + i*l.n*size, l.n, out_h, out_w, @@ -183,7 +237,7 @@ void backward_deconvolutional_layer(deconvolutional_layer l, network_state state int k = l.size*l.size*l.n; float *a = l.weights; - float *b = l.col_image; + float *b = state.workspace; float *c = state.delta + i*n*m; gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); @@ -191,14 +245,19 @@ void backward_deconvolutional_layer(deconvolutional_layer l, network_state state } } -void update_deconvolutional_layer(deconvolutional_layer l, float learning_rate, float momentum, float decay) +void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay) { int size = l.size*l.size*l.c*l.n; - axpy_cpu(l.n, learning_rate, l.bias_updates, 1, l.biases, 1); + axpy_cpu(l.n, learning_rate/batch, l.bias_updates, 1, l.biases, 1); scal_cpu(l.n, momentum, l.bias_updates, 1); - axpy_cpu(size, -decay, l.weights, 1, l.weight_updates, 1); - axpy_cpu(size, learning_rate, l.weight_updates, 1, 
l.weights, 1); + if(l.scales){ + axpy_cpu(l.n, learning_rate/batch, l.scale_updates, 1, l.scales, 1); + scal_cpu(l.n, momentum, l.scale_updates, 1); + } + + axpy_cpu(size, -decay*batch, l.weights, 1, l.weight_updates, 1); + axpy_cpu(size, learning_rate/batch, l.weight_updates, 1, l.weights, 1); scal_cpu(size, momentum, l.weight_updates, 1); } diff --git a/src/deconvolutional_layer.h b/src/deconvolutional_layer.h index 2d36e02a..6a57513e 100644 --- a/src/deconvolutional_layer.h +++ b/src/deconvolutional_layer.h @@ -7,28 +7,19 @@ #include "layer.h" #include "network.h" -typedef layer deconvolutional_layer; - #ifdef GPU -void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state); -void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state); -void update_deconvolutional_layer_gpu(deconvolutional_layer layer, float learning_rate, float momentum, float decay); -void push_deconvolutional_layer(deconvolutional_layer layer); -void pull_deconvolutional_layer(deconvolutional_layer layer); +void forward_deconvolutional_layer_gpu(layer l, network_state state); +void backward_deconvolutional_layer_gpu(layer l, network_state state); +void update_deconvolutional_layer_gpu(layer l, int batch, float learning_rate, float momentum, float decay); +void push_deconvolutional_layer(layer l); +void pull_deconvolutional_layer(layer l); #endif -deconvolutional_layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation); -void resize_deconvolutional_layer(deconvolutional_layer *layer, int h, int w); -void forward_deconvolutional_layer(const deconvolutional_layer layer, network_state state); -void update_deconvolutional_layer(deconvolutional_layer layer, float learning_rate, float momentum, float decay); -void backward_deconvolutional_layer(deconvolutional_layer layer, network_state state); - -image get_deconvolutional_image(deconvolutional_layer layer); -image 
get_deconvolutional_delta(deconvolutional_layer layer); -image get_deconvolutional_filter(deconvolutional_layer layer, int i); - -int deconvolutional_out_height(deconvolutional_layer layer); -int deconvolutional_out_width(deconvolutional_layer layer); +layer make_deconvolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, ACTIVATION activation, int batch_normalize); +void resize_deconvolutional_layer(layer *l, int h, int w); +void forward_deconvolutional_layer(const layer l, network_state state); +void update_deconvolutional_layer(layer l, int batch, float learning_rate, float momentum, float decay); +void backward_deconvolutional_layer(layer l, network_state state); #endif diff --git a/src/detection_layer.c b/src/detection_layer.c index cd98b4b4..ff0f4c2b 100644 --- a/src/detection_layer.c +++ b/src/detection_layer.c @@ -58,7 +58,7 @@ void forward_detection_layer(const detection_layer l, network_state state) int index = b*l.inputs; for (i = 0; i < locations; ++i) { int offset = i*l.classes; - softmax(l.output + index + offset, l.classes, 1, + softmax(l.output + index + offset, l.classes, 1, 1, l.output + index + offset); } } @@ -101,13 +101,13 @@ void forward_detection_layer(const detection_layer l, network_state state) avg_allcat += l.output[class_index+j]; } - box truth = float_to_box(state.truth + truth_index + 1 + l.classes); + box truth = float_to_box(state.truth + truth_index + 1 + l.classes, 1); truth.x /= l.side; truth.y /= l.side; for(j = 0; j < l.n; ++j){ int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords; - box out = float_to_box(l.output + box_index); + box out = float_to_box(l.output + box_index, 1); out.x /= l.side; out.y /= l.side; @@ -146,7 +146,7 @@ void forward_detection_layer(const detection_layer l, network_state state) int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords; int tbox_index = truth_index + 1 + l.classes; - box out = float_to_box(l.output + 
box_index); + box out = float_to_box(l.output + box_index, 1); out.x /= l.side; out.y /= l.side; if (l.sqrt) { diff --git a/src/detector.c b/src/detector.c index 1416c050..318e5d3c 100644 --- a/src/detector.c +++ b/src/detector.c @@ -6,6 +6,7 @@ #include "box.h" #include "demo.h" #include "option_list.h" +#include "blas.h" #ifdef OPENCV #include "opencv2/highgui/highgui_c.h" @@ -103,21 +104,28 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i load_thread = load_data(args); /* - int k; - for(k = 0; k < l.max_boxes; ++k){ - box b = float_to_box(train.y.vals[10] + 1 + k*5); - if(!b.x) break; - printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h); - } - image im = float_to_image(448, 448, 3, train.X.vals[10]); - int k; - for(k = 0; k < l.max_boxes; ++k){ - box b = float_to_box(train.y.vals[10] + 1 + k*5); - printf("%d %d %d %d\n", truth.x, truth.y, truth.w, truth.h); - draw_bbox(im, b, 8, 1,0,0); - } - save_image(im, "truth11"); - */ + int k; + for(k = 0; k < l.max_boxes; ++k){ + box b = float_to_box(train.y.vals[10] + 1 + k*5); + if(!b.x) break; + printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h); + } + */ + /* + int zz; + for(zz = 0; zz < train.X.cols; ++zz){ + image im = float_to_image(net.w, net.h, 3, train.X.vals[zz]); + int k; + for(k = 0; k < l.max_boxes; ++k){ + box b = float_to_box(train.y.vals[zz] + k*5); + printf("%f %f %f %f\n", b.x, b.y, b.w, b.h); + draw_bbox(im, b, 1, 1,0,0); + } + show_image(im, "truth11"); + cvWaitKey(0); + save_image(im, "truth11"); + } + */ printf("Loaded: %lf seconds\n", sec(clock()-time)); @@ -192,13 +200,13 @@ void print_detector_detections(FILE **fps, char *id, box *boxes, float **probs, { int i, j; for(i = 0; i < total; ++i){ - float xmin = boxes[i].x - boxes[i].w/2.; - float xmax = boxes[i].x + boxes[i].w/2.; - float ymin = boxes[i].y - boxes[i].h/2.; - float ymax = boxes[i].y + boxes[i].h/2.; + float xmin = boxes[i].x - boxes[i].w/2. + 1; + float xmax = boxes[i].x + boxes[i].w/2. 
+ 1; + float ymin = boxes[i].y - boxes[i].h/2. + 1; + float ymax = boxes[i].y + boxes[i].h/2. + 1; - if (xmin < 0) xmin = 0; - if (ymin < 0) ymin = 0; + if (xmin < 1) xmin = 1; + if (ymin < 1) ymin = 1; if (xmax > w) xmax = w; if (ymax > h) ymax = h; @@ -231,6 +239,142 @@ void print_imagenet_detections(FILE *fp, int id, box *boxes, float **probs, int } } +void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char *outfile) +{ + int j; + list *options = read_data_cfg(datacfg); + char *valid_images = option_find_str(options, "valid", "data/train.list"); + char *name_list = option_find_str(options, "names", "data/names.list"); + char *prefix = option_find_str(options, "results", "results"); + char **names = get_labels(name_list); + char *mapf = option_find_str(options, "map", 0); + int *map = 0; + if (mapf) map = read_map(mapf); + + network net = parse_network_cfg(cfgfile); + if(weightfile){ + load_weights(&net, weightfile); + } + set_batch_network(&net, 2); + fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); + srand(time(0)); + + list *plist = get_paths(valid_images); + char **paths = (char **)list_to_array(plist); + + layer l = net.layers[net.n-1]; + int classes = l.classes; + + char buff[1024]; + char *type = option_find_str(options, "eval", "voc"); + FILE *fp = 0; + FILE **fps = 0; + int coco = 0; + int imagenet = 0; + if(0==strcmp(type, "coco")){ + if(!outfile) outfile = "coco_results"; + snprintf(buff, 1024, "%s/%s.json", prefix, outfile); + fp = fopen(buff, "w"); + fprintf(fp, "[\n"); + coco = 1; + } else if(0==strcmp(type, "imagenet")){ + if(!outfile) outfile = "imagenet-detection"; + snprintf(buff, 1024, "%s/%s.txt", prefix, outfile); + fp = fopen(buff, "w"); + imagenet = 1; + classes = 200; + } else { + if(!outfile) outfile = "comp4_det_test_"; + fps = calloc(classes, sizeof(FILE *)); + for(j = 0; j < classes; ++j){ + snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, 
names[j]); + fps[j] = fopen(buff, "w"); + } + } + + + box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); + float **probs = calloc(l.w*l.h*l.n, sizeof(float *)); + for(j = 0; j < l.w*l.h*l.n; ++j) probs[j] = calloc(classes, sizeof(float *)); + + int m = plist->size; + int i=0; + int t; + + float thresh = .005; + float nms = .45; + + int nthreads = 4; + image *val = calloc(nthreads, sizeof(image)); + image *val_resized = calloc(nthreads, sizeof(image)); + image *buf = calloc(nthreads, sizeof(image)); + image *buf_resized = calloc(nthreads, sizeof(image)); + pthread_t *thr = calloc(nthreads, sizeof(pthread_t)); + + image input = make_image(net.w, net.h, net.c*2); + + load_args args = {0}; + args.w = net.w; + args.h = net.h; + //args.type = IMAGE_DATA; + args.type = LETTERBOX_DATA; + + for(t = 0; t < nthreads; ++t){ + args.path = paths[i+t]; + args.im = &buf[t]; + args.resized = &buf_resized[t]; + thr[t] = load_data_in_thread(args); + } + time_t start = time(0); + for(i = nthreads; i < m+nthreads; i += nthreads){ + fprintf(stderr, "%d\n", i); + for(t = 0; t < nthreads && i+t-nthreads < m; ++t){ + pthread_join(thr[t], 0); + val[t] = buf[t]; + val_resized[t] = buf_resized[t]; + } + for(t = 0; t < nthreads && i+t < m; ++t){ + args.path = paths[i+t]; + args.im = &buf[t]; + args.resized = &buf_resized[t]; + thr[t] = load_data_in_thread(args); + } + for(t = 0; t < nthreads && i+t-nthreads < m; ++t){ + char *path = paths[i+t-nthreads]; + char *id = basecfg(path); + copy_cpu(net.w*net.h*net.c, val_resized[t].data, 1, input.data, 1); + flip_image(val_resized[t]); + copy_cpu(net.w*net.h*net.c, val_resized[t].data, 1, input.data + net.w*net.h*net.c, 1); + + network_predict(net, input.data); + int w = val[t].w; + int h = val[t].h; + get_region_boxes(l, w, h, thresh, probs, boxes, 0, map, .5); + if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, classes, nms); + if (coco){ + print_cocos(fp, path, boxes, probs, l.w*l.h*l.n, classes, w, h); + } else if (imagenet){ + 
print_imagenet_detections(fp, i+t-nthreads+1, boxes, probs, l.w*l.h*l.n, classes, w, h); + } else { + print_detector_detections(fps, id, boxes, probs, l.w*l.h*l.n, classes, w, h); + } + free(id); + free_image(val[t]); + free_image(val_resized[t]); + } + } + for(j = 0; j < classes; ++j){ + if(fps) fclose(fps[j]); + } + if(coco){ + fseek(fp, -2, SEEK_CUR); + fprintf(fp, "\n]\n"); + fclose(fp); + } + fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start)); +} + + void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *outfile) { int j; @@ -306,7 +450,8 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out load_args args = {0}; args.w = net.w; args.h = net.h; - args.type = IMAGE_DATA; + //args.type = IMAGE_DATA; + args.type = LETTERBOX_DATA; for(t = 0; t < nthreads; ++t){ args.path = paths[i+t]; @@ -467,7 +612,11 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam strtok(input, "\n"); } image im = load_image_color(input,0,0); - image sized = resize_image(im, net.w, net.h); + image sized = letterbox_image(im, net.w, net.h); + //image sized = resize_image(im, net.w, net.h); + //image sized2 = resize_max(im, net.w); + //image sized = crop_image(sized2, -((net.w - sized2.w)/2), -((net.h - sized2.h)/2), net.w, net.h); + //resize_network(&net, sized.w, sized.h); layer l = net.layers[net.n-1]; box *boxes = calloc(l.w*l.h*l.n, sizeof(box)); @@ -481,9 +630,9 @@ void test_detector(char *datacfg, char *cfgfile, char *weightfile, char *filenam get_region_boxes(l, 1, 1, thresh, probs, boxes, 0, 0, hier_thresh); if (l.softmax_tree && nms) do_nms_obj(boxes, probs, l.w*l.h*l.n, l.classes, nms); else if (nms) do_nms_sort(boxes, probs, l.w*l.h*l.n, l.classes, nms); - draw_detections(im, l.w*l.h*l.n, thresh, boxes, probs, names, alphabet, l.classes); - save_image(im, "predictions"); - show_image(im, "predictions"); + draw_detections(sized, l.w*l.h*l.n, thresh, boxes, probs, names, 
alphabet, l.classes); + save_image(sized, "predictions"); + show_image(sized, "predictions"); free_image(im); free_image(sized); @@ -541,6 +690,7 @@ void run_detector(int argc, char **argv) if(0==strcmp(argv[2], "test")) test_detector(datacfg, cfg, weights, filename, thresh, hier_thresh); else if(0==strcmp(argv[2], "train")) train_detector(datacfg, cfg, weights, gpus, ngpus, clear); else if(0==strcmp(argv[2], "valid")) validate_detector(datacfg, cfg, weights, outfile); + else if(0==strcmp(argv[2], "valid2")) validate_detector_flip(datacfg, cfg, weights, outfile); else if(0==strcmp(argv[2], "recall")) validate_detector_recall(cfg, weights); else if(0==strcmp(argv[2], "demo")) { list *options = read_data_cfg(datacfg); diff --git a/src/go.c b/src/go.c index 89297b5e..39610aa8 100644 --- a/src/go.c +++ b/src/go.c @@ -3,6 +3,8 @@ #include "parser.h" #include "option_list.h" #include "blas.h" +#include "data.h" +#include #ifdef OPENCV #include "opencv2/highgui/highgui_c.h" @@ -10,7 +12,7 @@ int inverted = 1; int noi = 1; -static const int nind = 5; +static const int nind = 2; typedef struct { char **data; @@ -88,22 +90,30 @@ void board_to_string(char *s, float *board) } } -void random_go_moves(moves m, float *boards, float *labels, int n) +data random_go_moves(moves m, int n) { + data d = {0}; + d.X = make_matrix(n, 19*19); + d.y = make_matrix(n, 19*19+1); int i; - memset(labels, 0, 19*19*n*sizeof(float)); for(i = 0; i < n; ++i){ + float *board = d.X.vals[i]; + float *label = d.y.vals[i]; char *b = m.data[rand()%m.n]; int row = b[0]; int col = b[1]; - labels[col + 19*(row + i*19)] = 1; - string_to_board(b+2, boards+i*19*19); - boards[col + 19*(row + i*19)] = 0; + if(row >= 19 || col >= 19){ + label[19*19] = 1; + } else { + label[col + 19*row] = 1; + string_to_board(b+2, board); + if(board[col + 19*row]) printf("hey\n"); + } int flip = rand()%2; int rotate = rand()%4; - image in = float_to_image(19, 19, 1, boards+i*19*19); - image out = float_to_image(19, 19, 1, 
labels+i*19*19); + image in = float_to_image(19, 19, 1, board); + image out = float_to_image(19, 19, 1, label); if(flip){ flip_image(in); flip_image(out); @@ -111,36 +121,60 @@ void random_go_moves(moves m, float *boards, float *labels, int n) rotate_image_cw(in, rotate); rotate_image_cw(out, rotate); } + return d; } -void train_go(char *cfgfile, char *weightfile) +void train_go(char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear) { - srand(time(0)); + int i; float avg_loss = -1; char *base = basecfg(cfgfile); printf("%s\n", base); - network net = parse_network_cfg(cfgfile); - if(weightfile){ - load_weights(&net, weightfile); + printf("%d\n", ngpus); + network *nets = calloc(ngpus, sizeof(network)); + + srand(time(0)); + int seed = rand(); + for(i = 0; i < ngpus; ++i){ + srand(seed); +#ifdef GPU + cuda_set_device(gpus[i]); +#endif + nets[i] = load_network(cfgfile, weightfile, clear); + nets[i].learning_rate *= ngpus; } + network net = nets[0]; printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); char *backup_directory = "/home/pjreddie/backup/"; char buff[256]; - float *board = calloc(19*19*net.batch, sizeof(float)); - float *move = calloc(19*19*net.batch, sizeof(float)); moves m = load_go_moves("/home/pjreddie/backup/go.train"); //moves m = load_go_moves("games.txt"); int N = m.n; + printf("Moves: %d\n", N); int epoch = (*net.seen)/N; while(get_current_batch(net) < net.max_batches || net.max_batches == 0){ clock_t time=clock(); - random_go_moves(m, board, move, net.batch); - float loss = train_network_datum(net, board, move) / net.batch; + data train = random_go_moves(m, net.batch*net.subdivisions*ngpus); + printf("Loaded: %lf seconds\n", sec(clock()-time)); + time=clock(); + + float loss = 0; +#ifdef GPU + if(ngpus == 1){ + loss = train_network(net, train); + } else { + loss = train_networks(nets, ngpus, train, 4); + } +#else + loss = train_network(net, train); +#endif + free_data(train); + if(avg_loss 
== -1) avg_loss = loss; avg_loss = avg_loss*.95 + loss*.05; printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); @@ -151,7 +185,7 @@ void train_go(char *cfgfile, char *weightfile) save_weights(net, buff); } - if(get_current_batch(net)%100 == 0){ + if(get_current_batch(net)%1000 == 0){ char buff[256]; sprintf(buff, "%s/%s.backup",backup_directory,base); save_weights(net, buff); @@ -204,12 +238,9 @@ int *calculate_liberties(float *board) return lib; } -void print_board(float *board, int swap, int *indexes) +void print_board(FILE *stream, float *board, int swap, int *indexes) { - //FILE *stream = stdout; - FILE *stream = stderr; int i,j,n; - fprintf(stream, "\n\n"); fprintf(stream, " "); for(i = 0; i < 19; ++i){ fprintf(stream, "%c ", 'A' + i + 1*(i > 7 && noi)); @@ -225,12 +256,12 @@ void print_board(float *board, int swap, int *indexes) if(index == indexes[n]){ found = 1; /* - if(n == 0) fprintf(stream, "\uff11"); - else if(n == 1) fprintf(stream, "\uff12"); - else if(n == 2) fprintf(stream, "\uff13"); - else if(n == 3) fprintf(stream, "\uff14"); - else if(n == 4) fprintf(stream, "\uff15"); - */ + if(n == 0) fprintf(stream, "\uff11"); + else if(n == 1) fprintf(stream, "\uff12"); + else if(n == 2) fprintf(stream, "\uff13"); + else if(n == 3) fprintf(stream, "\uff14"); + else if(n == 4) fprintf(stream, "\uff15"); + */ if(n == 0) fprintf(stream, " 1"); else if(n == 1) fprintf(stream, " 2"); else if(n == 2) fprintf(stream, " 3"); @@ -261,7 +292,7 @@ void flip_board(float *board) void predict_move(network net, float *board, float *move, int multi) { float *output = network_predict(net, board); - copy_cpu(19*19, output, 1, move, 1); + copy_cpu(19*19+1, output, 1, move, 1); int i; if(multi){ image bim = float_to_image(19, 19, 1, board); @@ -275,12 +306,12 @@ void predict_move(network net, float *board, float *move, int multi) if(i >= 4) 
flip_image(oim); rotate_image_cw(oim, -i); - axpy_cpu(19*19, 1, output, 1, move, 1); + axpy_cpu(19*19+1, 1, output, 1, move, 1); if(i >= 4) flip_image(bim); rotate_image_cw(bim, -i); } - scal_cpu(19*19, 1./8., move, 1); + scal_cpu(19*19+1, 1./8., move, 1); } for(i = 0; i < 19*19; ++i){ if(board[i]) move[i] = 0; @@ -350,14 +381,24 @@ int legal_go(float *b, char *ko, int p, int r, int c) int generate_move(network net, int player, float *board, int multi, float thresh, float temp, char *ko, int print) { int i, j; + int empty = 1; + for(i = 0; i < 19*19; ++i){ + if (board[i]) { + empty = 0; + break; + } + } + if(empty) { + return 72; + } for(i = 0; i < net.n; ++i) net.layers[i].temperature = temp; - float move[361]; + float move[362]; if (player < 0) flip_board(board); predict_move(net, board, move, multi); if (player < 0) flip_board(board); - + for(i = 0; i < 19; ++i){ for(j = 0; j < 19; ++j){ if (!legal_go(board, ko, player, i, j)) move[i*19 + j] = 0; @@ -365,40 +406,43 @@ int generate_move(network net, int player, float *board, int multi, float thresh } int indexes[nind]; - top_k(move, 19*19, nind, indexes); + top_k(move, 19*19+1, nind, indexes); if(thresh > move[indexes[0]]) thresh = move[indexes[nind-1]]; - for(i = 0; i < 19; ++i){ - for(j = 0; j < 19; ++j){ - if (move[i*19 + j] < thresh) move[i*19 + j] = 0; - } + for(i = 0; i < 19*19+1; ++i){ + if (move[i] < thresh) move[i] = 0; } - int max = max_index(move, 19*19); + int max = max_index(move, 19*19+1); int row = max / 19; int col = max % 19; - int index = sample_array(move, 19*19); + int index = sample_array(move, 19*19+1); if(print){ - top_k(move, 19*19, nind, indexes); + top_k(move, 19*19+1, nind, indexes); for(i = 0; i < nind; ++i){ if (!move[indexes[i]]) indexes[i] = -1; } - print_board(board, player, indexes); + print_board(stderr, board, player, indexes); for(i = 0; i < nind; ++i){ fprintf(stderr, "%d: %f\n", i+1, move[indexes[i]]); } } + if (row == 19) return -1; - if(suicide_go(board, player, row, col)){ 
+ if (suicide_go(board, player, row, col)){ return -1; } - if(suicide_go(board, player, index/19, index%19)) index = max; + + if (suicide_go(board, player, index/19, index%19)){ + index = max; + } + if (index == 19*19) return -1; return index; } -void valid_go(char *cfgfile, char *weightfile, int multi) +void valid_go(char *cfgfile, char *weightfile, int multi, char *filename) { srand(time(0)); char *base = basecfg(cfgfile); @@ -411,8 +455,9 @@ void valid_go(char *cfgfile, char *weightfile, int multi) printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); float *board = calloc(19*19, sizeof(float)); - float *move = calloc(19*19, sizeof(float)); - moves m = load_go_moves("/home/pjreddie/backup/go.test"); + float *move = calloc(19*19+1, sizeof(float)); + // moves m = load_go_moves("/home/pjreddie/backup/go.test"); + moves m = load_go_moves(filename); int N = m.n; int i; @@ -430,6 +475,23 @@ void valid_go(char *cfgfile, char *weightfile, int multi) } } +int print_game(float *board, FILE *fp) +{ + int i, j; + int count = 3; + fprintf(fp, "komi 6.5\n"); + fprintf(fp, "boardsize 19\n"); + fprintf(fp, "clear_board\n"); + for(j = 0; j < 19; ++j){ + for(i = 0; i < 19; ++i){ + if(board[j*19 + i] == 1) fprintf(fp, "play black %c%d\n", 'A'+i+(i>=8), 19-j); + if(board[j*19 + i] == -1) fprintf(fp, "play white %c%d\n", 'A'+i+(i>=8), 19-j); + if(board[j*19 + i]) ++count; + } + } + return count; +} + void engine_go(char *filename, char *weightfile, int multi) { network net = parse_network_cfg(filename); @@ -456,8 +518,12 @@ void engine_go(char *filename, char *weightfile, int multi) printf("=%s 2\n\n", ids); } else if (!strcmp(buff, "name")){ printf("=%s DarkGo\n\n", ids); + } else if (!strcmp(buff, "time_settings") || !strcmp(buff, "time_left")){ + char *line = fgetl(stdin); + free(line); + printf("=%s \n\n", ids); } else if (!strcmp(buff, "version")){ - printf("=%s 1.0\n\n", ids); + printf("=%s 1.0. Want more DarkGo? 
You can find me on OGS, unlimited games, no waiting! https://online-go.com/user/view/434218\n\n", ids); } else if (!strcmp(buff, "known_command")){ char comm[256]; scanf("%s", comm); @@ -472,11 +538,14 @@ void engine_go(char *filename, char *weightfile, int multi) !strcmp(comm, "komi") || !strcmp(comm, "final_status_list") || !strcmp(comm, "play") || + !strcmp(comm, "genmove_white") || + !strcmp(comm, "genmove_black") || + !strcmp(comm, "fixed_handicap") || !strcmp(comm, "genmove")); if(known) printf("=%s true\n\n", ids); else printf("=%s false\n\n", ids); } else if (!strcmp(buff, "list_commands")){ - printf("=%s protocol_version\nname\nversion\nknown_command\nlist_commands\nquit\nboardsize\nclear_board\nkomi\nplay\ngenmove\nfinal_status_list\n\n", ids); + printf("=%s protocol_version\nshowboard\nname\nversion\nknown_command\nlist_commands\nquit\nboardsize\nclear_board\nkomi\nplay\ngenmove_black\ngenmove_white\ngenmove\nfinal_status_list\nfixed_handicap\n\n", ids); } else if (!strcmp(buff, "quit")){ break; } else if (!strcmp(buff, "boardsize")){ @@ -486,8 +555,17 @@ void engine_go(char *filename, char *weightfile, int multi) if(boardsize != 19){ printf("?%s unacceptable size\n\n", ids); } else { + memset(board, 0, 19*19*sizeof(float)); printf("=%s \n\n", ids); } + } else if (!strcmp(buff, "fixed_handicap")){ + int handicap = 0; + scanf("%d", &handicap); + int indexes[] = {72, 288, 300, 60, 180, 174, 186, 66, 294}; + int i; + for(i = 0; i < handicap; ++i){ + board[indexes[i]] = 1; + } } else if (!strcmp(buff, "clear_board")){ passed = 0; memset(board, 0, 19*19*sizeof(float)); @@ -496,14 +574,24 @@ void engine_go(char *filename, char *weightfile, int multi) float komi = 0; scanf("%f", &komi); printf("=%s \n\n", ids); - } else if (!strcmp(buff, "play")){ + } else if (!strcmp(buff, "showboard")){ + printf("=%s \n", ids); + print_board(stdout, board, 1, 0); + printf("\n"); + } else if (!strcmp(buff, "play") || !strcmp(buff, "black") || !strcmp(buff, "white")){ char 
color[256]; - scanf("%s ", color); + if(!strcmp(buff, "play")) + { + scanf("%s ", color); + } else { + scanf(" "); + color[0] = buff[0]; + } char c; int r; int count = scanf("%c%d", &c, &r); int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1; - if(c == 'p' && count < 2) { + if((c == 'p' || c == 'P') && count < 2) { passed = 1; printf("=%s \n\n", ids); char *line = fgetl(stdin); @@ -527,13 +615,20 @@ void engine_go(char *filename, char *weightfile, int multi) board_to_string(one, board); printf("=%s \n\n", ids); - print_board(board, 1, 0); - } else if (!strcmp(buff, "genmove")){ - char color[256]; - scanf("%s", color); - int player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1; + //print_board(stderr, board, 1, 0); + } else if (!strcmp(buff, "genmove") || !strcmp(buff, "genmove_black") || !strcmp(buff, "genmove_white")){ + int player = 0; + if(!strcmp(buff, "genmove")){ + char color[256]; + scanf("%s", color); + player = (color[0] == 'b' || color[0] == 'B') ? 1 : -1; + } else if (!strcmp(buff, "genmove_black")){ + player = 1; + } else { + player = -1; + } - int index = generate_move(net, player, board, multi, .1, .7, two, 1); + int index = generate_move(net, player, board, multi, .4, 1, two, 0); if(passed || index < 0){ printf("=%s pass\n\n", ids); passed = 0; @@ -550,7 +645,7 @@ void engine_go(char *filename, char *weightfile, int multi) row = 19 - row; if (col >= 8) ++col; printf("=%s %c%d\n\n", ids, 'A' + col, row); - print_board(board, 1, 0); + //print_board(board, 1, 0); } } else if (!strcmp(buff, "p")){ @@ -562,19 +657,10 @@ void engine_go(char *filename, char *weightfile, int multi) char *line = fgetl(stdin); free(line); if(type[0] == 'd' || type[0] == 'D'){ + int i; FILE *f = fopen("game.txt", "w"); - int i, j; - int count = 2; - fprintf(f, "boardsize 19\n"); - fprintf(f, "clear_board\n"); - for(j = 0; j < 19; ++j){ - for(i = 0; i < 19; ++i){ - if(board[j*19 + i] == 1) fprintf(f, "play black %c%d\n", 'A'+i+(i>=8), 19-j); - if(board[j*19 + i] == -1) 
fprintf(f, "play white %c%d\n", 'A'+i+(i>=8), 19-j); - if(board[j*19 + i]) ++count; - } - } - fprintf(f, "final_status_list dead\n"); + int count = print_game(board, f); + fprintf(f, "%s final_status_list dead\n", ids); fclose(f); FILE *p = popen("./gnugo --mode gtp < game.txt", "r"); for(i = 0; i < count; ++i){ @@ -608,44 +694,25 @@ void test_go(char *cfg, char *weights, int multi) srand(time(0)); set_batch_network(&net, 1); float *board = calloc(19*19, sizeof(float)); - float *move = calloc(19*19, sizeof(float)); + float *move = calloc(19*19+1, sizeof(float)); int color = 1; while(1){ - float *output = network_predict(net, board); - copy_cpu(19*19, output, 1, move, 1); int i; - if(multi){ - image bim = float_to_image(19, 19, 1, board); - for(i = 1; i < 8; ++i){ - rotate_image_cw(bim, i); - if(i >= 4) flip_image(bim); - - float *output = network_predict(net, board); - image oim = float_to_image(19, 19, 1, output); - - if(i >= 4) flip_image(oim); - rotate_image_cw(oim, -i); - - axpy_cpu(19*19, 1, output, 1, move, 1); - - if(i >= 4) flip_image(bim); - rotate_image_cw(bim, -i); - } - scal_cpu(19*19, 1./8., move, 1); - } - for(i = 0; i < 19*19; ++i){ - if(board[i]) move[i] = 0; - } + predict_move(net, board, move, multi); int indexes[nind]; int row, col; - top_k(move, 19*19, nind, indexes); - print_board(board, color, indexes); + top_k(move, 19*19+1, nind, indexes); + print_board(stderr, board, color, indexes); for(i = 0; i < nind; ++i){ int index = indexes[i]; row = index / 19; col = index % 19; - printf("%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted)?19 - row : row+1, move[index]*100); + if(row == 19){ + printf("%d: Pass, %.2f%%\n", i+1, move[index]*100); + } else { + printf("%d: %c %d, %.2f%%\n", i+1, col + 'A' + 1*(col > 7 && noi), (inverted)?19 - row : row+1, move[index]*100); + } } //if(color == 1) printf("\u25EF Enter move: "); //else printf("\u25C9 Enter move: "); @@ -663,7 +730,9 @@ void test_go(char *cfg, char *weights, int multi) int 
index = indexes[picked]; row = index / 19; col = index % 19; - board[row*19 + col] = 1; + if(row < 19){ + move_go(board, 1, row, col); + } } } else if (cnum){ if (c <= 'T' && c >= 'A'){ @@ -671,7 +740,7 @@ void test_go(char *cfg, char *weights, int multi) row = (inverted)?19 - row : row-1; col = c - 'A'; if (col > 7 && noi) col -= 1; - if (num == 2) board[row*19 + col] = 1; + if (num == 2) move_go(board, 1, row, col); } else if (c == 'p') { // Pass } else if(c=='b' || c == 'w'){ @@ -698,19 +767,9 @@ void test_go(char *cfg, char *weights, int multi) float score_game(float *board) { + int i; FILE *f = fopen("game.txt", "w"); - int i, j; - int count = 3; - fprintf(f, "komi 6.5\n"); - fprintf(f, "boardsize 19\n"); - fprintf(f, "clear_board\n"); - for(j = 0; j < 19; ++j){ - for(i = 0; i < 19; ++i){ - if(board[j*19 + i] == 1) fprintf(f, "play black %c%d\n", 'A'+i+(i>=8), 19-j); - if(board[j*19 + i] == -1) fprintf(f, "play white %c%d\n", 'A'+i+(i>=8), 19-j); - if(board[j*19 + i]) ++count; - } - } + int count = print_game(board, f); fprintf(f, "final_score\n"); fclose(f); FILE *p = popen("./gnugo --mode gtp < game.txt", "r"); @@ -747,7 +806,7 @@ void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi) } } srand(time(0)); - char boards[300][93]; + char boards[600][93]; int count = 0; set_batch_network(&net, 1); set_batch_network(&net2, 1); @@ -760,13 +819,15 @@ void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi) int p2 = 0; int total = 0; while(1){ - if (done || count >= 300){ + if (done){ float score = score_game(board); - int i = (score > 0)? 0 : 1; if((score > 0) == (total%2==0)) ++p1; else ++p2; ++total; fprintf(stderr, "Total: %d, Player 1: %f, Player 2: %f\n", total, (float)p1/total, (float)p2/total); + sleep(1); + /* + int i = (score > 0)? 
0 : 1; int j; for(; i < count; i += 2){ for(j = 0; j < 93; ++j){ @@ -774,6 +835,7 @@ void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi) } printf("\n"); } + */ memset(board, 0, 19*19*sizeof(float)); player = 1; done = 0; @@ -781,10 +843,10 @@ void self_go(char *filename, char *weightfile, char *f2, char *w2, int multi) fflush(stdout); fflush(stderr); } - //print_board(board, 1, 0); + print_board(stderr, board, 1, 0); //sleep(1); network use = ((total%2==0) == (player==1)) ? net : net2; - int index = generate_move(use, player, board, multi, .1, .7, two, 0); + int index = generate_move(use, player, board, multi, .4, 1, two, 0); if(index < 0){ done = 1; continue; @@ -818,13 +880,37 @@ void run_go(int argc, char **argv) return; } + char *gpu_list = find_char_arg(argc, argv, "-gpus", 0); + int *gpus = 0; + int gpu = 0; + int ngpus = 0; + if(gpu_list){ + printf("%s\n", gpu_list); + int len = strlen(gpu_list); + ngpus = 1; + int i; + for(i = 0; i < len; ++i){ + if (gpu_list[i] == ',') ++ngpus; + } + gpus = calloc(ngpus, sizeof(int)); + for(i = 0; i < ngpus; ++i){ + gpus[i] = atoi(gpu_list); + gpu_list = strchr(gpu_list, ',')+1; + } + } else { + gpu = gpu_index; + gpus = &gpu; + ngpus = 1; + } + int clear = find_arg(argc, argv, "-clear"); + char *cfg = argv[3]; char *weights = (argc > 4) ? argv[4] : 0; char *c2 = (argc > 5) ? argv[5] : 0; char *w2 = (argc > 6) ? 
argv[6] : 0; int multi = find_arg(argc, argv, "-multi"); - if(0==strcmp(argv[2], "train")) train_go(cfg, weights); - else if(0==strcmp(argv[2], "valid")) valid_go(cfg, weights, multi); + if(0==strcmp(argv[2], "train")) train_go(cfg, weights, gpus, ngpus, clear); + else if(0==strcmp(argv[2], "valid")) valid_go(cfg, weights, multi, c2); else if(0==strcmp(argv[2], "self")) self_go(cfg, weights, c2, w2, multi); else if(0==strcmp(argv[2], "test")) test_go(cfg, weights, multi); else if(0==strcmp(argv[2], "engine")) engine_go(cfg, weights, multi); diff --git a/src/image.c b/src/image.c index 5a90efd5..4ce44f2e 100644 --- a/src/image.c +++ b/src/image.c @@ -613,6 +613,21 @@ image float_to_image(int w, int h, int c, float *data) return out; } +void place_image(image im, int w, int h, int dx, int dy, image canvas) +{ + int x, y, c; + for(c = 0; c < im.c; ++c){ + for(y = 0; y < h; ++y){ + for(x = 0; x < w; ++x){ + int rx = ((float)x / w) * im.w; + int ry = ((float)y / h) * im.h; + float val = bilinear_interpolate(im, rx, ry, c); + set_pixel(canvas, x + dx, y + dy, c, val); + } + } + } +} + image rotate_crop_image(image im, float rad, float s, int w, int h, float dx, float dy, float aspect) { @@ -652,6 +667,12 @@ image rotate_image(image im, float rad) return rot; } +void fill_image(image m, float s) +{ + int i; + for(i = 0; i < m.h*m.w*m.c; ++i) m.data[i] = s; +} + void translate_image(image m, float s) { int i; @@ -753,6 +774,27 @@ void composite_3d(char *f1, char *f2, char *out, int delta) #endif } +image letterbox_image(image im, int w, int h) +{ + int new_w = im.w; + int new_h = im.h; + if (((float)w/im.w) < ((float)h/im.h)) { + new_w = w; + new_h = (im.h * w)/im.w; + } else { + new_h = h; + new_w = (im.w * h)/im.h; + } + image resized = resize_image(im, new_w, new_h); + image boxed = make_image(w, h, im.c); + fill_image(boxed, .5); + //int i; + //for(i = 0; i < boxed.w*boxed.h*boxed.c; ++i) boxed.data[i] = 0; + embed_image(resized, boxed, (w-new_w)/2, (h-new_h)/2); + 
free_image(resized); + return boxed; +} + image resize_max(image im, int max) { int w = im.w; @@ -824,6 +866,52 @@ float three_way_min(float a, float b, float c) return (a < b) ? ( (a < c) ? a : c) : ( (b < c) ? b : c) ; } +void yuv_to_rgb(image im) +{ + assert(im.c == 3); + int i, j; + float r, g, b; + float y, u, v; + for(j = 0; j < im.h; ++j){ + for(i = 0; i < im.w; ++i){ + y = get_pixel(im, i , j, 0); + u = get_pixel(im, i , j, 1); + v = get_pixel(im, i , j, 2); + + r = y + 1.13983*v; + g = y + -.39465*u + -.58060*v; + b = y + 2.03211*u; + + set_pixel(im, i, j, 0, r); + set_pixel(im, i, j, 1, g); + set_pixel(im, i, j, 2, b); + } + } +} + +void rgb_to_yuv(image im) +{ + assert(im.c == 3); + int i, j; + float r, g, b; + float y, u, v; + for(j = 0; j < im.h; ++j){ + for(i = 0; i < im.w; ++i){ + r = get_pixel(im, i , j, 0); + g = get_pixel(im, i , j, 1); + b = get_pixel(im, i , j, 2); + + y = .299*r + .587*g + .114*b; + u = -.14713*r + -.28886*g + .436*b; + v = .615*r + -.51499*g + -.10001*b; + + set_pixel(im, i, j, 0, y); + set_pixel(im, i, j, 1, u); + set_pixel(im, i, j, 2, v); + } + } +} + // http://www.cs.rit.edu/~ncs/color/t_convert.html void rgb_to_hsv(image im) { @@ -903,12 +991,30 @@ void hsv_to_rgb(image im) } } +void grayscale_image_3c(image im) +{ + assert(im.c == 3); + int i, j, k; + float scale[] = {0.299, 0.587, 0.114}; + for(j = 0; j < im.h; ++j){ + for(i = 0; i < im.w; ++i){ + float val = 0; + for(k = 0; k < 3; ++k){ + val += scale[k]*get_pixel(im, i, j, k); + } + im.data[0*im.h*im.w + im.w*j + i] = val; + im.data[1*im.h*im.w + im.w*j + i] = val; + im.data[2*im.h*im.w + im.w*j + i] = val; + } + } +} + image grayscale_image(image im) { assert(im.c == 3); int i, j, k; image gray = make_image(im.w, im.h, 1); - float scale[] = {0.587, 0.299, 0.114}; + float scale[] = {0.299, 0.587, 0.114}; for(k = 0; k < im.c; ++k){ for(j = 0; j < im.h; ++j){ for(i = 0; i < im.w; ++i){ diff --git a/src/image.h b/src/image.h index 39c3962d..d5b228f3 100644 --- 
a/src/image.h +++ b/src/image.h @@ -29,25 +29,32 @@ image crop_image(image im, int dx, int dy, int w, int h); image random_crop_image(image im, int w, int h); image random_augment_image(image im, float angle, float aspect, int low, int high, int size); void random_distort_image(image im, float hue, float saturation, float exposure); +image letterbox_image(image im, int w, int h); image resize_image(image im, int w, int h); image resize_min(image im, int min); image resize_max(image im, int max); +void fill_image(image m, float s); void translate_image(image m, float s); void normalize_image(image p); image rotate_image(image m, float rad); void rotate_image_cw(image im, int times); void embed_image(image source, image dest, int dx, int dy); +void place_image(image im, int w, int h, int dx, int dy, image canvas); void saturate_image(image im, float sat); void exposure_image(image im, float sat); void distort_image(image im, float hue, float sat, float val); void saturate_exposure_image(image im, float sat, float exposure); +void rgb_to_hsv(image im); void hsv_to_rgb(image im); +void yuv_to_rgb(image im); +void rgb_to_yuv(image im); void rgbgr_image(image im); void constrain_image(image im); void composite_3d(char *f1, char *f2, char *out, int delta); int best_3d_shift_r(image a, image b, int min, int max); image grayscale_image(image im); +void grayscale_image_3c(image im); image threshold_image(image im, float thresh); image collapse_image_layers(image source, int border); diff --git a/src/layer.c b/src/layer.c index 622cf268..791c5e77 100644 --- a/src/layer.c +++ b/src/layer.c @@ -32,7 +32,6 @@ void free_layer(layer l) if(l.scale_updates) free(l.scale_updates); if(l.weights) free(l.weights); if(l.weight_updates) free(l.weight_updates); - if(l.col_image) free(l.col_image); if(l.delta) free(l.delta); if(l.output) free(l.output); if(l.squared) free(l.squared); @@ -80,7 +79,6 @@ void free_layer(layer l) if(l.rolling_variance_gpu) cuda_free(l.rolling_variance_gpu); 
if(l.variance_delta_gpu) cuda_free(l.variance_delta_gpu); if(l.mean_delta_gpu) cuda_free(l.mean_delta_gpu); - if(l.col_image_gpu) cuda_free(l.col_image_gpu); if(l.x_gpu) cuda_free(l.x_gpu); if(l.x_norm_gpu) cuda_free(l.x_norm_gpu); if(l.weights_gpu) cuda_free(l.weights_gpu); diff --git a/src/layer.h b/src/layer.h index 806542bb..f9ac7247 100644 --- a/src/layer.h +++ b/src/layer.h @@ -38,7 +38,7 @@ typedef enum { } LAYER_TYPE; typedef enum{ - SSE, MASKED, SMOOTH + SSE, MASKED, L1, SMOOTH } COST_TYPE; struct layer{ @@ -58,6 +58,7 @@ struct layer{ int flipped; int inputs; int outputs; + int extra; int truths; int h,w,c; int out_h, out_w, out_c; @@ -68,6 +69,7 @@ struct layer{ int side; int stride; int reverse; + int flatten; int pad; int sqrt; int flip; @@ -76,6 +78,8 @@ struct layer{ int xnor; int steps; int hidden; + int truth; + float smooth; float dot; float angle; float jitter; @@ -83,6 +87,7 @@ struct layer{ float exposure; float shift; float ratio; + float learning_rate_scale; int softmax; int classes; int coords; @@ -115,6 +120,8 @@ struct layer{ int classfix; int absolute; + int onlyforward; + int stopbackward; int dontload; int dontloadscales; @@ -149,7 +156,6 @@ struct layer{ float * weights; float * weight_updates; - float * col_image; float * delta; float * output; float * squared; @@ -235,8 +241,6 @@ struct layer{ float * variance_delta_gpu; float * mean_delta_gpu; - float * col_image_gpu; - float * x_gpu; float * x_norm_gpu; float * weights_gpu; @@ -256,6 +260,7 @@ struct layer{ #ifdef CUDNN cudnnTensorDescriptor_t srcTensorDesc, dstTensorDesc; cudnnTensorDescriptor_t dsrcTensorDesc, ddstTensorDesc; + cudnnTensorDescriptor_t normTensorDesc; cudnnFilterDescriptor_t weightDesc; cudnnFilterDescriptor_t dweightDesc; cudnnConvolutionDescriptor_t convDesc; diff --git a/src/local_layer.c b/src/local_layer.c index 31f0ca6b..9f8a7ec7 100644 --- a/src/local_layer.c +++ b/src/local_layer.c @@ -57,9 +57,10 @@ local_layer make_local_layer(int batch, int h, int w, 
int c, int n, int size, in float scale = sqrt(2./(size*size*c)); for(i = 0; i < c*n*size*size; ++i) l.weights[i] = scale*rand_uniform(-1,1); - l.col_image = calloc(out_h*out_w*size*size*c, sizeof(float)); l.output = calloc(l.batch*out_h * out_w * n, sizeof(float)); l.delta = calloc(l.batch*out_h * out_w * n, sizeof(float)); + + l.workspace_size = out_h*out_w*size*size*c; l.forward = forward_local_layer; l.backward = backward_local_layer; @@ -76,7 +77,6 @@ local_layer make_local_layer(int batch, int h, int w, int c, int n, int size, in l.biases_gpu = cuda_make_array(l.biases, l.outputs); l.bias_updates_gpu = cuda_make_array(l.bias_updates, l.outputs); - l.col_image_gpu = cuda_make_array(l.col_image, out_h*out_w*size*size*c); l.delta_gpu = cuda_make_array(l.delta, l.batch*out_h*out_w*n); l.output_gpu = cuda_make_array(l.output, l.batch*out_h*out_w*n); @@ -102,11 +102,11 @@ void forward_local_layer(const local_layer l, network_state state) for(i = 0; i < l.batch; ++i){ float *input = state.input + i*l.w*l.h*l.c; im2col_cpu(input, l.c, l.h, l.w, - l.size, l.stride, l.pad, l.col_image); + l.size, l.stride, l.pad, state.workspace); float *output = l.output + i*l.outputs; for(j = 0; j < locations; ++j){ float *a = l.weights + j*l.size*l.size*l.c*l.n; - float *b = l.col_image + j; + float *b = state.workspace + j; float *c = output + j; int m = l.n; @@ -133,11 +133,11 @@ void backward_local_layer(local_layer l, network_state state) for(i = 0; i < l.batch; ++i){ float *input = state.input + i*l.w*l.h*l.c; im2col_cpu(input, l.c, l.h, l.w, - l.size, l.stride, l.pad, l.col_image); + l.size, l.stride, l.pad, state.workspace); for(j = 0; j < locations; ++j){ float *a = l.delta + i*l.outputs + j; - float *b = l.col_image + j; + float *b = state.workspace + j; float *c = l.weight_updates + j*l.size*l.size*l.c*l.n; int m = l.n; int n = l.size*l.size*l.c; @@ -150,7 +150,7 @@ void backward_local_layer(local_layer l, network_state state) for(j = 0; j < locations; ++j){ float *a = 
l.weights + j*l.size*l.size*l.c*l.n; float *b = l.delta + i*l.outputs + j; - float *c = l.col_image + j; + float *c = state.workspace + j; int m = l.size*l.size*l.c; int n = 1; @@ -159,7 +159,7 @@ void backward_local_layer(local_layer l, network_state state) gemm(1,0,m,n,k,1,a,m,b,locations,0,c,locations); } - col2im_cpu(l.col_image, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w); + col2im_cpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w); } } } @@ -192,11 +192,11 @@ void forward_local_layer_gpu(const local_layer l, network_state state) for(i = 0; i < l.batch; ++i){ float *input = state.input + i*l.w*l.h*l.c; im2col_ongpu(input, l.c, l.h, l.w, - l.size, l.stride, l.pad, l.col_image_gpu); + l.size, l.stride, l.pad, state.workspace); float *output = l.output_gpu + i*l.outputs; for(j = 0; j < locations; ++j){ float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n; - float *b = l.col_image_gpu + j; + float *b = state.workspace + j; float *c = output + j; int m = l.n; @@ -222,11 +222,11 @@ void backward_local_layer_gpu(local_layer l, network_state state) for(i = 0; i < l.batch; ++i){ float *input = state.input + i*l.w*l.h*l.c; im2col_ongpu(input, l.c, l.h, l.w, - l.size, l.stride, l.pad, l.col_image_gpu); + l.size, l.stride, l.pad, state.workspace); for(j = 0; j < locations; ++j){ float *a = l.delta_gpu + i*l.outputs + j; - float *b = l.col_image_gpu + j; + float *b = state.workspace + j; float *c = l.weight_updates_gpu + j*l.size*l.size*l.c*l.n; int m = l.n; int n = l.size*l.size*l.c; @@ -239,7 +239,7 @@ void backward_local_layer_gpu(local_layer l, network_state state) for(j = 0; j < locations; ++j){ float *a = l.weights_gpu + j*l.size*l.size*l.c*l.n; float *b = l.delta_gpu + i*l.outputs + j; - float *c = l.col_image_gpu + j; + float *c = state.workspace + j; int m = l.size*l.size*l.c; int n = 1; @@ -248,7 +248,7 @@ void backward_local_layer_gpu(local_layer l, network_state state) 
gemm_ongpu(1,0,m,n,k,1,a,m,b,locations,0,c,locations); } - col2im_ongpu(l.col_image_gpu, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w); + col2im_ongpu(state.workspace, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta+i*l.c*l.h*l.w); } } } diff --git a/src/lsd.c b/src/lsd.c new file mode 100644 index 00000000..de29d761 --- /dev/null +++ b/src/lsd.c @@ -0,0 +1,924 @@ +#include "network.h" +#include "cost_layer.h" +#include "utils.h" +#include "parser.h" +#include "blas.h" + +#ifdef OPENCV +#include "opencv2/highgui/highgui_c.h" +#endif + +void train_lsd3(char *fcfg, char *fweight, char *gcfg, char *gweight, char *acfg, char *aweight, int clear) +{ +#ifdef GPU + //char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; + char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list"; + //char *style_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; + char *style_images = "/home/pjreddie/zelda.txt"; + char *backup_directory = "/home/pjreddie/backup/"; + srand(time(0)); + network fnet = load_network(fcfg, fweight, clear); + network gnet = load_network(gcfg, gweight, clear); + network anet = load_network(acfg, aweight, clear); + char *gbase = basecfg(gcfg); + char *abase = basecfg(acfg); + + printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet.learning_rate, gnet.momentum, gnet.decay); + int imgs = gnet.batch*gnet.subdivisions; + int i = *gnet.seen/imgs; + data train, tbuffer; + data style, sbuffer; + + + list *slist = get_paths(style_images); + char **spaths = (char **)list_to_array(slist); + + list *tlist = get_paths(train_images); + char **tpaths = (char **)list_to_array(tlist); + + load_args targs= get_base_args(gnet); + targs.paths = tpaths; + targs.n = imgs; + targs.m = tlist->size; + targs.d = &tbuffer; + targs.type = CLASSIFICATION_DATA; + targs.classes = 1; + char *ls[1] = {"zelda"}; + targs.labels = ls; + + load_args sargs = get_base_args(gnet); + sargs.paths = spaths; + sargs.n = imgs; + sargs.m = 
slist->size; + sargs.d = &sbuffer; + sargs.type = CLASSIFICATION_DATA; + sargs.classes = 1; + sargs.labels = ls; + + pthread_t tload_thread = load_data_in_thread(targs); + pthread_t sload_thread = load_data_in_thread(sargs); + clock_t time; + + float aloss_avg = -1; + float floss_avg = -1; + + network_state fstate = {0}; + fstate.index = 0; + fstate.net = fnet; + int x_size = get_network_input_size(fnet)*fnet.batch; + int y_size = get_network_output_size(fnet)*fnet.batch; + fstate.input = cuda_make_array(0, x_size); + fstate.truth = cuda_make_array(0, y_size); + fstate.delta = cuda_make_array(0, x_size); + fstate.train = 1; + float *X = calloc(x_size, sizeof(float)); + float *y = calloc(y_size, sizeof(float)); + + float *ones = cuda_make_array(0, anet.batch); + float *zeros = cuda_make_array(0, anet.batch); + fill_ongpu(anet.batch, .99, ones, 1); + fill_ongpu(anet.batch, .01, zeros, 1); + + network_state astate = {0}; + astate.index = 0; + astate.net = anet; + int ax_size = get_network_input_size(anet)*anet.batch; + int ay_size = get_network_output_size(anet)*anet.batch; + astate.input = 0; + astate.truth = ones; + astate.delta = cuda_make_array(0, ax_size); + astate.train = 1; + + network_state gstate = {0}; + gstate.index = 0; + gstate.net = gnet; + int gx_size = get_network_input_size(gnet)*gnet.batch; + int gy_size = get_network_output_size(gnet)*gnet.batch; + gstate.input = cuda_make_array(0, gx_size); + gstate.truth = 0; + gstate.delta = 0; + gstate.train = 1; + + while (get_current_batch(gnet) < gnet.max_batches) { + i += 1; + time=clock(); + pthread_join(tload_thread, 0); + pthread_join(sload_thread, 0); + train = tbuffer; + style = sbuffer; + tload_thread = load_data_in_thread(targs); + sload_thread = load_data_in_thread(sargs); + + printf("Loaded: %lf seconds\n", sec(clock()-time)); + + data generated = copy_data(train); + time=clock(); + + int j, k; + float floss = 0; + for(j = 0; j < fnet.subdivisions; ++j){ + layer imlayer = gnet.layers[gnet.n - 1]; + 
get_next_batch(train, fnet.batch, j*fnet.batch, X, y); + + cuda_push_array(fstate.input, X, x_size); + cuda_push_array(gstate.input, X, gx_size); + *gnet.seen += gnet.batch; + + forward_network_gpu(fnet, fstate); + float *feats = fnet.layers[fnet.n - 2].output_gpu; + copy_ongpu(y_size, feats, 1, fstate.truth, 1); + + forward_network_gpu(gnet, gstate); + float *gen = gnet.layers[gnet.n-1].output_gpu; + copy_ongpu(x_size, gen, 1, fstate.input, 1); + + fill_ongpu(x_size, 0, fstate.delta, 1); + forward_network_gpu(fnet, fstate); + backward_network_gpu(fnet, fstate); + //HERE + + astate.input = gen; + fill_ongpu(ax_size, 0, astate.delta, 1); + forward_network_gpu(anet, astate); + backward_network_gpu(anet, astate); + + float *delta = imlayer.delta_gpu; + fill_ongpu(x_size, 0, delta, 1); + scal_ongpu(x_size, 100, astate.delta, 1); + scal_ongpu(x_size, .00001, fstate.delta, 1); + axpy_ongpu(x_size, 1, fstate.delta, 1, delta, 1); + axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1); + + //fill_ongpu(x_size, 0, delta, 1); + //cuda_push_array(delta, X, x_size); + //axpy_ongpu(x_size, -1, imlayer.output_gpu, 1, delta, 1); + //printf("pix error: %f\n", cuda_mag_array(delta, x_size)); + printf("fea error: %f\n", cuda_mag_array(fstate.delta, x_size)); + printf("adv error: %f\n", cuda_mag_array(astate.delta, x_size)); + //axpy_ongpu(x_size, 1, astate.delta, 1, delta, 1); + + backward_network_gpu(gnet, gstate); + + floss += get_network_cost(fnet) /(fnet.subdivisions*fnet.batch); + + cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size); + for(k = 0; k < gnet.batch; ++k){ + int index = j*gnet.batch + k; + copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1); + generated.y.vals[index][0] = .01; + } + } + +/* + image sim = float_to_image(anet.w, anet.h, anet.c, style.X.vals[j]); + show_image(sim, "style"); + cvWaitKey(0); + */ + + harmless_update_network_gpu(anet); + + data merge = concat_data(style, generated); + randomize_data(merge); + 
float aloss = train_network(anet, merge); + + update_network_gpu(gnet); + + free_data(merge); + free_data(train); + free_data(generated); + free_data(style); + if (aloss_avg < 0) aloss_avg = aloss; + if (floss_avg < 0) floss_avg = floss; + aloss_avg = aloss_avg*.9 + aloss*.1; + floss_avg = floss_avg*.9 + floss*.1; + + printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, floss, aloss, floss_avg, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs); + if(i%1000==0){ + char buff[256]; + sprintf(buff, "%s/%s_%d.weights", backup_directory, gbase, i); + save_weights(gnet, buff); + sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i); + save_weights(anet, buff); + } + if(i%100==0){ + char buff[256]; + sprintf(buff, "%s/%s.backup", backup_directory, gbase); + save_weights(gnet, buff); + sprintf(buff, "%s/%s.backup", backup_directory, abase); + save_weights(anet, buff); + } + } +#endif +} + +void train_pix2pix(char *cfg, char *weight, char *acfg, char *aweight, int clear) +{ +#ifdef GPU + //char *train_images = "/home/pjreddie/data/coco/train1.txt"; + //char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; + char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list"; + char *backup_directory = "/home/pjreddie/backup/"; + srand(time(0)); + char *base = basecfg(cfg); + char *abase = basecfg(acfg); + printf("%s\n", base); + network net = load_network(cfg, weight, clear); + network anet = load_network(acfg, aweight, clear); + + int i, j, k; + layer imlayer = {0}; + for (i = 0; i < net.n; ++i) { + if (net.layers[i].out_c == 3) { + imlayer = net.layers[i]; + break; + } + } + + printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); + int imgs = net.batch*net.subdivisions; + i = *net.seen/imgs; + data train, buffer; + + + list *plist = get_paths(train_images); + //int N = plist->size; + char **paths = (char **)list_to_array(plist); + + load_args args 
= {0}; + args.w = net.w; + args.h = net.h; + args.paths = paths; + args.n = imgs; + args.m = plist->size; + args.d = &buffer; + + args.min = net.min_crop; + args.max = net.max_crop; + args.angle = net.angle; + args.aspect = net.aspect; + args.exposure = net.exposure; + args.saturation = net.saturation; + args.hue = net.hue; + args.size = net.w; + args.type = CLASSIFICATION_DATA; + args.classes = 1; + char *ls[1] = {"coco"}; + args.labels = ls; + + pthread_t load_thread = load_data_in_thread(args); + clock_t time; + + network_state gstate = {0}; + gstate.index = 0; + gstate.net = net; + int x_size = get_network_input_size(net)*net.batch; + int y_size = x_size; + gstate.input = cuda_make_array(0, x_size); + gstate.truth = cuda_make_array(0, y_size); + gstate.delta = 0; + gstate.train = 1; + float *pixs = calloc(x_size, sizeof(float)); + float *graypixs = calloc(x_size, sizeof(float)); + float *y = calloc(y_size, sizeof(float)); + + network_state astate = {0}; + astate.index = 0; + astate.net = anet; + int ay_size = get_network_output_size(anet)*anet.batch; + astate.input = 0; + astate.truth = 0; + astate.delta = 0; + astate.train = 1; + + float *imerror = cuda_make_array(0, imlayer.outputs); + float *ones_gpu = cuda_make_array(0, ay_size); + fill_ongpu(ay_size, .9, ones_gpu, 1); + + float aloss_avg = -1; + float gloss_avg = -1; + + //data generated = copy_data(train); + + while (get_current_batch(net) < net.max_batches) { + i += 1; + time=clock(); + pthread_join(load_thread, 0); + train = buffer; + load_thread = load_data_in_thread(args); + + printf("Loaded: %lf seconds\n", sec(clock()-time)); + + data gray = copy_data(train); + for(j = 0; j < imgs; ++j){ + image gim = float_to_image(net.w, net.h, net.c, gray.X.vals[j]); + grayscale_image_3c(gim); + train.y.vals[j][0] = .9; + + image yim = float_to_image(net.w, net.h, net.c, train.X.vals[j]); + //rgb_to_yuv(yim); + } + time=clock(); + float gloss = 0; + + for(j = 0; j < net.subdivisions; ++j){ + get_next_batch(train, 
net.batch, j*net.batch, pixs, y);
+            get_next_batch(gray, net.batch, j*net.batch, graypixs, y);
+            cuda_push_array(gstate.input, graypixs, x_size);
+            cuda_push_array(gstate.truth, pixs, x_size);
+            /*
+               image origi = float_to_image(net.w, net.h, 3, pixs);
+               image grayi = float_to_image(net.w, net.h, 3, graypixs);
+               show_image(grayi, "gray");
+               show_image(origi, "orig");
+               cvWaitKey(0);
+             */
+            *net.seen += net.batch;
+            forward_network_gpu(net, gstate);
+
+            /* Run the adversary on the generator output against ones_gpu;
+               imerror is handed to it as the delta buffer for its input. */
+            fill_ongpu(imlayer.outputs, 0, imerror, 1);
+            astate.input = imlayer.output_gpu;
+            astate.delta = imerror;
+            astate.truth = ones_gpu;
+            forward_network_gpu(anet, astate);
+            backward_network_gpu(anet, astate);
+
+            scal_ongpu(imlayer.outputs, .1, net.layers[net.n-1].delta_gpu, 1);
+
+            backward_network_gpu(net, gstate);
+
+            scal_ongpu(imlayer.outputs, 100, imerror, 1);
+
+            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));
+            printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs));
+
+            axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
+
+            gloss += get_network_cost(net) /(net.subdivisions*net.batch);
+
+            /* Overwrite this sub-batch of `gray` with the generator output and
+               label those rows .1; they are concatenated with `train` below. */
+            cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size);
+            for(k = 0; k < net.batch; ++k){
+                int index = j*net.batch + k;
+                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1);
+                gray.y.vals[index][0] = .1;
+            }
+        }
+        harmless_update_network_gpu(anet);
+
+        data merge = concat_data(train, gray);
+        randomize_data(merge);
+        float aloss = train_network(anet, merge);
+
+        update_network_gpu(net);
+        update_network_gpu(anet);
+        free_data(merge);
+        free_data(train);
+        free_data(gray);
+        /* Both running averages start at -1 as a "not yet seeded" marker;
+           seed each with its first observed loss before smoothing. */
+        if (aloss_avg < 0) aloss_avg = aloss;
+        if (gloss_avg < 0) gloss_avg = gloss;
+        aloss_avg = aloss_avg*.9 + aloss*.1;
+        gloss_avg = gloss_avg*.9 + gloss*.1;
+
+        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
+        if(i%1000==0){
+            char buff[256];
+ 
sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i); + save_weights(net, buff); + sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i); + save_weights(anet, buff); + } + if(i%100==0){ + char buff[256]; + sprintf(buff, "%s/%s.backup", backup_directory, base); + save_weights(net, buff); + sprintf(buff, "%s/%s.backup", backup_directory, abase); + save_weights(anet, buff); + } + } + char buff[256]; + sprintf(buff, "%s/%s_final.weights", backup_directory, base); + save_weights(net, buff); +#endif +} + +void train_colorizer(char *cfg, char *weight, char *acfg, char *aweight, int clear) +{ +#ifdef GPU + //char *train_images = "/home/pjreddie/data/coco/train1.txt"; + //char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; + char *train_images = "/home/pjreddie/data/imagenet/imagenet1k.train.list"; + char *backup_directory = "/home/pjreddie/backup/"; + srand(time(0)); + char *base = basecfg(cfg); + char *abase = basecfg(acfg); + printf("%s\n", base); + network net = load_network(cfg, weight, clear); + network anet = load_network(acfg, aweight, clear); + + int i, j, k; + layer imlayer = {0}; + for (i = 0; i < net.n; ++i) { + if (net.layers[i].out_c == 3) { + imlayer = net.layers[i]; + break; + } + } + + printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); + int imgs = net.batch*net.subdivisions; + i = *net.seen/imgs; + data train, buffer; + + + list *plist = get_paths(train_images); + //int N = plist->size; + char **paths = (char **)list_to_array(plist); + + load_args args = {0}; + args.w = net.w; + args.h = net.h; + args.paths = paths; + args.n = imgs; + args.m = plist->size; + args.d = &buffer; + + args.min = net.min_crop; + args.max = net.max_crop; + args.angle = net.angle; + args.aspect = net.aspect; + args.exposure = net.exposure; + args.saturation = net.saturation; + args.hue = net.hue; + args.size = net.w; + args.type = CLASSIFICATION_DATA; + args.classes = 1; + char *ls[1] = {"imagenet"}; + 
args.labels = ls; + + pthread_t load_thread = load_data_in_thread(args); + clock_t time; + + network_state gstate = {0}; + gstate.index = 0; + gstate.net = net; + int x_size = get_network_input_size(net)*net.batch; + int y_size = x_size; + gstate.input = cuda_make_array(0, x_size); + gstate.truth = cuda_make_array(0, y_size); + gstate.delta = 0; + gstate.train = 1; + float *pixs = calloc(x_size, sizeof(float)); + float *graypixs = calloc(x_size, sizeof(float)); + float *y = calloc(y_size, sizeof(float)); + + network_state astate = {0}; + astate.index = 0; + astate.net = anet; + int ay_size = get_network_output_size(anet)*anet.batch; + astate.input = 0; + astate.truth = 0; + astate.delta = 0; + astate.train = 1; + + float *imerror = cuda_make_array(0, imlayer.outputs); + float *ones_gpu = cuda_make_array(0, ay_size); + fill_ongpu(ay_size, .99, ones_gpu, 1); + + float aloss_avg = -1; + float gloss_avg = -1; + + //data generated = copy_data(train); + + while (get_current_batch(net) < net.max_batches) { + i += 1; + time=clock(); + pthread_join(load_thread, 0); + train = buffer; + load_thread = load_data_in_thread(args); + + printf("Loaded: %lf seconds\n", sec(clock()-time)); + + data gray = copy_data(train); + for(j = 0; j < imgs; ++j){ + image gim = float_to_image(net.w, net.h, net.c, gray.X.vals[j]); + grayscale_image_3c(gim); + train.y.vals[j][0] = .99; + + image yim = float_to_image(net.w, net.h, net.c, train.X.vals[j]); + //rgb_to_yuv(yim); + } + time=clock(); + float gloss = 0; + + for(j = 0; j < net.subdivisions; ++j){ + get_next_batch(train, net.batch, j*net.batch, pixs, y); + get_next_batch(gray, net.batch, j*net.batch, graypixs, y); + cuda_push_array(gstate.input, graypixs, x_size); + cuda_push_array(gstate.truth, pixs, x_size); + /* + image origi = float_to_image(net.w, net.h, 3, pixs); + image grayi = float_to_image(net.w, net.h, 3, graypixs); + show_image(grayi, "gray"); + show_image(origi, "orig"); + cvWaitKey(0); + */ + *net.seen += net.batch; + 
forward_network_gpu(net, gstate);
+
+            /* Run the adversary on the generator output against ones_gpu;
+               imerror is handed to it as the delta buffer for its input. */
+            fill_ongpu(imlayer.outputs, 0, imerror, 1);
+            astate.input = imlayer.output_gpu;
+            astate.delta = imerror;
+            astate.truth = ones_gpu;
+            forward_network_gpu(anet, astate);
+            backward_network_gpu(anet, astate);
+
+            scal_ongpu(imlayer.outputs, .1, net.layers[net.n-1].delta_gpu, 1);
+
+            backward_network_gpu(net, gstate);
+
+            scal_ongpu(imlayer.outputs, 100, imerror, 1);
+
+            printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs));
+            printf("features %f\n", cuda_mag_array(net.layers[net.n-1].delta_gpu, imlayer.outputs));
+
+            axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1);
+
+            gloss += get_network_cost(net) /(net.subdivisions*net.batch);
+
+            /* Overwrite this sub-batch of `gray` with the generator output and
+               label those rows .01; they are concatenated with `train` below. */
+            cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size);
+            for(k = 0; k < net.batch; ++k){
+                int index = j*net.batch + k;
+                copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, gray.X.vals[index], 1);
+                gray.y.vals[index][0] = .01;
+            }
+        }
+        harmless_update_network_gpu(anet);
+
+        data merge = concat_data(train, gray);
+        randomize_data(merge);
+        float aloss = train_network(anet, merge);
+
+        update_network_gpu(net);
+        update_network_gpu(anet);
+        free_data(merge);
+        free_data(train);
+        free_data(gray);
+        /* Both running averages start at -1 as a "not yet seeded" marker;
+           seed each with its first observed loss before smoothing. */
+        if (aloss_avg < 0) aloss_avg = aloss;
+        if (gloss_avg < 0) gloss_avg = gloss;
+        aloss_avg = aloss_avg*.9 + aloss*.1;
+        gloss_avg = gloss_avg*.9 + gloss*.1;
+
+        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
+        if(i%1000==0){
+            char buff[256];
+            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
+            save_weights(anet, buff);
+        }
+        if(i%100==0){
+            char buff[256];
+            sprintf(buff, "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+            sprintf(buff, "%s/%s.backup", backup_directory, abase);
+            save_weights(anet, buff);
+        }
+    }
+    char buff[256];
+ 
sprintf(buff, "%s/%s_final.weights", backup_directory, base); + save_weights(net, buff); +#endif +} + +void train_lsd2(char *cfgfile, char *weightfile, char *acfgfile, char *aweightfile, int clear) +{ +#ifdef GPU + char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt"; + char *backup_directory = "/home/pjreddie/backup/"; + srand(time(0)); + char *base = basecfg(cfgfile); + printf("%s\n", base); + network net = parse_network_cfg(cfgfile); + if(weightfile){ + load_weights(&net, weightfile); + } + if(clear) *net.seen = 0; + + char *abase = basecfg(acfgfile); + network anet = parse_network_cfg(acfgfile); + if(aweightfile){ + load_weights(&anet, aweightfile); + } + if(clear) *anet.seen = 0; + + int i, j, k; + layer imlayer = {0}; + for (i = 0; i < net.n; ++i) { + if (net.layers[i].out_c == 3) { + imlayer = net.layers[i]; + break; + } + } + + printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); + int imgs = net.batch*net.subdivisions; + i = *net.seen/imgs; + data train, buffer; + + + list *plist = get_paths(train_images); + //int N = plist->size; + char **paths = (char **)list_to_array(plist); + + load_args args = {0}; + args.w = net.w; + args.h = net.h; + args.paths = paths; + args.n = imgs; + args.m = plist->size; + args.d = &buffer; + + args.min = net.min_crop; + args.max = net.max_crop; + args.angle = net.angle; + args.aspect = net.aspect; + args.exposure = net.exposure; + args.saturation = net.saturation; + args.hue = net.hue; + args.size = net.w; + args.type = CLASSIFICATION_DATA; + args.classes = 1; + char *ls[1] = {"coco"}; + args.labels = ls; + + pthread_t load_thread = load_data_in_thread(args); + clock_t time; + + network_state gstate = {0}; + gstate.index = 0; + gstate.net = net; + int x_size = get_network_input_size(net)*net.batch; + int y_size = 1*net.batch; + gstate.input = cuda_make_array(0, x_size); + gstate.truth = 0; + gstate.delta = 0; + gstate.train = 1; + float *X = calloc(x_size, 
sizeof(float)); + float *y = calloc(y_size, sizeof(float)); + + network_state astate = {0}; + astate.index = 0; + astate.net = anet; + int ay_size = get_network_output_size(anet)*anet.batch; + astate.input = 0; + astate.truth = 0; + astate.delta = 0; + astate.train = 1; + + float *imerror = cuda_make_array(0, imlayer.outputs); + float *ones_gpu = cuda_make_array(0, ay_size); + fill_ongpu(ay_size, 1, ones_gpu, 1); + + float aloss_avg = -1; + float gloss_avg = -1; + + //data generated = copy_data(train); + + while (get_current_batch(net) < net.max_batches) { + i += 1; + time=clock(); + pthread_join(load_thread, 0); + train = buffer; + load_thread = load_data_in_thread(args); + + printf("Loaded: %lf seconds\n", sec(clock()-time)); + + data generated = copy_data(train); + time=clock(); + float gloss = 0; + + for(j = 0; j < net.subdivisions; ++j){ + get_next_batch(train, net.batch, j*net.batch, X, y); + cuda_push_array(gstate.input, X, x_size); + *net.seen += net.batch; + forward_network_gpu(net, gstate); + + fill_ongpu(imlayer.outputs, 0, imerror, 1); + astate.input = imlayer.output_gpu; + astate.delta = imerror; + astate.truth = ones_gpu; + forward_network_gpu(anet, astate); + backward_network_gpu(anet, astate); + + scal_ongpu(imlayer.outputs, 1, imerror, 1); + axpy_ongpu(imlayer.outputs, 1, imerror, 1, imlayer.delta_gpu, 1); + + backward_network_gpu(net, gstate); + + printf("features %f\n", cuda_mag_array(imlayer.delta_gpu, imlayer.outputs)); + printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs)); + + gloss += get_network_cost(net) /(net.subdivisions*net.batch); + + cuda_pull_array(imlayer.output_gpu, imlayer.output, x_size); + for(k = 0; k < net.batch; ++k){ + int index = j*net.batch + k; + copy_cpu(imlayer.outputs, imlayer.output + k*imlayer.outputs, 1, generated.X.vals[index], 1); + generated.y.vals[index][0] = 0; + } + } + harmless_update_network_gpu(anet); + + data merge = concat_data(train, generated); + randomize_data(merge); + float aloss = 
train_network(anet, merge);
+
+        update_network_gpu(net);
+        update_network_gpu(anet);
+        free_data(merge);
+        free_data(train);
+        free_data(generated);
+        /* Both running averages start at -1 as a "not yet seeded" marker;
+           seed each with its first observed loss before smoothing. */
+        if (aloss_avg < 0) aloss_avg = aloss;
+        if (gloss_avg < 0) gloss_avg = gloss;
+        aloss_avg = aloss_avg*.9 + aloss*.1;
+        gloss_avg = gloss_avg*.9 + gloss*.1;
+
+        printf("%d: gen: %f, adv: %f | gen_avg: %f, adv_avg: %f, %f rate, %lf seconds, %d images\n", i, gloss, aloss, gloss_avg, aloss_avg, get_current_rate(net), sec(clock()-time), i*imgs);
+        if(i%1000==0){
+            char buff[256];
+            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
+            save_weights(anet, buff);
+        }
+        if(i%100==0){
+            char buff[256];
+            sprintf(buff, "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+            sprintf(buff, "%s/%s.backup", backup_directory, abase);
+            save_weights(anet, buff);
+        }
+    }
+    char buff[256];
+    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+#endif
+}
+
+void train_lsd(char *cfgfile, char *weightfile, int clear)
+{
+    char *train_images = "/home/pjreddie/data/coco/trainvalno5k.txt";
+    char *backup_directory = "/home/pjreddie/backup/";
+    srand(time(0));
+    char *base = basecfg(cfgfile);
+    printf("%s\n", base);
+    float avg_loss = -1;
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    if(clear) *net.seen = 0;
+    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
+    int imgs = net.batch*net.subdivisions;
+    int i = *net.seen/imgs;
+    data train, buffer;
+
+
+    list *plist = get_paths(train_images);
+    //int N = plist->size;
+    char **paths = (char **)list_to_array(plist);
+
+    load_args args = {0};
+    args.w = net.w;
+    args.h = net.h;
+    args.paths = paths;
+    args.n = imgs;
+    args.m = plist->size;
+    args.d = &buffer;
+
+    args.min = net.min_crop;
+    args.max = net.max_crop;
+    args.angle = net.angle;
+    args.aspect = net.aspect;
+    args.exposure 
= net.exposure;
+    args.saturation = net.saturation;
+    args.hue = net.hue;
+    args.size = net.w;
+    args.type = CLASSIFICATION_DATA;
+    args.classes = 1;
+    char *ls[1] = {"coco"};
+    args.labels = ls;
+
+    pthread_t load_thread = load_data_in_thread(args);
+    clock_t time;
+    //while(i*imgs < N*120){
+    while(get_current_batch(net) < net.max_batches){
+        i += 1;
+        time=clock();
+        pthread_join(load_thread, 0);
+        train = buffer;
+        load_thread = load_data_in_thread(args);
+
+        printf("Loaded: %lf seconds\n", sec(clock()-time));
+
+        time=clock();
+        float loss = train_network(net, train);
+        if (avg_loss < 0) avg_loss = loss;
+        avg_loss = avg_loss*.9 + loss*.1;
+
+        printf("%d: %f, %f avg, %f rate, %lf seconds, %d images\n", i, loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs);
+        if(i%1000==0){
+            char buff[256];
+            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
+            save_weights(net, buff);
+        }
+        if(i%100==0){
+            char buff[256];
+            sprintf(buff, "%s/%s.backup", backup_directory, base);
+            save_weights(net, buff);
+        }
+        free_data(train);
+    }
+    char buff[256];
+    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
+    save_weights(net, buff);
+}
+
+/*
+ * Load a network, run it on one image (or on paths read from stdin when
+ * filename is 0) and show/save the output of the first layer with out_c == 3.
+ */
+void test_lsd(char *cfgfile, char *weightfile, char *filename)
+{
+    network net = parse_network_cfg(cfgfile);
+    if(weightfile){
+        load_weights(&net, weightfile);
+    }
+    set_batch_network(&net, 1);
+    srand(2222222);
+
+    clock_t time;
+    char buff[256];
+    char *input = buff;
+    int i, imlayer = 0;
+
+    for (i = 0; i < net.n; ++i) {
+        if (net.layers[i].out_c == 3) {
+            imlayer = i;
+            printf("%d\n", i);
+            break;
+        }
+    }
+
+    while(1){
+        if(filename){
+            /* strncpy leaves the buffer unterminated when filename has 256 or
+               more characters; snprintf always writes the terminator. */
+            snprintf(input, sizeof(buff), "%s", filename);
+        }else{
+            printf("Enter Image Path: ");
+            fflush(stdout);
+            input = fgets(input, 256, stdin);
+            if(!input) return;
+            strtok(input, "\n");
+        }
+        image im = load_image_color(input, 0, 0);
+        image resized = resize_min(im, net.w);
+        image crop = crop_image(resized, (resized.w - net.w)/2, (resized.h - net.h)/2, net.w, net.h);
+ 
//grayscale_image_3c(crop);
+
+        float *X = crop.data;
+        time=clock();
+        network_predict(net, X);
+        image out = get_network_image_layer(net, imlayer);
+        //yuv_to_rgb(out);
+        constrain_image(out);
+        printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
+        show_image(out, "out");
+        show_image(crop, "crop");
+        save_image(out, "out");
+#ifdef OPENCV
+        cvWaitKey(0);
+#endif
+
+        free_image(im);
+        free_image(resized);
+        free_image(crop);
+        if (filename) break;
+    }
+}
+
+
+/*
+ * Command-line dispatcher for the lsd sub-commands.
+ * argv[2] selects the mode and argv[3] is the first cfg file. Every later
+ * positional argument is optional and is passed on as 0 when absent.
+ */
+void run_lsd(int argc, char **argv)
+{
+    if(argc < 4){
+        fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]);
+        return;
+    }
+
+    int clear = find_arg(argc, argv, "-clear");
+
+    /* Only index argv below argc: argv[argc] is the terminating null
+       pointer and anything beyond it is out of bounds. */
+    char *cfg = argv[3];
+    char *weights = (argc > 4) ? argv[4] : 0;
+    char *filename = (argc > 5) ? argv[5] : 0;
+    char *acfg = (argc > 5) ? argv[5] : 0;
+    char *aweights = (argc > 6) ? argv[6] : 0;
+    char *arg7 = (argc > 7) ? argv[7] : 0;
+    char *arg8 = (argc > 8) ? argv[8] : 0;
+
+    /* train2 and traincolor parse a second network from acfg, so refuse to
+       start without it instead of handing a null path to the parser. */
+    int needs_acfg = 0==strcmp(argv[2], "train2") || 0==strcmp(argv[2], "traincolor");
+    if(needs_acfg && !acfg){
+        fprintf(stderr, "usage: %s %s %s [cfg] [weights] [second cfg] [second weights (optional)]\n", argv[0], argv[1], argv[2]);
+        return;
+    }
+
+    if(0==strcmp(argv[2], "train")) train_lsd(cfg, weights, clear);
+    else if(0==strcmp(argv[2], "train2")) train_lsd2(cfg, weights, acfg, aweights, clear);
+    else if(0==strcmp(argv[2], "traincolor")) train_colorizer(cfg, weights, acfg, aweights, clear);
+    else if(0==strcmp(argv[2], "train3")) train_lsd3(cfg, weights, acfg, aweights, arg7, arg8, clear);
+    else if(0==strcmp(argv[2], "test")) test_lsd(cfg, weights, filename);
+    /*
+       else if(0==strcmp(argv[2], "valid")) validate_lsd(cfg, weights);
+     */
+}
diff --git a/src/matrix.c b/src/matrix.c
index ee149799..799916bf 100644
--- a/src/matrix.c
+++ b/src/matrix.c
@@ -1,5 +1,6 @@
 #include "matrix.h"
 #include "utils.h"
+#include "blas.h"
 #include
 #include
 #include
@@ -73,6 +74,20 @@ void matrix_add_matrix(matrix from, matrix to)
     }
 }
 
+matrix copy_matrix(matrix m)
+{
+    matrix c = {0};
+    c.rows = m.rows;
+    c.cols = m.cols;
+    c.vals = calloc(c.rows, sizeof(float *));
+    int i;
+    for(i = 0; i < c.rows; ++i){
+        c.vals[i] = calloc(c.cols, sizeof(float));
+        copy_cpu(c.cols, m.vals[i], 1, c.vals[i], 1);
+    }
+    return c;
+}
+
 matrix make_matrix(int rows, 
int cols) { int i; diff --git a/src/matrix.h b/src/matrix.h index 641b5965..d6655eb4 100644 --- a/src/matrix.h +++ b/src/matrix.h @@ -6,6 +6,7 @@ typedef struct matrix{ } matrix; matrix make_matrix(int rows, int cols); +matrix copy_matrix(matrix m); void free_matrix(matrix m); void print_matrix(matrix m); diff --git a/src/network.c b/src/network.c index 0914e37e..0d30dd8c 100644 --- a/src/network.c +++ b/src/network.c @@ -27,6 +27,35 @@ #include "dropout_layer.h" #include "route_layer.h" #include "shortcut_layer.h" +#include "parser.h" +#include "data.h" + +load_args get_base_args(network net) +{ + load_args args = {0}; + args.w = net.w; + args.h = net.h; + args.size = net.w; + + args.min = net.min_crop; + args.max = net.max_crop; + args.angle = net.angle; + args.aspect = net.aspect; + args.exposure = net.exposure; + args.saturation = net.saturation; + args.hue = net.hue; + return args; +} + +network load_network(char *cfg, char *weights, int clear) +{ + network net = parse_network_cfg(cfg); + if(weights && weights[0] != 0){ + load_weights(&net, weights); + } + if(clear) *net.seen = 0; + return net; +} int get_current_batch(network net) { @@ -50,6 +79,7 @@ float get_current_rate(network net) int batch_num = get_current_batch(net); int i; float rate; + if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power); switch (net.policy) { case CONSTANT: return net.learning_rate; @@ -66,7 +96,6 @@ float get_current_rate(network net) case EXP: return net.learning_rate * pow(net.gamma, batch_num); case POLY: - if (batch_num < net.burn_in) return net.learning_rate * pow((float)batch_num / net.burn_in, net.power); return net.learning_rate * pow(1 - (float)batch_num / net.max_batches, net.power); case RANDOM: return net.learning_rate * pow(rand_uniform(0,1), net.power); @@ -150,7 +179,7 @@ void forward_network(network net, network_state state) state.index = i; layer l = net.layers[i]; if(l.delta){ - scal_cpu(l.outputs * l.batch, 0, 
l.delta, 1); + fill_cpu(l.outputs * l.batch, 0, l.delta, 1); } l.forward(l, state); state.input = l.output; @@ -165,7 +194,7 @@ void update_network(network net) for(i = 0; i < net.n; ++i){ layer l = net.layers[i]; if(l.update){ - l.update(l, update_batch, rate, net.momentum, net.decay); + l.update(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay); } } } @@ -218,6 +247,7 @@ void backward_network(network net, network_state state) state.delta = prev.delta; } layer l = net.layers[i]; + if(l.stopbackward) break; l.backward(l, state); } } @@ -414,6 +444,9 @@ detection_layer get_network_detection_layer(network net) image get_network_image_layer(network net, int i) { layer l = net.layers[i]; + #ifdef GPU + cuda_pull_array(l.output_gpu, l.output, l.outputs); + #endif if (l.out_w && l.out_h && l.out_c){ return float_to_image(l.out_w, l.out_h, l.out_c, l.output); } diff --git a/src/network.h b/src/network.h index e48cbc28..20c75b61 100644 --- a/src/network.h +++ b/src/network.h @@ -43,6 +43,7 @@ typedef struct network{ float eps; int inputs; + int notruth; int h, w, c; int max_crop; int min_crop; @@ -82,6 +83,7 @@ float *get_network_output_gpu(network net); void forward_network_gpu(network net, network_state state); void backward_network_gpu(network net, network_state state); void update_network_gpu(network net); +void harmless_update_network_gpu(network net); #endif float get_current_rate(network net); @@ -121,6 +123,8 @@ int resize_network(network *net, int w, int h); void set_batch_network(network *net, int b); int get_network_input_size(network net); float get_network_cost(network net); +network load_network(char *cfg, char *weights, int clear); +load_args get_base_args(network net); int get_network_nuisance(network net); int get_network_background(network net); diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 313cd6d1..8088d725 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -50,8 +50,10 @@ void 
forward_network_gpu(network net, network_state state) if(l.delta_gpu){ fill_ongpu(l.outputs * l.batch, 0, l.delta_gpu, 1); } + //if(l.c ==3 && i > 5) state.input = *net.input_gpu; l.forward_gpu(l, state); state.input = l.output_gpu; + if(l.truth) state.truth = l.output_gpu; } } @@ -64,6 +66,7 @@ void backward_network_gpu(network net, network_state state) for(i = net.n-1; i >= 0; --i){ state.index = i; layer l = net.layers[i]; + if(l.stopbackward) break; if(i == 0){ state.input = original_input; state.delta = original_delta; @@ -86,11 +89,18 @@ void update_network_gpu(network net) layer l = net.layers[i]; l.t = get_current_batch(net); if(l.update_gpu){ - l.update_gpu(l, update_batch, rate, net.momentum, net.decay); + l.update_gpu(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay); } } } +void harmless_update_network_gpu(network net) +{ + net.learning_rate = 0; + net.momentum = 1; + update_network_gpu(net); +} + void forward_backward_network_gpu(network net, float *x, float *y) { network_state state; @@ -101,10 +111,10 @@ void forward_backward_network_gpu(network net, float *x, float *y) if(net.layers[net.n-1].truths) y_size = net.layers[net.n-1].truths*net.batch; if(!*net.input_gpu){ *net.input_gpu = cuda_make_array(x, x_size); - *net.truth_gpu = cuda_make_array(y, y_size); + if(!net.notruth) *net.truth_gpu = cuda_make_array(y, y_size); }else{ cuda_push_array(*net.input_gpu, x, x_size); - cuda_push_array(*net.truth_gpu, y, y_size); + if(!net.notruth) cuda_push_array(*net.truth_gpu, y, y_size); } state.input = *net.input_gpu; state.delta = 0; @@ -180,7 +190,7 @@ void update_layer(layer l, network net) float rate = get_current_rate(net); l.t = get_current_batch(net); if(l.update_gpu){ - l.update_gpu(l, update_batch, rate, net.momentum, net.decay); + l.update_gpu(l, update_batch, rate*l.learning_rate_scale, net.momentum, net.decay); } } diff --git a/src/nightmare.c b/src/nightmare.c index ec7166cc..8a05d21d 100644 --- a/src/nightmare.c +++ 
b/src/nightmare.c @@ -52,6 +52,7 @@ void optimize_picture(network *net, image orig, int max_layer, float scale, floa image delta = make_image(im.w, im.h, im.c); network_state state = {0}; + state.net = *net; #ifdef GPU state.input = cuda_make_array(im.data, im.w*im.h*im.c); @@ -142,6 +143,7 @@ void reconstruct_picture(network net, float *features, image recon, image update image delta = make_image(recon.w, recon.h, recon.c); network_state state = {0}; + state.net = net; #ifdef GPU state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c); state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c); @@ -178,6 +180,113 @@ void reconstruct_picture(network net, float *features, image recon, image update } } +/* +void run_lsd(int argc, char **argv) +{ + srand(0); + if(argc < 3){ + fprintf(stderr, "usage: %s %s [cfg] [weights] [image] [options! (optional)]\n", argv[0], argv[1]); + return; + } + + char *cfg = argv[2]; + char *weights = argv[3]; + char *input = argv[4]; + + int norm = find_int_arg(argc, argv, "-norm", 1); + int rounds = find_int_arg(argc, argv, "-rounds", 1); + int iters = find_int_arg(argc, argv, "-iters", 10); + float rate = find_float_arg(argc, argv, "-rate", .04); + float momentum = find_float_arg(argc, argv, "-momentum", .9); + float lambda = find_float_arg(argc, argv, "-lambda", .01); + char *prefix = find_char_arg(argc, argv, "-prefix", 0); + int reconstruct = find_arg(argc, argv, "-reconstruct"); + int smooth_size = find_int_arg(argc, argv, "-smooth", 1); + + network net = parse_network_cfg(cfg); + load_weights(&net, weights); + char *cfgbase = basecfg(cfg); + char *imbase = basecfg(input); + + set_batch_network(&net, 1); + image im = load_image_color(input, 0, 0); + + float *features = 0; + image update; + if (reconstruct){ + im = letterbox_image(im, net.w, net.h); + + int zz = 0; + network_predict(net, im.data); + image out_im = get_network_image(net); + image crop = crop_image(out_im, zz, zz, out_im.w-2*zz, out_im.h-2*zz); + 
//flip_image(crop); + image f_im = resize_image(crop, out_im.w, out_im.h); + free_image(crop); + printf("%d features\n", out_im.w*out_im.h*out_im.c); + + + im = resize_image(im, im.w, im.h); + f_im = resize_image(f_im, f_im.w, f_im.h); + features = f_im.data; + + int i; + for(i = 0; i < 14*14*512; ++i){ + features[i] += rand_uniform(-.19, .19); + } + + free_image(im); + im = make_random_image(im.w, im.h, im.c); + update = make_image(im.w, im.h, im.c); + + } + + int e; + int n; + for(e = 0; e < rounds; ++e){ + fprintf(stderr, "Iteration: "); + fflush(stderr); + for(n = 0; n < iters; ++n){ + fprintf(stderr, "%d, ", n); + fflush(stderr); + if(reconstruct){ + reconstruct_picture(net, features, im, update, rate, momentum, lambda, smooth_size, 1); + //if ((n+1)%30 == 0) rate *= .5; + show_image(im, "reconstruction"); +#ifdef OPENCV + cvWaitKey(10); +#endif + }else{ + int layer = max_layer + rand()%range - range/2; + int octave = rand()%octaves; + optimize_picture(&net, im, layer, 1/pow(1.33333333, octave), rate, thresh, norm); + } + } + fprintf(stderr, "done\n"); + char buff[256]; + if (prefix){ + sprintf(buff, "%s/%s_%s_%d_%06d",prefix, imbase, cfgbase, max_layer, e); + }else{ + sprintf(buff, "%s_%s_%d_%06d",imbase, cfgbase, max_layer, e); + } + printf("%d %s\n", e, buff); + save_image(im, buff); + //show_image(im, buff); + //cvWaitKey(0); + + if(rotate){ + image rot = rotate_image(im, rotate); + free_image(im); + im = rot; + } + image crop = crop_image(im, im.w * (1. 
- zoom)/2., im.h * (1.-zoom)/2., im.w*zoom, im.h*zoom); + image resized = resize_image(crop, im.w, im.h); + free_image(im); + free_image(crop); + im = resized; + } +} +*/ void run_nightmare(int argc, char **argv) { @@ -224,6 +333,7 @@ void run_nightmare(int argc, char **argv) free_image(im); im = resized; } + im = letterbox_image(im, net.w, net.h); float *features = 0; image update; @@ -246,13 +356,11 @@ void run_nightmare(int argc, char **argv) int i; for(i = 0; i < 14*14*512; ++i){ - features[i] += rand_uniform(-.19, .19); + //features[i] += rand_uniform(-.19, .19); } - free_image(im); im = make_random_image(im.w, im.h, im.c); update = make_image(im.w, im.h, im.c); - } int e; diff --git a/src/parser.c b/src/parser.c index 3f39a138..c89d98de 100644 --- a/src/parser.c +++ b/src/parser.c @@ -9,6 +9,7 @@ #include "batchnorm_layer.h" #include "blas.h" #include "connected_layer.h" +#include "deconvolutional_layer.h" #include "convolutional_layer.h" #include "cost_layer.h" #include "crnn_layer.h" @@ -48,6 +49,8 @@ LAYER_TYPE string_to_layer_type(char * type) if (strcmp(type, "[local]")==0) return LOCAL; if (strcmp(type, "[conv]")==0 || strcmp(type, "[convolutional]")==0) return CONVOLUTIONAL; + if (strcmp(type, "[deconv]")==0 + || strcmp(type, "[deconvolutional]")==0) return DECONVOLUTIONAL; if (strcmp(type, "[activation]")==0) return ACTIVE; if (strcmp(type, "[net]")==0 || strcmp(type, "[network]")==0) return NETWORK; @@ -135,6 +138,29 @@ local_layer parse_local(list *options, size_params params) return layer; } +layer parse_deconvolutional(list *options, size_params params) +{ + int n = option_find_int(options, "filters",1); + int size = option_find_int(options, "size",1); + int stride = option_find_int(options, "stride",1); + + char *activation_s = option_find_str(options, "activation", "logistic"); + ACTIVATION activation = get_activation(activation_s); + + int batch,h,w,c; + h = params.h; + w = params.w; + c = params.c; + batch=params.batch; + if(!(h && w && c)) 
error("Layer before deconvolutional layer must output image."); + int batch_normalize = option_find_int_quiet(options, "batch_normalize", 0); + + layer l = make_deconvolutional_layer(batch,h,w,c,n,size,stride,activation, batch_normalize); + + return l; +} + + convolutional_layer parse_convolutional(list *options, size_params params) { int n = option_find_int(options, "filters",1); @@ -312,6 +338,7 @@ cost_layer parse_cost(list *options, size_params params) float scale = option_find_float_quiet(options, "scale",1); cost_layer layer = make_cost_layer(params.batch, params.inputs, type, scale); layer.ratio = option_find_float_quiet(options, "ratio",0); + layer.thresh = option_find_float_quiet(options, "thresh",0); return layer; } @@ -343,6 +370,8 @@ layer parse_reorg(list *options, size_params params) { int stride = option_find_int(options, "stride",1); int reverse = option_find_int_quiet(options, "reverse",0); + int flatten = option_find_int_quiet(options, "flatten",0); + int extra = option_find_int_quiet(options, "extra",0); int batch,h,w,c; h = params.h; @@ -351,7 +380,7 @@ layer parse_reorg(list *options, size_params params) batch=params.batch; if(!(h && w && c)) error("Layer before reorg layer must output image."); - layer layer = make_reorg_layer(batch,w,h,c,stride,reverse); + layer layer = make_reorg_layer(batch,w,h,c,stride,reverse, flatten, extra); return layer; } @@ -508,6 +537,7 @@ void parse_net_options(list *options, network *net) net->decay = option_find_float(options, "decay", .0001); int subdivs = option_find_int(options, "subdivisions",1); net->time_steps = option_find_int_quiet(options, "time_steps",1); + net->notruth = option_find_int_quiet(options, "notruth",0); net->batch /= subdivs; net->batch *= net->time_steps; net->subdivisions = subdivs; @@ -537,6 +567,7 @@ void parse_net_options(list *options, network *net) char *policy_s = option_find_str(options, "policy", "constant"); net->policy = get_policy(policy_s); net->burn_in = 
option_find_int_quiet(options, "burn_in", 0); + net->power = option_find_float_quiet(options, "power", 4); if(net->policy == STEP){ net->step = option_find_int(options, "step", 1); net->scale = option_find_float(options, "scale", 1); @@ -570,7 +601,6 @@ void parse_net_options(list *options, network *net) net->gamma = option_find_float(options, "gamma", 1); net->step = option_find_int(options, "step", 1); } else if (net->policy == POLY || net->policy == RANDOM){ - net->power = option_find_float(options, "power", 1); } net->max_batches = option_find_int(options, "max_batches", 0); } @@ -617,6 +647,8 @@ network parse_network_cfg(char *filename) LAYER_TYPE lt = string_to_layer_type(s->type); if(lt == CONVOLUTIONAL){ l = parse_convolutional(options, params); + }else if(lt == DECONVOLUTIONAL){ + l = parse_deconvolutional(options, params); }else if(lt == LOCAL){ l = parse_local(options, params); }else if(lt == ACTIVE){ @@ -665,8 +697,13 @@ network parse_network_cfg(char *filename) }else{ fprintf(stderr, "Type not recognized: %s\n", s->type); } + l.truth = option_find_int_quiet(options, "truth", 0); + l.onlyforward = option_find_int_quiet(options, "onlyforward", 0); + l.stopbackward = option_find_int_quiet(options, "stopbackward", 0); l.dontload = option_find_int_quiet(options, "dontload", 0); l.dontloadscales = option_find_int_quiet(options, "dontloadscales", 0); + l.learning_rate_scale = option_find_float_quiet(options, "learning_rate", 1); + l.smooth = option_find_float_quiet(options, "smooth", 0); option_unused(options); net.layers[count] = l; if (l.workspace_size > workspace_size) workspace_size = l.workspace_size; @@ -840,7 +877,7 @@ void save_weights_upto(network net, char *filename, int cutoff) int i; for(i = 0; i < net.n && i < cutoff; ++i){ layer l = net.layers[i]; - if(l.type == CONVOLUTIONAL){ + if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){ save_convolutional_weights(l, fp); } if(l.type == CONNECTED){ save_connected_weights(l, fp); @@ -1005,7 
+1042,7 @@ void load_convolutional_weights(layer l, FILE *fp) } -void load_weights_upto(network *net, char *filename, int cutoff) +void load_weights_upto(network *net, char *filename, int start, int cutoff) { #ifdef GPU if(net->gpu_index >= 0){ @@ -1027,10 +1064,10 @@ void load_weights_upto(network *net, char *filename, int cutoff) int transpose = (major > 1000) || (minor > 1000); int i; - for(i = 0; i < net->n && i < cutoff; ++i){ + for(i = start; i < net->n && i < cutoff; ++i){ layer l = net->layers[i]; if (l.dontload) continue; - if(l.type == CONVOLUTIONAL){ + if(l.type == CONVOLUTIONAL || l.type == DECONVOLUTIONAL){ load_convolutional_weights(l, fp); } if(l.type == CONNECTED){ @@ -1075,6 +1112,6 @@ void load_weights_upto(network *net, char *filename, int cutoff) void load_weights(network *net, char *filename) { - load_weights_upto(net, filename, net->n); + load_weights_upto(net, filename, 0, net->n); } diff --git a/src/parser.h b/src/parser.h index 6cff4fb5..473f21a9 100644 --- a/src/parser.h +++ b/src/parser.h @@ -8,6 +8,6 @@ void save_weights(network net, char *filename); void save_weights_upto(network net, char *filename, int cutoff); void save_weights_double(network net, char *filename); void load_weights(network *net, char *filename); -void load_weights_upto(network *net, char *filename, int cutoff); +void load_weights_upto(network *net, char *filename, int start, int cutoff); #endif diff --git a/src/region_layer.c b/src/region_layer.c index f5522c3f..5a3794a5 100644 --- a/src/region_layer.c +++ b/src/region_layer.c @@ -18,6 +18,10 @@ layer make_region_layer(int batch, int w, int h, int n, int classes, int coords) l.batch = batch; l.h = h; l.w = w; + l.c = n*(classes + coords + 1); + l.out_w = l.w; + l.out_h = l.h; + l.out_c = l.c; l.classes = classes; l.coords = coords; l.cost = calloc(1, sizeof(float)); @@ -68,19 +72,19 @@ void resize_region_layer(layer *l, int w, int h) #endif } -box get_region_box(float *x, float *biases, int n, int index, int i, int 
j, int w, int h) +box get_region_box(float *x, float *biases, int n, int index, int i, int j, int w, int h, int stride) { box b; - b.x = (i + logistic_activate(x[index + 0])) / w; - b.y = (j + logistic_activate(x[index + 1])) / h; - b.w = exp(x[index + 2]) * biases[2*n] / w; - b.h = exp(x[index + 3]) * biases[2*n+1] / h; + b.x = (i + x[index + 0*stride]) / w; + b.y = (j + x[index + 1*stride]) / h; + b.w = exp(x[index + 2*stride]) * biases[2*n] / w; + b.h = exp(x[index + 3*stride]) * biases[2*n+1] / h; return b; } -float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale) +float delta_region_box(box truth, float *x, float *biases, int n, int index, int i, int j, int w, int h, float *delta, float scale, int stride) { - box pred = get_region_box(x, biases, n, index, i, j, w, h); + box pred = get_region_box(x, biases, n, index, i, j, w, h, stride); float iou = box_iou(pred, truth); float tx = (truth.x*w - i); @@ -88,34 +92,34 @@ float delta_region_box(box truth, float *x, float *biases, int n, int index, int float tw = log(truth.w*w / biases[2*n]); float th = log(truth.h*h / biases[2*n + 1]); - delta[index + 0] = scale * (tx - logistic_activate(x[index + 0])) * logistic_gradient(logistic_activate(x[index + 0])); - delta[index + 1] = scale * (ty - logistic_activate(x[index + 1])) * logistic_gradient(logistic_activate(x[index + 1])); - delta[index + 2] = scale * (tw - x[index + 2]); - delta[index + 3] = scale * (th - x[index + 3]); + delta[index + 0*stride] = scale * (tx - x[index + 0*stride]); + delta[index + 1*stride] = scale * (ty - x[index + 1*stride]); + delta[index + 2*stride] = scale * (tw - x[index + 2*stride]); + delta[index + 3*stride] = scale * (th - x[index + 3*stride]); return iou; } -void delta_region_class(float *output, float *delta, int index, int class, int classes, tree *hier, float scale, float *avg_cat) +void delta_region_class(float *output, float *delta, int index, int class, 
int classes, tree *hier, float scale, int stride, float *avg_cat) { int i, n; if(hier){ float pred = 1; while(class >= 0){ - pred *= output[index + class]; + pred *= output[index + stride*class]; int g = hier->group[class]; int offset = hier->group_offset[g]; for(i = 0; i < hier->group_size[g]; ++i){ - delta[index + offset + i] = scale * (0 - output[index + offset + i]); + delta[index + stride*(offset + i)] = scale * (0 - output[index + stride*(offset + i)]); } - delta[index + class] = scale * (1 - output[index + class]); + delta[index + stride*class] = scale * (1 - output[index + stride*class]); class = hier->parent[class]; } *avg_cat += pred; } else { for(n = 0; n < classes; ++n){ - delta[index + n] = scale * (((n == class)?1 : 0) - output[index + n]); - if(n == class) *avg_cat += output[index + n]; + delta[index + stride*n] = scale * (((n == class)?1 : 0) - output[index + stride*n]); + if(n == class) *avg_cat += output[index + stride*n]; } } } @@ -130,42 +134,35 @@ float tisnan(float x) return (x != x); } +int entry_index(layer l, int batch, int location, int entry) +{ + int n = location / (l.w*l.h); + int loc = location % (l.w*l.h); + return batch*l.outputs + n*l.w*l.h*(l.coords+l.classes+1) + entry*l.w*l.h + loc; +} + void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output); void forward_region_layer(const layer l, network_state state) { int i,j,b,t,n; - int size = l.coords + l.classes + 1; memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float)); -#ifndef GPU - flatten(l.output, l.w*l.h, size*l.n, l.batch, 1); -#endif - for (b = 0; b < l.batch; ++b){ - for(i = 0; i < l.h*l.w*l.n; ++i){ - int index = size*i + b*l.outputs; - l.output[index + 4] = logistic_activate(l.output[index + 4]); - } - } - #ifndef GPU if (l.softmax_tree){ - for (b = 0; b < l.batch; ++b){ - for(i = 0; i < l.h*l.w*l.n; ++i){ - int index = size*i + b*l.outputs; - softmax_tree(l.output + index + 5, 1, 0, 1, l.softmax_tree, l.output + index + 5); 
- } + int i; + int count = 5; + for (i = 0; i < l.softmax_tree->groups; ++i) { + int group_size = l.softmax_tree->group_size[i]; + softmax_cpu(state.input + count, group_size, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + count); + count += group_size; } } else if (l.softmax){ - for (b = 0; b < l.batch; ++b){ - for(i = 0; i < l.h*l.w*l.n; ++i){ - int index = size*i + b*l.outputs; - softmax(l.output + index + 5, l.classes, 1, l.output + index + 5); - } - } + softmax_cpu(state.input + 5, l.classes, l.batch, l.inputs, l.n*l.w*l.h, 1, l.n*l.w*l.h, l.temperature, l.output + 5); } #endif - if(!state.train) return; + memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); + if(!state.train) return; float avg_iou = 0; float recall = 0; float avg_cat = 0; @@ -178,26 +175,28 @@ void forward_region_layer(const layer l, network_state state) if(l.softmax_tree){ int onlyclass = 0; for(t = 0; t < 30; ++t){ - box truth = float_to_box(state.truth + t*5 + b*l.truths); + box truth = float_to_box(state.truth + t*5 + b*l.truths, 1); if(!truth.x) break; int class = state.truth[t*5 + b*l.truths + 4]; float maxp = 0; int maxi = 0; if(truth.x > 100000 && truth.y > 100000){ for(n = 0; n < l.n*l.w*l.h; ++n){ - int index = size*n + b*l.outputs + 5; - float scale = l.output[index-1]; - l.delta[index - 1] = l.noobject_scale * ((0 - l.output[index - 1]) * logistic_gradient(l.output[index - 1])); - float p = scale*get_hierarchy_probability(l.output + index, l.softmax_tree, class); + int class_index = entry_index(l, b, n, 5); + int obj_index = entry_index(l, b, n, 4); + float scale = l.output[obj_index]; + l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]); + float p = scale*get_hierarchy_probability(l.output + class_index, l.softmax_tree, class, l.w*l.h); if(p > maxp){ maxp = p; maxi = n; } } - int index = size*maxi + b*l.outputs + 5; - delta_region_class(l.output, l.delta, index, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat); - 
if(l.output[index - 1] < .3) l.delta[index - 1] = l.object_scale * ((.3 - l.output[index - 1]) * logistic_gradient(l.output[index - 1])); - else l.delta[index - 1] = 0; + int class_index = entry_index(l, b, maxi, 5); + int obj_index = entry_index(l, b, maxi, 4); + delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat); + if(l.output[obj_index] < .3) l.delta[obj_index] = l.object_scale * (.3 - l.output[obj_index]); + else l.delta[obj_index] = 0; ++class_count; onlyclass = 1; break; @@ -208,21 +207,22 @@ void forward_region_layer(const layer l, network_state state) for (j = 0; j < l.h; ++j) { for (i = 0; i < l.w; ++i) { for (n = 0; n < l.n; ++n) { - int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs; - box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h); + int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); + box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h); float best_iou = 0; for(t = 0; t < 30; ++t){ - box truth = float_to_box(state.truth + t*5 + b*l.truths); + box truth = float_to_box(state.truth + t*5 + b*l.truths, 1); if(!truth.x) break; float iou = box_iou(pred, truth); if (iou > best_iou) { best_iou = iou; } } - avg_anyobj += l.output[index + 4]; - l.delta[index + 4] = l.noobject_scale * ((0 - l.output[index + 4]) * logistic_gradient(l.output[index + 4])); + int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); + avg_anyobj += l.output[obj_index]; + l.delta[obj_index] = l.noobject_scale * (0 - l.output[obj_index]); if (best_iou > l.thresh) { - l.delta[index + 4] = 0; + l.delta[obj_index] = 0; } if(*(state.net.seen) < 12800){ @@ -231,17 +231,16 @@ void forward_region_layer(const layer l, network_state state) truth.y = (j + .5)/l.h; truth.w = l.biases[2*n]/l.w; truth.h = l.biases[2*n+1]/l.h; - delta_region_box(truth, l.output, l.biases, n, index, i, j, l.w, l.h, l.delta, .01); + delta_region_box(truth, l.output, l.biases, n, 
box_index, i, j, l.w, l.h, l.delta, .01, l.w*l.h); } } } } for(t = 0; t < 30; ++t){ - box truth = float_to_box(state.truth + t*5 + b*l.truths); + box truth = float_to_box(state.truth + t*5 + b*l.truths, 1); if(!truth.x) break; float best_iou = 0; - int best_index = 0; int best_n = 0; i = (truth.x * l.w); j = (truth.y * l.h); @@ -251,8 +250,8 @@ void forward_region_layer(const layer l, network_state state) truth_shift.y = 0; //printf("index %d %d\n",i, j); for(n = 0; n < l.n; ++n){ - int index = size*(j*l.w*l.n + i*l.n + n) + b*l.outputs; - box pred = get_region_box(l.output, l.biases, n, index, i, j, l.w, l.h); + int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); + box pred = get_region_box(l.output, l.biases, n, box_index, i, j, l.w, l.h, l.w*l.h); if(l.bias_match){ pred.w = l.biases[2*n]/l.w; pred.h = l.biases[2*n+1]/l.h; @@ -262,80 +261,118 @@ void forward_region_layer(const layer l, network_state state) pred.y = 0; float iou = box_iou(pred, truth_shift); if (iou > best_iou){ - best_index = index; best_iou = iou; best_n = n; } } //printf("%d %f (%f, %f) %f x %f\n", best_n, best_iou, truth.x, truth.y, truth.w, truth.h); - float iou = delta_region_box(truth, l.output, l.biases, best_n, best_index, i, j, l.w, l.h, l.delta, l.coord_scale); + int box_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 0); + float iou = delta_region_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, l.delta, l.coord_scale * (2 - truth.w*truth.h), l.w*l.h); if(iou > .5) recall += 1; avg_iou += iou; //l.delta[best_index + 4] = iou - l.output[best_index + 4]; - avg_obj += l.output[best_index + 4]; - l.delta[best_index + 4] = l.object_scale * (1 - l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]); + int obj_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 4); + avg_obj += l.output[obj_index]; + l.delta[obj_index] = l.object_scale * (1 - l.output[obj_index]); if (l.rescore) { - l.delta[best_index + 4] = l.object_scale * (iou - 
l.output[best_index + 4]) * logistic_gradient(l.output[best_index + 4]); + l.delta[obj_index] = l.object_scale * (iou - l.output[obj_index]); } - int class = state.truth[t*5 + b*l.truths + 4]; if (l.map) class = l.map[class]; - delta_region_class(l.output, l.delta, best_index + 5, class, l.classes, l.softmax_tree, l.class_scale, &avg_cat); + int class_index = entry_index(l, b, best_n*l.w*l.h + j*l.w + i, 5); + delta_region_class(l.output, l.delta, class_index, class, l.classes, l.softmax_tree, l.class_scale, l.w*l.h, &avg_cat); ++count; ++class_count; } } //printf("\n"); -#ifndef GPU - flatten(l.delta, l.w*l.h, size*l.n, l.batch, 0); -#endif *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); printf("Region Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, Avg Recall: %f, count: %d\n", avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, count); } void backward_region_layer(const layer l, network_state state) { - axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1); + /* + int b; + int size = l.coords + l.classes + 1; + for (b = 0; b < l.batch*l.n; ++b){ + int index = (b*size + 4)*l.w*l.h; + gradient_array(l.output + index, l.w*l.h, LOGISTIC, l.delta + index); + } + axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, state.delta, 1); + */ } void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *boxes, int only_objectness, int *map, float tree_thresh) { - int i,j,n; + int i,j,n,z; float *predictions = l.output; + if (l.batch == 2) { + float *flip = l.output + l.outputs; + for (j = 0; j < l.h; ++j) { + for (i = 0; i < l.w/2; ++i) { + for (n = 0; n < l.n; ++n) { + for(z = 0; z < l.classes + 5; ++z){ + int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i; + int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1); + float swap = flip[i1]; + flip[i1] = flip[i2]; + flip[i2] = swap; + if(z == 0){ + flip[i1] = -flip[i1]; + flip[i2] = -flip[i2]; + } + } + } + } + } + for(i = 0; i < l.outputs; ++i){ + 
l.output[i] = (l.output[i] + flip[i])/2.; + } + } for (i = 0; i < l.w*l.h; ++i){ int row = i / l.w; int col = i % l.w; for(n = 0; n < l.n; ++n){ - int index = i*l.n + n; - int p_index = index * (l.classes + 5) + 4; - float scale = predictions[p_index]; - int box_index = index * (l.classes + 5); - boxes[index] = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h); + int index = n*l.w*l.h + i; + int obj_index = entry_index(l, 0, n*l.w*l.h + i, 4); + int box_index = entry_index(l, 0, n*l.w*l.h + i, 0); + float scale = predictions[obj_index]; + boxes[index] = get_region_box(predictions, l.biases, n, box_index, col, row, l.w, l.h, l.w*l.h); + if(1){ + int max = w > h ? w : h; + boxes[index].x = (boxes[index].x - (max - w)/2./max) / ((float)w/max); + boxes[index].y = (boxes[index].y - (max - h)/2./max) / ((float)h/max); + boxes[index].w *= (float)max/w; + boxes[index].h *= (float)max/h; + } boxes[index].x *= w; boxes[index].y *= h; boxes[index].w *= w; boxes[index].h *= h; - int class_index = index * (l.classes + 5) + 5; + int class_index = entry_index(l, 0, n*l.w*l.h + i, 5); if(l.softmax_tree){ - hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0); + hierarchy_predictions(predictions + class_index, l.classes, l.softmax_tree, 0, l.w*l.h); if(map){ for(j = 0; j < 200; ++j){ - float prob = scale*predictions[class_index+map[j]]; + int class_index = entry_index(l, 0, n*l.w*l.h + i, 5 + map[j]); + float prob = scale*predictions[class_index]; probs[index][j] = (prob > thresh) ? prob : 0; } } else { - int j = hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh); + int j = hierarchy_top_prediction(predictions + class_index, l.softmax_tree, tree_thresh, l.w*l.h); probs[index][j] = (scale > thresh) ? 
scale : 0; probs[index][l.classes] = scale; } } else { for(j = 0; j < l.classes; ++j){ - float prob = scale*predictions[class_index+j]; + int class_index = entry_index(l, 0, n*l.w*l.h + i, 5 + j); + float prob = scale*predictions[class_index]; probs[index][j] = (prob > thresh) ? prob : 0; } } @@ -350,23 +387,33 @@ void get_region_boxes(layer l, int w, int h, float thresh, float **probs, box *b void forward_region_layer_gpu(const layer l, network_state state) { - /* - if(!state.train){ - copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1); - return; - } - */ - flatten_ongpu(state.input, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 1, l.output_gpu); - if(l.softmax_tree){ + copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1); + int b, n; + for (b = 0; b < l.batch; ++b){ + for(n = 0; n < l.n; ++n){ + int index = entry_index(l, b, n*l.w*l.h, 0); + activate_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); + index = entry_index(l, b, n*l.w*l.h, 4); + activate_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC); + } + } + if (l.softmax_tree){ int i; int count = 5; for (i = 0; i < l.softmax_tree->groups; ++i) { int group_size = l.softmax_tree->group_size[i]; - softmax_gpu(l.output_gpu+count, group_size, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + count); + int index = entry_index(l, 0, 0, count); + softmax_gpu(state.input + index, group_size, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu + index); count += group_size; } - }else if (l.softmax){ - softmax_gpu(l.output_gpu+5, l.classes, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + 5); + } else if (l.softmax) { + int index = entry_index(l, 0, 0, 5); + //printf("%d\n", index); + softmax_gpu(state.input + index, l.classes, l.batch*l.n, l.inputs/l.n, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu + index); + } + if(!state.train || l.onlyforward){ + cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs); + return; } float *in_cpu = calloc(l.batch*l.inputs, 
sizeof(float)); @@ -382,16 +429,25 @@ void forward_region_layer_gpu(const layer l, network_state state) cpu_state.truth = truth_cpu; cpu_state.input = in_cpu; forward_region_layer(l, cpu_state); - //cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs); + cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs); free(cpu_state.input); if(!state.train) return; cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs); if(cpu_state.truth) free(cpu_state.truth); } -void backward_region_layer_gpu(layer l, network_state state) +void backward_region_layer_gpu(const layer l, network_state state) { - flatten_ongpu(l.delta_gpu, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 0, state.delta); + int b, n; + for (b = 0; b < l.batch; ++b){ + for(n = 0; n < l.n; ++n){ + int index = entry_index(l, b, n*l.w*l.h, 0); + gradient_array_ongpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC, l.delta_gpu + index); + index = entry_index(l, b, n*l.w*l.h, 4); + gradient_array_ongpu(l.output_gpu + index, l.w*l.h, LOGISTIC, l.delta_gpu + index); + } + } + axpy_ongpu(l.batch*l.inputs, 1, l.delta_gpu, 1, state.delta, 1); } #endif diff --git a/src/regressor.c b/src/regressor.c new file mode 100644 index 00000000..4950e747 --- /dev/null +++ b/src/regressor.c @@ -0,0 +1,261 @@ +#include "network.h" +#include "utils.h" +#include "parser.h" +#include "option_list.h" +#include "blas.h" +#include "assert.h" +#include "cuda.h" +#include + +#ifdef OPENCV +#include "opencv2/highgui/highgui_c.h" +image get_image_from_stream(CvCapture *cap); +#endif + +void train_regressor(char *datacfg, char *cfgfile, char *weightfile, int *gpus, int ngpus, int clear) +{ + int i; + + float avg_loss = -1; + char *base = basecfg(cfgfile); + printf("%s\n", base); + printf("%d\n", ngpus); + network *nets = calloc(ngpus, sizeof(network)); + + srand(time(0)); + int seed = rand(); + for(i = 0; i < ngpus; ++i){ + srand(seed); +#ifdef GPU + cuda_set_device(gpus[i]); +#endif + nets[i] = parse_network_cfg(cfgfile); + 
if(weightfile){ + load_weights(&nets[i], weightfile); + } + if(clear) *nets[i].seen = 0; + nets[i].learning_rate *= ngpus; + } + srand(time(0)); + network net = nets[0]; + + int imgs = net.batch * net.subdivisions * ngpus; + + printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); + list *options = read_data_cfg(datacfg); + + char *backup_directory = option_find_str(options, "backup", "/backup/"); + char *train_list = option_find_str(options, "train", "data/train.list"); + + list *plist = get_paths(train_list); + char **paths = (char **)list_to_array(plist); + printf("%d\n", plist->size); + int N = plist->size; + clock_t time; + + load_args args = {0}; + args.w = net.w; + args.h = net.h; + args.threads = 32; + + args.min = net.min_crop; + args.max = net.max_crop; + args.angle = net.angle; + args.aspect = net.aspect; + args.exposure = net.exposure; + args.saturation = net.saturation; + args.hue = net.hue; + args.size = net.w; + + args.paths = paths; + args.n = imgs; + args.m = N; + args.type = REGRESSION_DATA; + + data train; + data buffer; + pthread_t load_thread; + args.d = &buffer; + load_thread = load_data(args); + + int epoch = (*net.seen)/N; + while(get_current_batch(net) < net.max_batches || net.max_batches == 0){ + time=clock(); + + pthread_join(load_thread, 0); + train = buffer; + load_thread = load_data(args); + + printf("Loaded: %lf seconds\n", sec(clock()-time)); + time=clock(); + + float loss = 0; +#ifdef GPU + if(ngpus == 1){ + loss = train_network(net, train); + } else { + loss = train_networks(nets, ngpus, train, 4); + } +#else + loss = train_network(net, train); +#endif + if(avg_loss == -1) avg_loss = loss; + avg_loss = avg_loss*.9 + loss*.1; + printf("%d, %.3f: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), (float)(*net.seen)/N, loss, avg_loss, get_current_rate(net), sec(clock()-time), *net.seen); + free_data(train); + if(*net.seen/N > epoch){ + epoch = *net.seen/N; + char 
buff[256]; + sprintf(buff, "%s/%s_%d.weights",backup_directory,base, epoch); + save_weights(net, buff); + } + if(get_current_batch(net)%100 == 0){ + char buff[256]; + sprintf(buff, "%s/%s.backup",backup_directory,base); + save_weights(net, buff); + } + } + char buff[256]; + sprintf(buff, "%s/%s.weights", backup_directory, base); + save_weights(net, buff); + + free_network(net); + free_ptrs((void**)paths, plist->size); + free_list(plist); + free(base); +} + +void predict_regressor(char *cfgfile, char *weightfile, char *filename) +{ + network net = parse_network_cfg(cfgfile); + if(weightfile){ + load_weights(&net, weightfile); + } + set_batch_network(&net, 1); + srand(2222222); + + clock_t time; + char buff[256]; + char *input = buff; + while(1){ + if(filename){ + strncpy(input, filename, 256); + }else{ + printf("Enter Image Path: "); + fflush(stdout); + input = fgets(input, 256, stdin); + if(!input) return; + strtok(input, "\n"); + } + image im = load_image_color(input, 0, 0); + image sized = letterbox_image(im, net.w, net.h); + + float *X = sized.data; + time=clock(); + float *predictions = network_predict(net, X); + printf("Predicted: %f\n", predictions[0]); + printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time)); + free_image(im); + free_image(sized); + if (filename) break; + } +} + + +void demo_regressor(char *datacfg, char *cfgfile, char *weightfile, int cam_index, const char *filename) +{ +#ifdef OPENCV + printf("Regressor Demo\n"); + network net = parse_network_cfg(cfgfile); + if(weightfile){ + load_weights(&net, weightfile); + } + set_batch_network(&net, 1); + + srand(2222222); + CvCapture * cap; + + if(filename){ + cap = cvCaptureFromFile(filename); + }else{ + cap = cvCaptureFromCAM(cam_index); + } + + if(!cap) error("Couldn't connect to webcam.\n"); + cvNamedWindow("Regressor", CV_WINDOW_NORMAL); + cvResizeWindow("Regressor", 512, 512); + float fps = 0; + int i; + + while(1){ + struct timeval tval_before, tval_after, tval_result; + 
gettimeofday(&tval_before, NULL); + + image in = get_image_from_stream(cap); + image in_s = letterbox_image(in, net.w, net.h); + show_image(in, "Regressor"); + + float *predictions = network_predict(net, in_s.data); + + printf("\033[2J"); + printf("\033[1;1H"); + printf("\nFPS:%.0f\n",fps); + + printf("People: %f\n", predictions[0]); + + free_image(in_s); + free_image(in); + + cvWaitKey(10); + + gettimeofday(&tval_after, NULL); + timersub(&tval_after, &tval_before, &tval_result); + float curr = 1000000.f/((long int)tval_result.tv_usec); + fps = .9*fps + .1*curr; + } +#endif +} + + +void run_regressor(int argc, char **argv) +{ + if(argc < 4){ + fprintf(stderr, "usage: %s %s [train/test/valid] [cfg] [weights (optional)]\n", argv[0], argv[1]); + return; + } + + char *gpu_list = find_char_arg(argc, argv, "-gpus", 0); + int *gpus = 0; + int gpu = 0; + int ngpus = 0; + if(gpu_list){ + printf("%s\n", gpu_list); + int len = strlen(gpu_list); + ngpus = 1; + int i; + for(i = 0; i < len; ++i){ + if (gpu_list[i] == ',') ++ngpus; + } + gpus = calloc(ngpus, sizeof(int)); + for(i = 0; i < ngpus; ++i){ + gpus[i] = atoi(gpu_list); + gpu_list = strchr(gpu_list, ',')+1; + } + } else { + gpu = gpu_index; + gpus = &gpu; + ngpus = 1; + } + + int cam_index = find_int_arg(argc, argv, "-c", 0); + int clear = find_arg(argc, argv, "-clear"); + char *data = argv[3]; + char *cfg = argv[4]; + char *weights = (argc > 5) ? argv[5] : 0; + char *filename = (argc > 6) ? 
argv[6]: 0; + if(0==strcmp(argv[2], "test")) predict_regressor(data, cfg, weights); + else if(0==strcmp(argv[2], "train")) train_regressor(data, cfg, weights, gpus, ngpus, clear); + else if(0==strcmp(argv[2], "demo")) demo_regressor(data, cfg, weights, cam_index, filename); +} + + diff --git a/src/reorg_layer.c b/src/reorg_layer.c index 2abca8fa..29ccc0e5 100644 --- a/src/reorg_layer.c +++ b/src/reorg_layer.c @@ -4,15 +4,17 @@ #include -layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse) +layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse, int flatten, int extra) { layer l = {0}; l.type = REORG; l.batch = batch; l.stride = stride; + l.extra = extra; l.h = h; l.w = w; l.c = c; + l.flatten = flatten; if(reverse){ l.out_w = w*stride; l.out_h = h*stride; @@ -23,10 +25,20 @@ layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse) l.out_c = c*(stride*stride); } l.reverse = reverse; - fprintf(stderr, "reorg /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); + l.outputs = l.out_h * l.out_w * l.out_c; l.inputs = h*w*c; - int output_size = l.out_h * l.out_w * l.out_c * batch; + if(l.extra){ + l.out_w = l.out_h = l.out_c = 0; + l.outputs = l.inputs + l.extra; + } + + if(extra){ + fprintf(stderr, "reorg %4d -> %4d\n", l.inputs, l.outputs); + } else { + fprintf(stderr, "reorg /%2d %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); + } + int output_size = l.outputs * batch; l.output = calloc(output_size, sizeof(float)); l.delta = calloc(output_size, sizeof(float)); @@ -77,17 +89,41 @@ void resize_reorg_layer(layer *l, int w, int h) void forward_reorg_layer(const layer l, network_state state) { - if(l.reverse){ + int i; + if(l.flatten){ + memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float)); + if(l.reverse){ + flatten(l.output, l.w*l.h, l.c, l.batch, 0); + }else{ + flatten(l.output, l.w*l.h, l.c, l.batch, 1); + } + } else if 
(l.extra) { + for(i = 0; i < l.batch; ++i){ + copy_cpu(l.inputs, state.input + i*l.inputs, 1, l.output + i*l.outputs, 1); + } + } else if (l.reverse){ reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output); - }else { + } else { reorg_cpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output); } } void backward_reorg_layer(const layer l, network_state state) { - if(l.reverse){ + int i; + if(l.flatten){ + memcpy(state.delta, l.delta, l.outputs*l.batch*sizeof(float)); + if(l.reverse){ + flatten(state.delta, l.w*l.h, l.c, l.batch, 1); + }else{ + flatten(state.delta, l.w*l.h, l.c, l.batch, 0); + } + } else if(l.reverse){ reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 0, state.delta); + } else if (l.extra) { + for(i = 0; i < l.batch; ++i){ + copy_cpu(l.inputs, l.delta + i*l.outputs, 1, state.delta + i*l.inputs, 1); + } }else{ reorg_cpu(l.delta, l.w, l.h, l.c, l.batch, l.stride, 1, state.delta); } @@ -96,7 +132,18 @@ void backward_reorg_layer(const layer l, network_state state) #ifdef GPU void forward_reorg_layer_gpu(layer l, network_state state) { - if(l.reverse){ + int i; + if(l.flatten){ + if(l.reverse){ + flatten_ongpu(state.input, l.w*l.h, l.c, l.batch, 0, l.output_gpu); + }else{ + flatten_ongpu(state.input, l.w*l.h, l.c, l.batch, 1, l.output_gpu); + } + } else if (l.extra) { + for(i = 0; i < l.batch; ++i){ + copy_ongpu(l.inputs, state.input + i*l.inputs, 1, l.output_gpu + i*l.outputs, 1); + } + } else if (l.reverse) { reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.output_gpu); }else { reorg_ongpu(state.input, l.w, l.h, l.c, l.batch, l.stride, 0, l.output_gpu); @@ -105,9 +152,20 @@ void forward_reorg_layer_gpu(layer l, network_state state) void backward_reorg_layer_gpu(layer l, network_state state) { - if(l.reverse){ + if(l.flatten){ + if(l.reverse){ + flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 1, state.delta); + }else{ + flatten_ongpu(l.delta_gpu, l.w*l.h, l.c, l.batch, 0, state.delta); + } + } else if (l.extra) { + 
int i; + for(i = 0; i < l.batch; ++i){ + copy_ongpu(l.inputs, l.delta_gpu + i*l.outputs, 1, state.delta + i*l.inputs, 1); + } + } else if(l.reverse){ reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, state.delta); - }else{ + } else { reorg_ongpu(l.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, state.delta); } } diff --git a/src/reorg_layer.h b/src/reorg_layer.h index 21c22cd8..6b9c3040 100644 --- a/src/reorg_layer.h +++ b/src/reorg_layer.h @@ -6,7 +6,7 @@ #include "layer.h" #include "network.h" -layer make_reorg_layer(int batch, int h, int w, int c, int stride, int reverse); +layer make_reorg_layer(int batch, int w, int h, int c, int stride, int reverse, int flatten, int extra); void resize_reorg_layer(layer *l, int w, int h); void forward_reorg_layer(const layer l, network_state state); void backward_reorg_layer(const layer l, network_state state); diff --git a/src/softmax_layer.c b/src/softmax_layer.c index 5d153148..88f032fc 100644 --- a/src/softmax_layer.c +++ b/src/softmax_layer.c @@ -32,40 +32,24 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups) return l; } -void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output) -{ - int b; - for(b = 0; b < batch; ++b){ - int i; - int count = 0; - for(i = 0; i < hierarchy->groups; ++i){ - int group_size = hierarchy->group_size[i]; - softmax(input+b*inputs + count, group_size, temp, output+b*inputs + count); - count += group_size; - } - } -} - void forward_softmax_layer(const softmax_layer l, network_state state) { - int b; - int inputs = l.inputs / l.groups; - int batch = l.batch * l.groups; if(l.softmax_tree){ - softmax_tree(state.input, batch, inputs, l.temperature, l.softmax_tree, l.output); - } else { - for(b = 0; b < batch; ++b){ - softmax(state.input+b*inputs, inputs, l.temperature, l.output+b*inputs); + int i; + int count = 0; + for (i = 0; i < l.softmax_tree->groups; ++i) { + int group_size = l.softmax_tree->group_size[i]; + 
softmax_cpu(state.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output + count); + count += group_size; } + } else { + softmax_cpu(state.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output); } } void backward_softmax_layer(const softmax_layer l, network_state state) { - int i; - for(i = 0; i < l.inputs*l.batch; ++i){ - state.delta[i] += l.delta[i]; - } + axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, state.delta, 1); } #ifdef GPU @@ -77,18 +61,16 @@ void pull_softmax_layer_output(const softmax_layer layer) void forward_softmax_layer_gpu(const softmax_layer l, network_state state) { - int inputs = l.inputs / l.groups; - int batch = l.batch * l.groups; if(l.softmax_tree){ int i; int count = 0; for (i = 0; i < l.softmax_tree->groups; ++i) { int group_size = l.softmax_tree->group_size[i]; - softmax_gpu(state.input+count, group_size, inputs, batch, l.temperature, l.output_gpu + count); + softmax_gpu(state.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count); count += group_size; } } else { - softmax_gpu(state.input, inputs, inputs, batch, l.temperature, l.output_gpu); + softmax_gpu(state.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu); } } diff --git a/src/super.c b/src/super.c index 63e9860a..5583ac26 100644 --- a/src/super.c +++ b/src/super.c @@ -7,7 +7,7 @@ #include "opencv2/highgui/highgui_c.h" #endif -void train_super(char *cfgfile, char *weightfile) +void train_super(char *cfgfile, char *weightfile, int clear) { char *train_images = "/data/imagenet/imagenet1k.train.list"; char *backup_directory = "/home/pjreddie/backup/"; @@ -19,6 +19,7 @@ void train_super(char *cfgfile, char *weightfile) if(weightfile){ load_weights(&net, weightfile); } + if(clear) *net.seen = 0; printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay); int imgs = net.batch*net.subdivisions; 
int i = *net.seen/imgs; @@ -123,7 +124,8 @@ void run_super(int argc, char **argv) char *cfg = argv[3]; char *weights = (argc > 4) ? argv[4] : 0; char *filename = (argc > 5) ? argv[5] : 0; - if(0==strcmp(argv[2], "train")) train_super(cfg, weights); + int clear = find_arg(argc, argv, "-clear"); + if(0==strcmp(argv[2], "train")) train_super(cfg, weights, clear); else if(0==strcmp(argv[2], "test")) test_super(cfg, weights, filename); /* else if(0==strcmp(argv[2], "valid")) validate_super(cfg, weights); diff --git a/src/tree.c b/src/tree.c index dd44515c..f36ca3a1 100644 --- a/src/tree.c +++ b/src/tree.c @@ -24,33 +24,33 @@ void change_leaves(tree *t, char *leaf_list) fprintf(stderr, "Found %d leaves.\n", found); } -float get_hierarchy_probability(float *x, tree *hier, int c) +float get_hierarchy_probability(float *x, tree *hier, int c, int stride) { float p = 1; while(c >= 0){ - p = p * x[c]; + p = p * x[c*stride]; c = hier->parent[c]; } return p; } -void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves) +void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves, int stride) { int j; for(j = 0; j < n; ++j){ int parent = hier->parent[j]; if(parent >= 0){ - predictions[j] *= predictions[parent]; + predictions[j*stride] *= predictions[parent*stride]; } } if(only_leaves){ for(j = 0; j < n; ++j){ - if(!hier->leaf[j]) predictions[j] = 0; + if(!hier->leaf[j]) predictions[j*stride] = 0; } } } -int hierarchy_top_prediction(float *predictions, tree *hier, float thresh) +int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride) { float p = 1; int group = 0; @@ -61,7 +61,7 @@ int hierarchy_top_prediction(float *predictions, tree *hier, float thresh) for(i = 0; i < hier->group_size[group]; ++i){ int index = i + hier->group_offset[group]; - float val = predictions[i + hier->group_offset[group]]; + float val = predictions[(i + hier->group_offset[group])*stride]; if(val > max){ max_i = index; max = val; 
diff --git a/src/tree.h b/src/tree.h index dbd4c394..2e9c8126 100644 --- a/src/tree.h +++ b/src/tree.h @@ -15,9 +15,9 @@ typedef struct{ } tree; tree *read_tree(char *filename); -void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves); +void hierarchy_predictions(float *predictions, int n, tree *hier, int only_leaves, int stride); void change_leaves(tree *t, char *leaf_list); -int hierarchy_top_prediction(float *predictions, tree *hier, float thresh); -float get_hierarchy_probability(float *x, tree *hier, int c); +int hierarchy_top_prediction(float *predictions, tree *hier, float thresh, int stride); +float get_hierarchy_probability(float *x, tree *hier, int c, int stride); #endif