From 62b781af4d01fc5f074407590cf556b36c70e837 Mon Sep 17 00:00:00 2001 From: Joseph Redmon Date: Mon, 2 Oct 2017 15:17:48 -0700 Subject: [PATCH] resnet that works --- Makefile | 4 +- cfg/msr_34.cfg | 366 ------------------- cfg/msr_50.cfg | 558 ----------------------------- cfg/{msr_152.cfg => resnet152.cfg} | 353 ++++++------------ examples/classifier.c | 3 +- examples/darknet.c | 49 +-- examples/detector.c | 20 +- examples/rnn.c | 30 ++ examples/segmenter.c | 2 +- src/activation_kernels.cu | 42 +-- src/blas_kernels.cu | 26 +- src/convolutional_kernels.cu | 8 +- src/crop_layer_kernels.cu | 26 +- src/data.c | 9 +- src/demo.c | 4 - src/image.c | 90 ++--- src/image.h | 6 - src/utils.h | 9 +- 18 files changed, 294 insertions(+), 1311 deletions(-) delete mode 100644 cfg/msr_34.cfg delete mode 100644 cfg/msr_50.cfg rename cfg/{msr_152.cfg => resnet152.cfg} (92%) diff --git a/Makefile b/Makefile index b0a9f1ce..b61f1c60 100644 --- a/Makefile +++ b/Makefile @@ -4,11 +4,11 @@ OPENCV=0 OPENMP=0 DEBUG=0 -ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \ - -gencode arch=compute_30,code=sm_30 \ +ARCH= -gencode arch=compute_30,code=sm_30 \ -gencode arch=compute_35,code=sm_35 \ -gencode arch=compute_50,code=[sm_50,compute_50] \ -gencode arch=compute_52,code=[sm_52,compute_52] +# -gencode arch=compute_20,code=[sm_20,sm_21] \ This one is deprecated? # This is what I use, uncomment if you know your arch and want to specify # ARCH= -gencode arch=compute_52,code=compute_52 diff --git a/cfg/msr_34.cfg b/cfg/msr_34.cfg deleted file mode 100644 index 5ae23cf5..00000000 --- a/cfg/msr_34.cfg +++ /dev/null @@ -1,366 +0,0 @@ -[net] -batch=128 -subdivisions=1 -height=256 -width=256 -channels=3 -momentum=0.9 -decay=0.0005 - -learning_rate=0.1 -policy=poly -power=4 -max_batches=500000 - -[crop] -crop_height=224 -crop_width=224 -flip=1 -saturation=1 -exposure=1 -angle=0 - -[convolutional] -batch_normalize=1 -filters=64 -size=7 -stride=2 -pad=1 -activation=leaky - -[maxpool] -size=3 -stride=2 - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - - - - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=2 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - - - - - - -[convolutional] -batch_normalize=1 -filters=256 
-size=3 -stride=2 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - - - - - - - - - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=2 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[shortcut] -from = -3 - -[avgpool] - -[connected] -output=1000 -activation=leaky - -[softmax] -groups=1 - -[cost] -type=sse - diff --git a/cfg/msr_50.cfg b/cfg/msr_50.cfg deleted file mode 100644 index 2edd21c1..00000000 --- a/cfg/msr_50.cfg +++ /dev/null @@ -1,558 +0,0 @@ -[net] -batch=128 -subdivisions=8 -height=256 -width=256 -channels=3 -momentum=0.9 -decay=0.0001 - -learning_rate=0.05 -policy=poly -power=4 -max_batches=500000 - - - -[crop] -crop_height=224 -crop_width=224 -flip=1 -saturation=1 -exposure=1 -angle=0 - -##### Conv 1 ##### - -[convolutional] -batch_normalize=1 -filters=64 -size=7 -stride=2 -pad=1 -activation=leaky - -[maxpool] -size=3 -stride=2 - - -##### Conv 2_x ##### - - -[convolutional] -batch_normalize=1 -filters=64 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=linear - -[route] -layers=-4 - -[convolutional] -batch_normalize=1 -size=1 -stride=1 -pad=1 -activation=linear -filters=256 - -[shortcut] -from = -3 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=64 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=64 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=64 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 
-filters=256 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - - -##### Conv 3_x ##### - -[convolutional] -batch_normalize=1 -filters=128 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=2 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=linear - -[route] -layers=-4 - -[convolutional] -batch_normalize=1 -size=1 -stride=2 -pad=1 -activation=linear -filters=512 - -[shortcut] -from = -3 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=128 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - - -##### Conv 4_x ##### - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=2 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=1024 -size=1 -stride=1 -pad=1 -activation=linear - -[route] -layers=-4 - -[convolutional] -batch_normalize=1 -size=1 -stride=2 -pad=1 -activation=linear -filters=1024 - -[shortcut] -from = -3 -activation=leaky - - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=1024 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=1024 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=1024 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=1024 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=256 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 
-filters=1024 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - - -##### Conv 5_x ##### - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=2 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=2048 -size=1 -stride=1 -pad=1 -activation=linear - - -[route] -layers=-4 - -[convolutional] -batch_normalize=1 -size=1 -stride=2 -pad=1 -activation=linear -filters=2048 - -[shortcut] -from = -3 -activation=leaky - - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=2048 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=2048 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 -activation=leaky - -[avgpool] - -[connected] -output=1000 -activation=leaky - -[softmax] -groups=1 - -[cost] -type=sse - diff --git a/cfg/msr_152.cfg b/cfg/resnet152.cfg similarity index 92% rename from cfg/msr_152.cfg rename to cfg/resnet152.cfg index b19c999d..d5fe9094 100644 --- a/cfg/msr_152.cfg +++ b/cfg/resnet152.cfg @@ -1,26 +1,30 @@ [net] -batch=128 -subdivisions=8 +# Training +# batch=128 +# subdivisions=8 + +# Testing +batch=1 +subdivisions=1 + height=256 width=256 +max_crop=448 channels=3 momentum=0.9 -decay=0.0001 +decay=0.0005 +burn_in=1000 learning_rate=0.1 policy=poly power=4 -max_batches=500000 +max_batches=1600000 -[crop] -crop_height=224 -crop_width=224 -flip=1 -saturation=1 -exposure=1 -angle=0 - -##### Conv 1 ##### +angle=7 +hue=.1 +saturation=.75 +exposure=.75 +aspect=.75 [convolutional] batch_normalize=1 @@ -31,13 +35,9 @@ pad=1 activation=leaky [maxpool] -size=3 +size=2 stride=2 - -##### Conv 2_x ##### - - [convolutional] batch_normalize=1 filters=64 @@ -62,19 +62,8 @@ stride=1 pad=1 activation=linear -[route] -layers=-4 - -[convolutional] -batch_normalize=1 -size=1 -stride=1 -pad=1 -activation=linear -filters=256 - [shortcut] -from = -3 +from=-4 activation=leaky [convolutional] @@ -102,8 +91,7 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky [convolutional] @@ -131,13 +119,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - -##### Conv 3_x ##### - [convolutional] batch_normalize=1 filters=128 @@ -162,23 +146,10 @@ stride=1 pad=1 activation=linear - -[route] -layers=-4 - -[convolutional] -batch_normalize=1 -size=1 -stride=2 -pad=1 -activation=linear -filters=512 - [shortcut] -from = -3 +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=128 @@ -204,11 +175,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=128 @@ -234,11 +203,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=128 @@ -264,11 +231,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=128 @@ -294,11 +259,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=128 @@ 
-324,11 +287,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=128 @@ -354,11 +315,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=128 @@ -384,14 +343,11 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - -##### Conv 4_x ##### - +# Conv 4 [convolutional] batch_normalize=1 filters=256 @@ -416,23 +372,10 @@ stride=1 pad=1 activation=linear - -[route] -layers=-4 - -[convolutional] -batch_normalize=1 -size=1 -stride=2 -pad=1 -activation=linear -filters=1024 - [shortcut] -from = -3 +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -458,11 +401,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -488,11 +429,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -518,11 +457,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -548,11 +485,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -578,11 +513,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -608,11 +541,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -638,11 +569,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -668,11 +597,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -698,11 +625,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -728,11 +653,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -758,11 +681,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -788,11 +709,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -818,11 +737,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -848,11 +765,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -878,11 +793,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -908,11 +821,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -938,11 +849,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -968,11 +877,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -998,11 +905,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1028,11 +933,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - 
[convolutional] batch_normalize=1 filters=256 @@ -1058,11 +961,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1088,11 +989,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1118,11 +1017,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1148,11 +1045,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1178,11 +1073,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1208,11 +1101,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1238,11 +1129,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1268,11 +1157,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1298,11 +1185,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1328,11 +1213,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1358,11 +1241,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1388,11 +1269,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1418,11 +1297,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1448,11 +1325,9 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - [convolutional] batch_normalize=1 filters=256 @@ -1478,13 +1353,10 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky - -##### Conv 5_x ##### - +#Conv 5 [convolutional] batch_normalize=1 filters=512 @@ -1509,50 +1381,8 @@ stride=1 pad=1 activation=linear - -[route] -layers=-4 - -[convolutional] -batch_normalize=1 -size=1 -stride=2 -pad=1 -activation=linear -filters=2048 - [shortcut] -from = -3 -activation=leaky - - -[convolutional] -batch_normalize=1 -filters=512 -size=1 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=512 -size=3 -stride=1 -pad=1 -activation=leaky - -[convolutional] -batch_normalize=1 -filters=2048 -size=1 -stride=1 -pad=1 -activation=linear - -[shortcut] -from = -4 - +from=-4 activation=leaky [convolutional] @@ -1580,16 +1410,51 @@ pad=1 activation=linear [shortcut] -from = -4 - +from=-4 activation=leaky +[convolutional] +batch_normalize=1 +filters=512 +size=1 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +filters=2048 +size=1 +stride=1 +pad=1 +activation=linear + +[shortcut] +from=-4 +activation=leaky + + + + + + +[convolutional] +filters=1000 +size=1 +stride=1 +pad=1 +activation=linear + [avgpool] -[connected] -output=1000 -activation=leaky - [softmax] groups=1 diff --git a/examples/classifier.c b/examples/classifier.c index 593b34e5..20202c8c 100644 --- a/examples/classifier.c +++ 
b/examples/classifier.c @@ -58,7 +58,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, load_args args = {0}; args.w = net.w; args.h = net.h; - args.threads = 64; + args.threads = 32; args.hierarchy = net.hierarchy; args.min = net.min_crop; @@ -123,6 +123,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int *gpus, char buff[256]; sprintf(buff, "%s/%s.weights", backup_directory, base); save_weights(net, buff); + pthread_join(load_thread, 0); free_network(net); free_ptrs((void**)labels, classes); diff --git a/examples/darknet.c b/examples/darknet.c index e63afef8..9ead608c 100644 --- a/examples/darknet.c +++ b/examples/darknet.c @@ -83,27 +83,8 @@ void average(int argc, char *argv[]) save_weights(sum, outfile); } -void speed(char *cfgfile, int tics) +long numops(network net) { - if (tics == 0) tics = 1000; - network net = parse_network_cfg(cfgfile); - set_batch_network(&net, 1); - int i; - double time=what_time_is_it_now(); - image im = make_image(net.w, net.h, net.c*net.batch); - for(i = 0; i < tics; ++i){ - network_predict(net, im.data); - } - double t = what_time_is_it_now() - time; - printf("\n%d evals, %f Seconds\n", tics, t); - printf("Speed: %f sec/eval\n", t/tics); - printf("Speed: %f Hz\n", tics/t); -} - -void operations(char *cfgfile) -{ - gpu_index = -1; - network net = parse_network_cfg(cfgfile); int i; long ops = 0; for(i = 0; i < net.n; ++i){ @@ -134,6 +115,34 @@ void operations(char *cfgfile) ops += 2l * l.wo->inputs * l.wo->outputs; } } + return ops; +} + +void speed(char *cfgfile, int tics) +{ + if (tics == 0) tics = 1000; + network net = parse_network_cfg(cfgfile); + set_batch_network(&net, 1); + int i; + double time=what_time_is_it_now(); + image im = make_image(net.w, net.h, net.c*net.batch); + for(i = 0; i < tics; ++i){ + network_predict(net, im.data); + } + double t = what_time_is_it_now() - time; + long ops = numops(net); + printf("\n%d evals, %f Seconds\n", tics, t); + printf("Floating Point Operations: %.2f Bn\n", (float)ops/1000000000.); + printf("FLOPS: %.2f Bn\n", (float)ops/1000000000.*tics/t); + printf("Speed: %f sec/eval\n", t/tics); + printf("Speed: %f Hz\n", tics/t); +} + +void operations(char *cfgfile) +{ + gpu_index = -1; + network net = parse_network_cfg(cfgfile); + long ops = numops(net); printf("Floating Point Operations: %ld\n", ops); printf("Floating Point Operations: %.2f Bn\n", (float)ops/1000000000.); } diff --git a/examples/detector.c b/examples/detector.c index 0537588c..4709b898 100644 --- a/examples/detector.c +++ b/examples/detector.c @@ -52,10 +52,10 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i args.d = &buffer; args.type = DETECTION_DATA; //args.type = INSTANCE_DATA; - args.threads = 8; + args.threads = 64; pthread_t load_thread = load_data(args); - clock_t time; + double time; int count = 0; //while(i*imgs < N*120){ while(get_current_batch(net) < net.max_batches){ @@ -78,7 +78,7 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i } net = nets[0]; } - time=clock(); + time=what_time_is_it_now(); pthread_join(load_thread, 0); train = buffer; load_thread = load_data(args); @@ -107,9 +107,9 @@ void train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i } */ - printf("Loaded: %lf seconds\n", sec(clock()-time)); + printf("Loaded: %lf seconds\n", what_time_is_it_now()-time); - time=clock(); + time=what_time_is_it_now(); float loss = 0; #ifdef GPU if(ngpus == 1){ @@ -124,7 +124,7 @@ void 
train_detector(char *datacfg, char *cfgfile, char *weightfile, int *gpus, i avg_loss = avg_loss*.9 + loss*.1; i = get_current_batch(net); - printf("%ld: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), sec(clock()-time), i*imgs); + printf("%ld: %f, %f avg, %f rate, %lf seconds, %d images\n", get_current_batch(net), loss, avg_loss, get_current_rate(net), what_time_is_it_now()-time, i*imgs); if(i%100==0){ #ifdef GPU if(ngpus != 1) sync_nets(nets, ngpus, 0); @@ -313,7 +313,7 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char args.resized = &buf_resized[t]; thr[t] = load_data_in_thread(args); } - time_t start = time(0); + double start = what_time_is_it_now(); for(i = nthreads; i < m+nthreads; i += nthreads){ fprintf(stderr, "%d\n", i); for(t = 0; t < nthreads && i+t-nthreads < m; ++t){ @@ -359,7 +359,7 @@ void validate_detector_flip(char *datacfg, char *cfgfile, char *weightfile, char fprintf(fp, "\n]\n"); fclose(fp); } - fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start)); + fprintf(stderr, "Total Detection Time: %f Seconds\n", what_time_is_it_now() - start); } @@ -447,7 +447,7 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out args.resized = &buf_resized[t]; thr[t] = load_data_in_thread(args); } - time_t start = time(0); + double start = what_time_is_it_now(); for(i = nthreads; i < m+nthreads; i += nthreads){ fprintf(stderr, "%d\n", i); for(t = 0; t < nthreads && i+t-nthreads < m; ++t){ @@ -490,7 +490,7 @@ void validate_detector(char *datacfg, char *cfgfile, char *weightfile, char *out fprintf(fp, "\n]\n"); fclose(fp); } - fprintf(stderr, "Total Detection Time: %f Seconds\n", (double)(time(0) - start)); + fprintf(stderr, "Total Detection Time: %f Seconds\n", what_time_is_it_now() - start); } void validate_detector_recall(char *cfgfile, char *weightfile) diff --git a/examples/rnn.c b/examples/rnn.c index 45e8deba..8169f510 100644 --- a/examples/rnn.c +++ b/examples/rnn.c @@ -52,6 +52,7 @@ char **read_tokens(char *filename, size_t *read) return d; } + float_pair get_rnn_token_data(int *tokens, size_t *offsets, int characters, size_t len, int batch, int steps) { float *x = calloc(batch * steps * characters, sizeof(float)); @@ -78,6 +79,35 @@ float_pair get_rnn_token_data(int *tokens, size_t *offsets, int characters, size return p; } +float_pair get_seq2seq_data(char **source, char **dest, int n, int characters, size_t len, int batch, int steps) +{ + int i,j; + float *x = calloc(batch * steps * characters, sizeof(float)); + float *y = calloc(batch * steps * characters, sizeof(float)); + for(i = 0; i < batch; ++i){ + int index = rand()%n; + for(j = 0; j < steps; ++j){ + unsigned char curr = source[index][j]; + unsigned char next = dest[index][j]; + + x[(j*batch + i)*characters + curr] = 1; + y[(j*batch + i)*characters + next] = 1; + + if(curr > 255 || curr <= 0 || next > 255 || next <= 0){ + /*text[(index+j+2)%len] = 0; + printf("%ld %d %d %d %d\n", index, j, len, (int)text[index+j], (int)text[index+j+1]); + printf("%s", text+index); + */ + error("Bad char"); + } + } + } + float_pair p; + p.x = x; + p.y = y; + return p; +} + float_pair get_rnn_data(unsigned char *text, size_t *offsets, int characters, size_t len, int batch, int steps) { float *x = calloc(batch * steps * characters, sizeof(float)); diff --git a/examples/segmenter.c b/examples/segmenter.c index 2c1979d4..137ecd8e 100644 --- a/examples/segmenter.c +++ b/examples/segmenter.c @@ 
-211,7 +211,7 @@ void demo_segmenter(char *datacfg, char *cfgfile, char *weightfile, int cam_inde image in = get_image_from_stream(cap); image in_s = letterbox_image(in, net.w, net.h); - float *predictions = network_predict(net, in_s.data); + network_predict(net, in_s.data); printf("\033[2J"); printf("\033[1;1H"); diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index 73530056..80a849f7 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -10,8 +10,8 @@ extern "C" { __device__ float lhtan_activate_kernel(float x) { - if(x < 0) return .001*x; - if(x > 1) return .001*(x-1) + 1; + if(x < 0) return .001f*x; + if(x > 1) return .001f*(x-1.f) + 1.f; return x; } __device__ float lhtan_gradient_kernel(float x) @@ -27,25 +27,25 @@ __device__ float hardtan_activate_kernel(float x) return x; } __device__ float linear_activate_kernel(float x){return x;} -__device__ float logistic_activate_kernel(float x){return 1./(1. + exp(-x));} -__device__ float loggy_activate_kernel(float x){return 2./(1. + exp(-x)) - 1;} +__device__ float logistic_activate_kernel(float x){return 1.f/(1.f + expf(-x));} +__device__ float loggy_activate_kernel(float x){return 2.f/(1.f + expf(-x)) - 1;} __device__ float relu_activate_kernel(float x){return x*(x>0);} -__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(exp(x)-1);} -__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01*x;} -__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1*x;} -__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1*x;} -__device__ float tanh_activate_kernel(float x){return (2/(1 + exp(-2*x)) - 1);} +__device__ float elu_activate_kernel(float x){return (x >= 0)*x + (x < 0)*(expf(x)-1);} +__device__ float relie_activate_kernel(float x){return (x>0) ? x : .01f*x;} +__device__ float ramp_activate_kernel(float x){return x*(x>0)+.1f*x;} +__device__ float leaky_activate_kernel(float x){return (x>0) ? x : .1f*x;} +__device__ float tanh_activate_kernel(float x){return (2.f/(1 + expf(-2*x)) - 1);} __device__ float plse_activate_kernel(float x) { - if(x < -4) return .01 * (x + 4); - if(x > 4) return .01 * (x - 4) + 1; - return .125*x + .5; + if(x < -4) return .01f * (x + 4); + if(x > 4) return .01f * (x - 4) + 1; + return .125f*x + .5f; } __device__ float stair_activate_kernel(float x) { - int n = floor(x); - if (n%2 == 0) return floor(x/2.); - else return (x - n) + floor(x/2.); + int n = floorf(x); + if (n%2 == 0) return floorf(x/2); + else return (x - n) + floorf(x/2); } @@ -58,19 +58,19 @@ __device__ float linear_gradient_kernel(float x){return 1;} __device__ float logistic_gradient_kernel(float x){return (1-x)*x;} __device__ float loggy_gradient_kernel(float x) { - float y = (x+1.)/2.; + float y = (x+1)/2; return 2*(1-y)*y; } __device__ float relu_gradient_kernel(float x){return (x>0);} __device__ float elu_gradient_kernel(float x){return (x >= 0) + (x < 0)*(x + 1);} -__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01;} -__device__ float ramp_gradient_kernel(float x){return (x>0)+.1;} -__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1;} +__device__ float relie_gradient_kernel(float x){return (x>0) ? 1 : .01f;} +__device__ float ramp_gradient_kernel(float x){return (x>0)+.1f;} +__device__ float leaky_gradient_kernel(float x){return (x>0) ? 1 : .1f;} __device__ float tanh_gradient_kernel(float x){return 1-x*x;} -__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? 
.01 : .125;} +__device__ float plse_gradient_kernel(float x){return (x < 0 || x > 1) ? .01f : .125f;} __device__ float stair_gradient_kernel(float x) { - if (floor(x) == x) return 0; + if (floorf(x) == x) return 0; return 1; } diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index 867db038..a483f2eb 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -165,7 +165,7 @@ __global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float int index = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if (index >= N) return; - x[index] = x[index] + (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps)); + x[index] = x[index] + (rate * sqrtf(1.f-powf(B2, t)) / (1.f-powf(B1, t)) * m[index] / (sqrtf(v[index]) + eps)); } extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t) @@ -194,7 +194,7 @@ __global__ void normalize_kernel(int N, float *x, float *mean, float *variance, if (index >= N) return; int f = (index/spatial)%filters; - x[index] = (x[index] - mean[f])/(sqrt(variance[f] + .00001f)); + x[index] = (x[index] - mean[f])/(sqrtf(variance[f] + .00001f)); } __global__ void normalize_delta_kernel(int N, float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta) @@ -203,7 +203,7 @@ __global__ void normalize_delta_kernel(int N, float *x, float *mean, float *vari if (index >= N) return; int f = (index/spatial)%filters; - delta[index] = delta[index] * 1./(sqrt(variance[f] + .00001f)) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch); + delta[index] = delta[index] * 1.f/(sqrtf(variance[f] + .00001f)) + variance_delta[f] * 2.f * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch); } extern "C" void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta) @@ -225,7 +225,7 @@ __global__ void variance_delta_kernel(float *x, float *delta, float *mean, floa variance_delta[i] += delta[index]*(x[index] - mean[i]); } } - variance_delta[i] *= -.5 * pow(variance[i] + .00001f, (float)(-3./2.)); + variance_delta[i] *= -.5f * powf(variance[i] + .00001f, (float)(-3.f/2.f)); } __global__ void accumulate_kernel(float *x, int n, int groups, float *sum) @@ -264,7 +264,7 @@ __global__ void fast_mean_delta_kernel(float *delta, float *variance, int batch, for(i = 0; i < threads; ++i){ mean_delta[filter] += local[i]; } - mean_delta[filter] *= (-1./sqrt(variance[filter] + .00001f)); + mean_delta[filter] *= (-1.f/sqrtf(variance[filter] + .00001f)); } } @@ -294,7 +294,7 @@ __global__ void fast_variance_delta_kernel(float *x, float *delta, float *mean, for(i = 0; i < threads; ++i){ variance_delta[filter] += local[i]; } - variance_delta[filter] *= -.5 * pow(variance[filter] + .00001f, (float)(-3./2.)); + variance_delta[filter] *= -.5f * powf(variance[filter] + .00001f, (float)(-3.f/2.f)); } } @@ -311,7 +311,7 @@ __global__ void mean_delta_kernel(float *delta, float *variance, int batch, int mean_delta[i] += delta[index]; } } - mean_delta[i] *= (-1./sqrt(variance[i] + .00001f)); + mean_delta[i] *= (-1.f/sqrtf(variance[i] + .00001f)); } extern "C" void mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) @@ -334,7 +334,7 @@ extern "C" void fast_variance_delta_gpu(float *x, float *delta, float *mean, flo __global__ void 
mean_kernel(float *x, int batch, int filters, int spatial, float *mean) { - float scale = 1./(batch * spatial); + float scale = 1.f/(batch * spatial); int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if (i >= filters) return; int j,k; @@ -350,7 +350,7 @@ __global__ void mean_kernel(float *x, int batch, int filters, int spatial, floa __global__ void variance_kernel(float *x, float *mean, int batch, int filters, int spatial, float *variance) { - float scale = 1./(batch * spatial - 1); + float scale = 1.f/(batch * spatial - 1); int j,k; int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if (i >= filters) return; @@ -358,7 +358,7 @@ __global__ void variance_kernel(float *x, float *mean, int batch, int filters, i for(j = 0; j < batch; ++j){ for(k = 0; k < spatial; ++k){ int index = j*filters*spatial + i*spatial + k; - variance[i] += pow((x[index] - mean[i]), 2); + variance[i] += powf((x[index] - mean[i]), 2); } } variance[i] *= scale; @@ -516,7 +516,7 @@ __global__ void fast_variance_kernel(float *x, float *mean, int batch, int filt for(i = 0; i < spatial; i += threads){ int index = j*spatial*filters + filter*spatial + i + id; - local[id] += (i+id < spatial) ? pow((x[index] - mean[filter]), 2) : 0; + local[id] += (i+id < spatial) ? powf((x[index] - mean[filter]), 2) : 0; } } @@ -716,7 +716,7 @@ __global__ void smooth_l1_kernel(int n, float *pred, float *truth, float *delta, int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if(i < n){ float diff = truth[i] - pred[i]; - float abs_val = abs(diff); + float abs_val = fabsf(diff); if(abs_val < 1) { error[i] = diff * diff; delta[i] = diff; @@ -864,7 +864,7 @@ __device__ void softmax_device(float *input, int n, float temp, int stride, floa largest = (val>largest) ? val : largest; } for(i = 0; i < n; ++i){ - float e = exp(input[i*stride]/temp - largest/temp); + float e = expf(input[i*stride]/temp - largest/temp); sum += e; output[i*stride] = e; } diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index b9b6f455..749b4c5e 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -33,7 +33,7 @@ __global__ void binarize_input_kernel(float *input, int n, int size, float *bina int i = 0; float mean = 0; for(i = 0; i < n; ++i){ - mean += abs(input[i*size + s]); + mean += fabsf(input[i*size + s]); } mean = mean / n; for(i = 0; i < n; ++i){ @@ -55,7 +55,7 @@ __global__ void binarize_weights_kernel(float *weights, int n, int size, float * int i = 0; float mean = 0; for(i = 0; i < size; ++i){ - mean += abs(weights[f*size + i]); + mean += fabsf(weights[f*size + i]); } mean = mean / size; for(i = 0; i < size; ++i){ @@ -139,8 +139,8 @@ __global__ void smooth_kernel(float *x, int n, int w, int h, int c, int size, fl id /= c; int b = id; - int w_offset = -(size/2.); - int h_offset = -(size/2.); + int w_offset = -(size/2.f); + int h_offset = -(size/2.f); int out_index = j + w*(i + h*(k + c*b)); int l, m; diff --git a/src/crop_layer_kernels.cu b/src/crop_layer_kernels.cu index b6568219..b5b9f554 100644 --- a/src/crop_layer_kernels.cu +++ b/src/crop_layer_kernels.cu @@ -113,9 +113,9 @@ __global__ void levels_image_kernel(float *image, float *rand, int batch, int w, float r3 = rand[8*id + 3]; saturation = r0*(saturation - 1) + 1; - saturation = (r1 > .5) ? 1./saturation : saturation; + saturation = (r1 > .5f) ? 1.f/saturation : saturation; exposure = r2*(exposure - 1) + 1; - exposure = (r3 > .5) ? 1./exposure : exposure; + exposure = (r3 > .5f) ? 
1.f/exposure : exposure; size_t offset = id * h * w * 3; image += offset; @@ -131,9 +131,9 @@ __global__ void levels_image_kernel(float *image, float *rand, int batch, int w, } else { shift = 0; } - image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5)*shift; - image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5)*shift; - image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5)*shift; + image[x + w*(y + h*0)] = rgb.x*scale + translate + (rshift - .5f)*shift; + image[x + w*(y + h*1)] = rgb.y*scale + translate + (gshift - .5f)*shift; + image[x + w*(y + h*2)] = rgb.z*scale + translate + (bshift - .5f)*shift; } __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, float *output) @@ -141,8 +141,8 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if(id >= size) return; - float cx = w/2.; - float cy = h/2.; + float cx = w/2.f; + float cy = h/2.f; int count = id; int j = id % crop_width; @@ -160,11 +160,11 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i float dw = (w - crop_width)*r4; float dh = (h - crop_height)*r5; - flip = (flip && (r6 > .5)); + flip = (flip && (r6 > .5f)); angle = 2*angle*r7 - angle; if(!train){ - dw = (w - crop_width)/2.; - dh = (h - crop_height)/2.; + dw = (w - crop_width)/2.f; + dh = (h - crop_height)/2.f; flip = 0; angle = 0; } @@ -174,8 +174,8 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i float x = (flip) ? w - dw - j - 1 : j + dw; float y = i + dh; - float rx = cos(angle)*(x-cx) - sin(angle)*(y-cy) + cx; - float ry = sin(angle)*(x-cx) + cos(angle)*(y-cy) + cy; + float rx = cosf(angle)*(x-cx) - sinf(angle)*(y-cy) + cx; + float ry = sinf(angle)*(x-cx) + cosf(angle)*(y-cy) + cy; output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k); } @@ -184,7 +184,7 @@ extern "C" void forward_crop_layer_gpu(crop_layer layer, network net) { cuda_random(layer.rand_gpu, layer.batch*8); - float radians = layer.angle*3.14159265/180.; + float radians = layer.angle*3.14159265f/180.f; float scale = 2; float translate = -1; diff --git a/src/data.c b/src/data.c index e060e937..36b1286a 100644 --- a/src/data.c +++ b/src/data.c @@ -137,14 +137,18 @@ matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, box_label *read_boxes(char *filename, int *n) { - box_label *boxes = calloc(1, sizeof(box_label)); FILE *file = fopen(filename, "r"); if(!file) file_error(filename); float x, y, h, w; int id; int count = 0; + int size = 64; + box_label *boxes = calloc(size, sizeof(box_label)); while(fscanf(file, "%d %f %f %f %f", &id, &x, &y, &w, &h) == 5){ - boxes = realloc(boxes, (count+1)*sizeof(box_label)); + if(count == size) { + size = size * 2; + boxes = realloc(boxes, size*sizeof(box_label)); + } boxes[count].id = id; boxes[count].x = x; boxes[count].y = y; @@ -976,6 +980,7 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, in place_image(orig, nw, nh, dx, dy, sized); random_distort_image(sized, hue, saturation, exposure); + int flip = rand()%2; if(flip) flip_image(sized); d.X.vals[i] = sized.data; diff --git a/src/demo.c b/src/demo.c index a60c456d..28a6ddc6 100644 --- a/src/demo.c +++ b/src/demo.c @@ -20,10 +20,6 @@ static int demo_classes; static float **probs; static box *boxes; static network net; -static network 
net2; -static float **probs2; -static box *boxes2; -static float **predictions2; static image buff [3]; static image buff_letter[3]; static int buff_index = 0; diff --git a/src/image.c b/src/image.c index ece5ff50..fc8d08d1 100644 --- a/src/image.c +++ b/src/image.c @@ -44,6 +44,51 @@ image mask_to_rgb(image mask) return im; } +static float get_pixel(image m, int x, int y, int c) +{ + assert(x < m.w && y < m.h && c < m.c); + return m.data[c*m.h*m.w + y*m.w + x]; +} +static float get_pixel_extend(image m, int x, int y, int c) +{ + if(x < 0 || x >= m.w || y < 0 || y >= m.h) return 0; + /* + if(x < 0) x = 0; + if(x >= m.w) x = m.w-1; + if(y < 0) y = 0; + if(y >= m.h) y = m.h-1; + */ + if(c < 0 || c >= m.c) return 0; + return get_pixel(m, x, y, c); +} +static void set_pixel(image m, int x, int y, int c, float val) +{ + if (x < 0 || y < 0 || c < 0 || x >= m.w || y >= m.h || c >= m.c) return; + assert(x < m.w && y < m.h && c < m.c); + m.data[c*m.h*m.w + y*m.w + x] = val; +} +static void add_pixel(image m, int x, int y, int c, float val) +{ + assert(x < m.w && y < m.h && c < m.c); + m.data[c*m.h*m.w + y*m.w + x] += val; +} + +static float bilinear_interpolate(image im, float x, float y, int c) +{ + int ix = (int) floorf(x); + int iy = (int) floorf(y); + + float dx = x - ix; + float dy = y - iy; + + float val = (1-dy) * (1-dx) * get_pixel_extend(im, ix, iy, c) + + dy * (1-dx) * get_pixel_extend(im, ix, iy+1, c) + + (1-dy) * dx * get_pixel_extend(im, ix+1, iy, c) + + dy * dx * get_pixel_extend(im, ix+1, iy+1, c); + return val; +} + + void composite_image(image source, image dest, int dx, int dy) { int x,y,k; @@ -1255,21 +1300,6 @@ void saturate_exposure_image(image im, float sat, float exposure) constrain_image(im); } -float bilinear_interpolate(image im, float x, float y, int c) -{ - int ix = (int) floorf(x); - int iy = (int) floorf(y); - - float dx = x - ix; - float dy = y - iy; - - float val = (1-dy) * (1-dx) * get_pixel_extend(im, ix, iy, c) + - dy * (1-dx) * get_pixel_extend(im, ix, iy+1, c) + - (1-dy) * dx * get_pixel_extend(im, ix+1, iy, c) + - dy * dx * get_pixel_extend(im, ix+1, iy+1, c); - return val; -} - image resize_image(image im, int w, int h) { image resized = make_image(w, h, im.c); @@ -1419,36 +1449,6 @@ image get_image_layer(image m, int l) } return out; } - -float get_pixel(image m, int x, int y, int c) -{ - assert(x < m.w && y < m.h && c < m.c); - return m.data[c*m.h*m.w + y*m.w + x]; -} -float get_pixel_extend(image m, int x, int y, int c) -{ - if(x < 0 || x >= m.w || y < 0 || y >= m.h) return 0; - /* - if(x < 0) x = 0; - if(x >= m.w) x = m.w-1; - if(y < 0) y = 0; - if(y >= m.h) y = m.h-1; - */ - if(c < 0 || c >= m.c) return 0; - return get_pixel(m, x, y, c); -} -void set_pixel(image m, int x, int y, int c, float val) -{ - if (x < 0 || y < 0 || c < 0 || x >= m.w || y >= m.h || c >= m.c) return; - assert(x < m.w && y < m.h && c < m.c); - m.data[c*m.h*m.w + y*m.w + x] = val; -} -void add_pixel(image m, int x, int y, int c, float val) -{ - assert(x < m.w && y < m.h && c < m.c); - m.data[c*m.h*m.w + y*m.w + x] += val; -} - void print_image(image m) { int i, j, k; diff --git a/src/image.h b/src/image.h index 02c79f08..4ff0eacb 100644 --- a/src/image.h +++ b/src/image.h @@ -60,12 +60,6 @@ void print_image(image m); image make_empty_image(int w, int h, int c); void copy_image_into(image src, image dest); -float get_pixel(image m, int x, int y, int c); -float get_pixel_extend(image m, int x, int y, int c); -void set_pixel(image m, int x, int y, int c, float val); -void add_pixel(image 
m, int x, int y, int c, float val); -float bilinear_interpolate(image im, float x, float y, int c); - image get_image_layer(image m, int l); #endif diff --git a/src/utils.h b/src/utils.h index 1593e62b..4e467075 100644 --- a/src/utils.h +++ b/src/utils.h @@ -5,7 +5,14 @@ #include "darknet.h" #include "list.h" -#define TWO_PI 6.2831853071795864769252866 +#define TIME(a) \ + do { \ + double start = what_time_is_it_now(); \ + a; \ + printf("%s took: %f seconds\n", #a, what_time_is_it_now() - start); \ + } while (0) + +#define TWO_PI 6.2831853071795864769252866f double what_time_is_it_now(); void shuffle(void *arr, size_t n, size_t size);
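
Two short usage notes follow as end-of-patch commentary; they are not part of the diff above.

The new TIME(a) macro added to src/utils.h wraps a statement with wall-clock timing via what_time_is_it_now() and prints the stringified statement with its elapsed time. A minimal sketch of a call site (the wrapper function below is hypothetical and only for illustration; network_predict() is the existing darknet call used elsewhere in this patch):

    #include "utils.h"   /* pulls in darknet.h and the TIME() macro */

    /* Hypothetical helper, not part of the patch: times one forward pass.
     * TIME() expands to a do { ... } while (0) block, so it acts as a single
     * statement and prints e.g. "network_predict(net, X) took: 0.031 seconds". */
    void time_forward_pass(network net, float *X)
    {
        TIME(network_predict(net, X));
    }

Because the macro body is a do/while(0) block, it can sit anywhere a statement is allowed, including unbraced if/else branches.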
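
The read_boxes() change in src/data.c replaces a realloc of one element per label line (quadratic copying on long label files) with geometric growth: start at a capacity of 64 and double whenever the count reaches the capacity. A self-contained sketch of that amortized-doubling pattern, with illustrative names that are not from darknet and with allocation-error handling omitted as in the original:

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative only: read floats from a file into a geometrically grown
     * array, mirroring the capacity-doubling used in read_boxes() above. */
    int read_floats(FILE *f, float **out)
    {
        int count = 0, size = 64;                  /* initial capacity, as in read_boxes() */
        float *vals = calloc(size, sizeof(float));
        float v;
        while (fscanf(f, "%f", &v) == 1) {
            if (count == size) {                   /* full: double the capacity */
                size *= 2;
                vals = realloc(vals, size * sizeof(float));
            }
            vals[count++] = v;
        }
        *out = vals;
        return count;
    }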