From 8a767f106677b78a389e1ceffc066501015ec51a Mon Sep 17 00:00:00 2001 From: Joseph Redmon Date: Mon, 6 Jun 2016 15:48:52 -0700 Subject: [PATCH] stuff for carlo --- ai2.mk | 79 +++++ cfg/xyolo.test.cfg | 148 ++++++++++ src/binary_convolution.c | 598 ++++++++++++++++++++++++++++++++++++++ src/binary_convolution.h | 218 ++++++++++++++ src/common.c | 81 ++++++ src/common.h | 50 ++++ src/convolutional_layer.c | 127 ++++---- src/layer.h | 2 + src/parser.c | 3 +- src/xnor_layer.c | 86 ++++++ src/xnor_layer.h | 11 + src/yolo.c | 2 +- 12 files changed, 1350 insertions(+), 55 deletions(-) create mode 100644 ai2.mk create mode 100644 cfg/xyolo.test.cfg create mode 100644 src/binary_convolution.c create mode 100644 src/binary_convolution.h create mode 100644 src/common.c create mode 100644 src/common.h create mode 100644 src/xnor_layer.c create mode 100644 src/xnor_layer.h diff --git a/ai2.mk b/ai2.mk new file mode 100644 index 00000000..b72e97b1 --- /dev/null +++ b/ai2.mk @@ -0,0 +1,79 @@ +GPU=0 +CUDNN=0 +OPENCV=0 +DEBUG=0 +AI2=1 + +ARCH= --gpu-architecture=compute_52 --gpu-code=compute_52 + +VPATH=./src/ +EXEC=darknet +OBJDIR=./obj/ + +CC=gcc -std=gnu11 +NVCC=nvcc +OPTS=-Ofast +LDFLAGS= -lm -pthread +COMMON= +CFLAGS=-Wall -Wfatal-errors + +ifeq ($(DEBUG), 1) +OPTS=-O0 -g +endif + +CFLAGS+=$(OPTS) + +ifeq ($(OPENCV), 1) +COMMON+= -DOPENCV +CFLAGS+= -DOPENCV +LDFLAGS+= `pkg-config --libs opencv` +COMMON+= `pkg-config --cflags opencv` +endif + +ifeq ($(AI2), 1) +COMMON+= -DAI2 +CFLAGS+= -DAI2 +endif + +ifeq ($(GPU), 1) +COMMON+= -DGPU -I/usr/local/cuda/include/ +CFLAGS+= -DGPU +LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand +endif + +ifeq ($(CUDNN), 1) +COMMON+= -DCUDNN +CFLAGS+= -DCUDNN +LDFLAGS+= -lcudnn +endif + +OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o coco_demo.o tag.o cifar.o yolo_demo.o go.o batchnorm_layer.o art.o xnor_layer.o common.o binary_convolution.o +ifeq ($(GPU), 1) +LDFLAGS+= -lstdc++ +OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o avgpool_layer_kernels.o +endif + +OBJS = $(addprefix $(OBJDIR), $(OBJ)) +DEPS = $(wildcard src/*.h) Makefile + +all: obj results $(EXEC) + +$(EXEC): $(OBJS) + $(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) + +$(OBJDIR)%.o: %.c $(DEPS) + $(CC) $(COMMON) $(CFLAGS) -c $< -o $@ + +$(OBJDIR)%.o: %.cu $(DEPS) + $(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@ + +obj: + mkdir -p obj +results: + mkdir -p results + +.PHONY: clean + +clean: + rm -rf $(OBJS) $(EXEC) + diff --git a/cfg/xyolo.test.cfg b/cfg/xyolo.test.cfg new file mode 100644 index 00000000..22596796 --- /dev/null +++ b/cfg/xyolo.test.cfg @@ -0,0 +1,148 @@ +[net] +batch=1 +subdivisions=1 +height=448 +width=448 +channels=3 +momentum=0.9 +decay=0.0005 + +learning_rate=0.0001 +policy=steps +steps=20,40,60,80,20000,30000 +scales=5,5,2,2,.1,.1 +max_batches = 40000 + +[crop] +crop_width=448 +crop_height=448 +flip=0 +angle=0 +saturation = 1.5 +exposure = 1.5 +noadjust=1 + +[convolutional] +batch_normalize=1 +filters=16 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[batchnorm] + +[convolutional] +xnor = 1 +batch_normalize=1 +filters=32 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[batchnorm] + +[convolutional] +xnor = 1 +batch_normalize=1 +filters=64 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[batchnorm] + +[convolutional] +xnor = 1 +batch_normalize=1 +filters=128 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[batchnorm] + +[convolutional] +xnor = 1 +batch_normalize=1 +filters=256 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[batchnorm] + +[convolutional] +xnor = 1 +batch_normalize=1 +filters=512 +size=3 +stride=1 +pad=1 +activation=leaky + +[maxpool] +size=2 +stride=2 + +[batchnorm] + +[convolutional] +batch_normalize=1 +filters=1024 +size=3 +stride=1 +pad=1 +activation=leaky + +[convolutional] +batch_normalize=1 +size=3 +stride=1 +pad=1 +filters=128 +activation=leaky + +[connected] +output= 1470 +activation=linear + +[detection] +classes=20 +coords=4 +rescore=1 +side=7 +num=2 +softmax=0 +sqrt=1 +jitter=.2 + +object_scale=1 +noobject_scale=.5 +class_scale=1 +coord_scale=5 + diff --git a/src/binary_convolution.c b/src/binary_convolution.c new file mode 100644 index 00000000..dfededa6 --- /dev/null +++ b/src/binary_convolution.c @@ -0,0 +1,598 @@ +#include "binary_convolution.h" + +int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim) { // TODO unroll + int accumulator = 0; + for (int z = 0; z < vdim.z / BITS_PER_BINARY_WORD; z++) { + for (int y = 0; y < vdim.y; y++) { + for (int x = 0; x < vdim.x; x++) { + int idx = z*vdim.y*vdim.x + y*vdim.x + x; + accumulator += __builtin_popcount(~(a[idx] ^ b[idx])); // count the XNOR of the two bit vectors + } + } + } + + return accumulator; +} + +/** + * Pre-conditions: + * alpha_volume is an array of size x*y*z. + * alpha_plane is an array of size x*y. + * alpha_volume (x,y,z) is transposed to (z,x,y). + */ +void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim) { + for (int y = 0; y < vdim.y; ++y) { + for (int x = 0; x < vdim.x; ++x) { + int out = y * vdim.x + x; + double accum = 0.0; + for (int z = 0; z < vdim.z; ++z) { + accum += alpha_volume[out * vdim.z + z]; + } + + alpha_plane[out] = accum / vdim.z; + } + } +} + +/** @brief Wrapper function for generating the beta scaling factor */ +void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim) { + ai2_calc_alpha(beta_plane, beta_volume, vdim); +} + +/** @brief Set the bit in a binary word */ +void ai2_bitset(BINARY_WORD *bword, unsigned int position) { + BINARY_WORD mask = (1 << position); + *bword = *bword | mask; +} + +/** @brief Checks that the bit is set in a binary word */ +int ai2_is_set(BINARY_WORD bword, unsigned int position) { + unsigned int position_complement = (BITS_PER_BINARY_WORD - 1) - position; // number of leading bits before the bit position of interest + bword = (bword << position_complement); // zero out leading bits + bword = (bword >> (BITS_PER_BINARY_WORD - 1)); // shift bit position of interest to the 0th position + return (bword & 0x1); // test if bit position of interest is set +} + +void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 dim) { + ai2_transpose3D(real_vol, dim); // (x,y,z) -> (z,x,y) + + int sz = dim.x * dim.y * dim.z; + for (int i = 0; i < sz; i += BITS_PER_BINARY_WORD) { + BINARY_WORD tmp = 0x00000000; + for (int x = 0; x < BITS_PER_BINARY_WORD; ++x) { + int waddr = x + i; + if (signbit(real_vol[waddr]) == 0) + ai2_bitset(&tmp, (BITS_PER_BINARY_WORD - 1) - x); + } + binary_vol[i / BITS_PER_BINARY_WORD] = tmp; + } +} + +void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 dim) { // TODO unit tests + for (int z = 0; z < dim.z; z++) { + for (int y = 0; y < dim.y; y++) { + for (int x = 0; x < dim.x / BITS_PER_BINARY_WORD; x++) { // TODO boundary checks, for uneven input + BINARY_WORD word = binary_vol[z*dim.y*dim.x + y*dim.x + x]; + for (int t = 0; t < BITS_PER_BINARY_WORD; ++t) { + int oidx = z*dim.y*dim.x + y*dim.x + x * BITS_PER_BINARY_WORD + t; + if (ai2_is_set(word, t)) + real_vol[oidx] = 1.f; + else + real_vol[oidx] = -1.f; + } + } + } + } + + // Transpose channels back to output + ai2_transpose3D(real_vol, dim); // (z,y,x) -> (x,y,z) +} + +/* @brief: input is padded. + */ +void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride) { + + int r, rd, c, cd; + int wx_2 = wx / 2; + int wy_2 = wy / 2; + + // Indexing for output pixels. x = [wx_2, ix + wx_2 - 1], y = [wy_2, iy + wy_2 - 1] + int sx = pad; // start x + int ex = ix + pad - 1; // end x + int sy = pad; // start y + int ey = iy + pad - 1; // end y + + // Indexing for weights + int wsx, wex, wsy, wey; + if (wx % 2 == 1) { // odd weights + wsx = -wx_2; wex = wx_2 + 1; + wsy = -wy_2; wey = wy_2 + 1; + } + else { + wsx = -wx_2; wex = wx_2; + wsy = -wy_2; wey = wy_2; + } + + int px = ix + 2*pad; + //int py = iy + 2*pad; + + for (r = sy; r <= ey; ++r) { + for (c = sx; c <= ex; ++c) { + int accumulator = 0; + for (rd = wsy; rd < wey; ++rd) { + for (cd = wsx; cd < wex; ++cd) { + int iidx = (r+rd)*px + (c+cd); + BINARY_WORD pixel = input[iidx]; + //BINARY_WORD pixel = 0xFFFFFFFF; + //BINARY_WORD weight = 0xFFFFFFFF; + int widx = (rd + wy_2)*wx + (cd+wx_2); + BINARY_WORD weight = weights[widx]; + accumulator += __builtin_popcount(~(pixel ^ weight)); + } + } + + // Padded space + int oidx = r*px + c; + output[oidx] += (float) accumulator; + } + } + + //for (r = sy; r <= ey; ++r) { + // for (c = sx; c <= ex; ++c) { + // int accumulator = 0; + // for (rd = -wy_2; rd < wy_2; ++rd) { + // for (cd = -wx_2; cd < wx_2; ++cd) { + // int iidx = (r+rd)*px + (c+cd); + // BINARY_WORD pixel = input[iidx]; + // //BINARY_WORD pixel = 0xFFFFFFFF; + // //BINARY_WORD weight = 0xFFFFFFFF; + // int widx = (rd + wy_2)*wx + (cd+wx_2); + // BINARY_WORD weight = weights[widx]; + // accumulator += __builtin_popcount(~(pixel ^ weight)); + // } + // } + + // // Padded space + // int oidx = r*px + c; + // output[oidx] += (float) accumulator; + // } + //} + + //ai2_bin_conv_within_boundary(output, input, weights, ix, iy, wx, wy, stride); + //ai2_bin_conv_borders(output, input, weights, ix, iy, wx, wy, stride); +} + +void ai2_pointwise_mul_mm(float *output, const float *input, int N) { + int i = 0; + + while (i + 8 <= N) { + output[i+0] *= input[i+0]; + output[i+1] *= input[i+1]; + output[i+2] *= input[i+2]; + output[i+3] *= input[i+3]; + output[i+4] *= input[i+4]; + output[i+5] *= input[i+5]; + output[i+6] *= input[i+6]; + output[i+7] *= input[i+7]; + + i += 8; + } + + while (++i < N) // Finish iteration that's leftover (e.g., last batch not divisible by 8 exactly) + output[i] *= input[i]; +} + +/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors + * Pre-conditions: wx < ix, and wy < iy + */ +void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad) { + // Slower version +// for (int y = 0; y < iy; ++y) +// for (int x = 0; x < ix; x++) +// output[y*ix+x] *= input[(y % wy)*wx + (x % wx)]; + + // Stride prefetch optimized + for (int s = 0; s < wy; ++s) { // for each strip + const float *strip_ptr = &alpha[s*wx]; + for (int y = pad; y < pad + (iy / wy); ++y) { // + int stride = y*((ix+2*pad)*wy) + s*(ix+2*pad); + float *output_ptr = &output[stride]; + + for (int x = 0; x < ix; ++x) { + output_ptr[x] *= strip_ptr[x % wx]; + } + } + } +} + +void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input) { + if (new_input != NULL) { + if (layer->input != NULL) + free(layer->input); + layer->input = new_input; + + dim3 dim; + dim.x = layer->px; + dim.y = layer->py; + dim.z = layer->c; + + // Binarize input + ai2_flt_to_bin(layer->binary_input, layer->input, dim); + + float *new_beta = (float *) calloc (dim.x * dim.y, sizeof(float)); + ai2_setFltBeta(layer, new_beta); + + // layer->input is transposed to (z,x,y) already + ai2_calc_beta(layer->beta, layer->input, dim); + } +} + +void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input) { + if (new_input != NULL) { + if (layer->binary_input != NULL) + free(layer->binary_input); + layer->binary_input = new_input; + } +} + +void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights) { + if (new_weights != NULL) { + if (layer->weights != NULL) + free(layer->weights); + layer->weights = new_weights; + + dim3 dim; + dim.x = layer->wx; + dim.y = layer->wy; + dim.z = layer->c; + + ai2_flt_to_bin(layer->binary_weights, layer->weights, dim); + + // Calculate alpha + if (layer->alpha != NULL) + free(layer->alpha); + + layer->alpha = (float *) calloc (dim.x * dim.y, sizeof(float)); + // layer->weights is already transposed to (z,x,y) from ai2_flt_to_bin() + ai2_calc_alpha(layer->alpha, layer->weights, dim); + } +} + +void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights) { + if (new_weights != NULL) { + if (layer->binary_weights != NULL) + free(layer->binary_weights); + layer->binary_weights = new_weights; + } +} + +void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output) { + if (new_output != NULL) { + if (layer->output != NULL) + free(layer->output); + layer->output = new_output; + } +} + +void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output) { + if (new_output != NULL) { + if (layer->binary_output != NULL) + free(layer->binary_output); + layer->binary_output = new_output; + } +} + +void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha) { + if (new_alpha != NULL) { + if (layer->alpha != NULL) + free(layer->alpha); + layer->alpha = new_alpha; + } +} + +void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta) { + if (new_beta != NULL) { + if (layer->beta != NULL) + free(layer->beta); + layer->beta = new_beta; + } +} + +void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta) { + if (new_new_beta != NULL) { + if (layer->new_beta != NULL) + free(layer->new_beta); + layer->new_beta = new_new_beta; + } +} + +float* ai2_getFltOutput(ai2_bin_conv_layer *layer) { + //if (layer->output != NULL && layer->binary_output != NULL) { + if (layer->output != NULL) { + + // The idea here was that all intermediate states are stored in the binary output. + // Whenever the user needs the real-valued output, the conversion happens at this function call. + //dim3 dim; + //dim.x = layer->px; + //dim.y = layer->py; + //dim.z = layer->batch; + //ai2_bin_to_flt(layer->output, layer->binary_output, dim); + + return layer->output; + } + else + return NULL; +} + +void ai2_transpose3D(float *data, dim3 d) { + // Slow transpose for correctness + + // (x,y,z) becomes (z,x,y). Requires two transposes: + // (x,y,z) -> (x,z,y). + // (x,z,y) -> (z,x,y). + + // Intermediate buffer + float *new_data = (float *) calloc (d.x * d.y * d.z, sizeof(float)); + + // Transpose y and z axis. + // (x,y,z) -> (x,z,y); + for (int y = 0; y < d.y; ++y) { + for (int z = 0; z < d.z; ++z) { + for (int x = 0; x < d.x; ++x) { + new_data[y*d.x*d.z + z*d.x + x] = data[z*d.x*d.y + y*d.x + x]; + //new_data[z*d.y*d.x + y*d.x + x] = data[y*d.x*d.z + z*d.x + x]; + } + } + } + + // Transpose x and z axis. + // (x,z,y) -> (z,x,y) + for (int y = 0; y < d.y; ++y) { + for (int x = 0; x < d.x; ++x) { + for (int z = 0; z < d.z; ++z) { + data[y*d.z*d.x + x*d.z + z] = new_data[y*d.x*d.z + x + z*d.x]; + } + } + } + + free(new_data); +} + +int ai2_isFloatWhole(float f) { // TODO unit test + return (ceilf(f) == f) ? 1 : 0; +} + +/* @brief Initialize and create all memory arrays for this layer + * b - batches (number of filter batches) + * c - input channels + * ix - input width + * iy - input height + * wx - weight/filter width + * wy - weight/filter height + * s - stride between sliding windows + * pad - the amount of padding + */ +ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad) { + // http://cs231n.github.io/convolutional-networks/ + // See: spatial arrangement section for determining what the output size will be + float output_size = ((ix - wx + 2 * pad) / s) + 1; + if (ai2_isFloatWhole(output_size) == 0) { + fprintf(stderr, "ERROR! conv layer of (b,c,ix,iy,s,pad) = (%d, %d, %d, %d, %d, %d) will give " + " invalid output dimension: %fx%f\n", b, c, ix, iy, s, pad, output_size, output_size); + exit(1); + } + + // TODO: Support strided output + if (s != 1) { + fprintf(stderr, "ERROR! Only stride values of 1 is supported\n"); + exit(1); + } + + // padded input size + int px = (int) ix + 2*pad; + int py = (int) iy + 2*pad; + + ai2_bin_conv_layer l = {0}; // initialize all to 0 + l.input = (float *) calloc (c * px * py, sizeof(float)); // is padded + l.binary_input = (BINARY_WORD *) calloc (c * px * py / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD)); // is padded + + dim3 dim; + dim.x = px; + dim.y = py; + dim.z = c; + ai2_flt_to_bin(l.binary_input, l.input, dim); + + l.weights = (float *) calloc (b * c * wx * wy, sizeof(float)); + l.binary_weights = (BINARY_WORD *) calloc (b * c * wx * wy / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD)); + + l.output = (float *) calloc (c * px * py, sizeof(float)); // is padded + l.new_beta = (float *) calloc(px * py, sizeof(float)); // is padded + + l.batch = b; + l.c = c; + l.h = iy; + l.w = ix; + l.stride = s; + l.pad = pad; + l.px = px; + l.py = py; + l.wx = wx; + l.wy = wy; + + // The following parameters are uninitialized and should be set elsewhere: + // l.beta - padded + // l.alpha - not padded + + return l; +} + +void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer) { + if (layer->input) free (layer->input); + if (layer->binary_input) free(layer->binary_input); + if (layer->weights) free (layer->weights); + if (layer->binary_weights) free(layer->binary_weights); + if (layer->output) free(layer->output); + if (layer->binary_output) free (layer->binary_output); + if (layer->alpha) free(layer->alpha); + if (layer->beta) free(layer->beta); + if (layer->new_beta) free(layer->new_beta); +} + +void ai2_throw_error(char *str) { + fprintf(stderr, "ERROR: %s\n", str); + exit(1); +} + +void ai2_bin_forward(ai2_bin_conv_layer *l) { + if (l->input == NULL) ai2_throw_error("Input was not allocated and set in this layer"); + if (l->weights == NULL) ai2_throw_error("Weights was not allocated and set in this layer"); + if (l->output == NULL) ai2_throw_error("Output was not allocated and set in this layer"); + if (l->alpha == NULL) ai2_throw_error("Alpha was not allocated and set in this layer"); + if (l->beta == NULL) ai2_throw_error("Beta was not allocated and set in this layer"); + + if (l->c % 32 != 0) ai2_throw_error("Channel is not divisible by 32. Need to implement mask " + "before supporting arbitrary channel size. For now, " + "set the channel size to the nearest multiple of 32 " + "and ignore any ''extra'' channels unused."); + + l->c /= BITS_PER_BINARY_WORD; // For compensating with doing more work per word + + float *output = l->output; + float *alpha = l->alpha; + float *beta = l->beta; + int px = l->px; + int py = l->py; + BINARY_WORD *binary_weights = l->binary_weights; + + for (int z = 0; z < l->batch; ++z) { // for each filter map + BINARY_WORD *binary_input = l->binary_input; + for (int c = 0; c < l->c; ++c) { // for each input channel + ai2_bin_conv2D(output, binary_input, binary_weights, l->w, l->h, l->wx, l->wy, l->pad, l->stride); + binary_input += px*py; // increment with next 2D plane + binary_weights += l->wx*l->wy; // increment with next 2D plane + + ai2_pointwise_mul_mm(output, beta, px*py); + ai2_pointwise_mul_mm_2d(output, alpha, l->w, l->h, l->wx, l->wy, l->pad); + } + } +} + +// Deprecated +//double ai2_bin_conv_benchmark(ConvolutionArgs conv_args) { +// printf("Running Binary Convolution test!\n"); +// +// size_t ix, iy, iz, wx, wy, wz, L, stride; +// ix = conv_args.input.x; +// iy = conv_args.input.y; +// iz = conv_args.input.z; +// wx = conv_args.weights.x; +// wy = conv_args.weights.y; +// wz = conv_args.weights.z; +// L = BITS_PER_BINARY_WORD; +// stride = 1; +// +// printf("Input size (num elements, xyz): %zu %zu %zu\n", ix, iy, iz); +// printf("Weights size (num elements. xyz): %zu %zu %zu\n", wx, wy, wz); +// +// double sz_input_elements = ix * iy * iz; +// double sz_input_bytes = getSizeBytesBinaryArray(conv_args.input); +// double sz_weight_bytes = getSizeBytesBinaryArray(conv_args.weights); +// +// printf("Input Size (MB): %f\n", sz_input_bytes / (1 << 20)); +// printf("Weight Size (MB): %f\n", sz_weight_bytes / (1 << 20)); +// +// BINARY_WORD *binary_input = mallocBinaryVolume(conv_args.input); +// BINARY_WORD *binary_weights = mallocBinaryVolume(conv_args.weights); +// BINARY_WORD *b_input = binary_input; // alias +// BINARY_WORD *b_weight = binary_weights; // alias +// float *output = mallocFloatVolume(conv_args.output); +// float *output_ptr = output; +// float *beta = (float *) malloc(sizeof(float) * ix * iy); // we assume beta is given to us +// float *alpha = (float *) malloc(sizeof(float) * wx * wy); // we assume alpha is given to us +// float *new_output = mallocFloatVolume(conv_args.output); +// //float *new_output_ptr = new_output; +// float *new_beta = (float *) malloc(sizeof(float) * ix * iy); +// //float *new_beta_ptr = new_beta; +// +// // Scale number of computations because we're packing. +// // After this point, you should not have to reason about input dimensions for input and weights. +// iz /= BITS_PER_BINARY_WORD; +// wz /= BITS_PER_BINARY_WORD; +// +// // Calculate time taken by a request +// struct timeval start_time; +// gettimeofday(&start_time, NULL); +// +// // Preprocessing +// int pad = wx/2; +// +// for (int z = 0; z < iz; ++z) { // number of channels +// ai2_bin_conv2D(output_ptr, b_input, b_weight, ix, iy, wx, wy, pad, stride); +// b_input += ix*iy; // increment with next 2D plane +// b_weight += wx*wy; // increment with next 2D plane +// +// ai2_pointwise_mul_mm(output_ptr, beta, ix*iy); +// ai2_pointwise_mul_mm_2d(output_ptr, alpha, ix, iy, wx, wy, pad); +// } +// +// // copy to new array (need to wrap this around); TODO. +// struct timeval end_time; +// gettimeofday(&end_time, NULL); +// +// struct timeval diff_time; +// timersub(&end_time, &start_time, &diff_time); +// double time_conv_s = diff_time.tv_sec + diff_time.tv_usec * 1e-6; +// double time_conv_ms = time_conv_s * 1000.0; +// +// double model_ops = (3*ix*iy*wx*wy*wz/L) + 2*ix*iy + ix*iy*iz; +// double conv_ops_s = 1e-9 * model_ops / time_conv_s; +// double conv_bandwidth_gb_s = 1e-9 * sz_input_bytes / (time_conv_ms / 1000.0); +// double conv_bandwidth_gelement_s = 1e-9 * sz_input_elements / (time_conv_ms / 1000.0); +// +// printf("Execution Time (ms): %f\n", time_conv_ms); +// printf("Binary Convolution OPS/s (GOPS/s): %f\n", conv_ops_s); +// printf("Binary Convolution Bandwidth (GB/s): %f\n", conv_bandwidth_gb_s); +// printf("Binary Convolution Bandwidth (GElements/s): %f\n\n", conv_bandwidth_gelement_s); +// +// free(binary_input); +// free(binary_weights); +// free(output); +// free(beta); +// free(alpha); +// free(new_output); +// free(new_beta); +// +// return time_conv_ms; +//} + +// double ai2_bin_conv_benchmark(ConvolutionArgs conv_args); + +//void benchmark() { +// int ix, iy, iz, wx, wy, wz; +// iz = (1 << 9) * BITS_PER_BINARY_WORD; +// ix = 227; // x == y for square face +// iy = 227; +// wx = 3; // x == y for a square face +// wy = 3; +// wz = iz; +// +// int runs = 1; +// double accum_binary = 0; +// double accum_real = 0; +// ConvolutionArgs conv_args = initArgs(ix, iy, iz, wx, wy, wz); +// for (int i = 0; i < runs; ++i) { +// double t_binary_convolve = ai2_bin_conv_benchmark(conv_args); +// double t_real_convolve = run_convolve2D_real(conv_args); +// printf("t binary = %lf\n", t_binary_convolve); +// printf("t real = %lf\n", t_real_convolve); +// accum_binary += t_binary_convolve; +// accum_real += t_real_convolve; +// } +// +// accum_binary /= runs; +// accum_real /= runs; +// printf("Average convolution pass binary (ms): %lf\n", accum_binary); +// printf("Average convolution pass flt (ms): %lf\n", accum_real); +// printf("Speedup (Binary over Real): %lfx\n", accum_real / accum_binary); +// exit(1); +//} diff --git a/src/binary_convolution.h b/src/binary_convolution.h new file mode 100644 index 00000000..602677e8 --- /dev/null +++ b/src/binary_convolution.h @@ -0,0 +1,218 @@ +#ifndef AI2_BINARY_CONVOLUTION_H +#define AI2_BINARY_CONVOLUTION_H + +/** @file binary_convolution.h + * @brief Routines related for approximating convolutions using binary operations + * + * @author Carlo C. del Mundo (carlom) + * @date 05/23/2016 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "common.h" + +typedef struct { + int batch; // number of filter batches + int c; // channels, z + int h; // height, y + int w; // width, x + int stride; + int pad; + + int px; // padded x (use this for striding in padded input and output arrays) + int py; // padded y (use this for striding in padded input and output arrays) + int wx; + int wy; + + float *input; // input values + BINARY_WORD *binary_input; + + float *weights; // weight or filter values + BINARY_WORD *binary_weights; + + float *output; // output values + BINARY_WORD *binary_output; + + float *alpha; // we assume alpha is calculated at the beginning of initialization + float *beta; // we assume beta is given to us + float *new_beta; // we calculate the new beta for the next layer + + struct ai2_bin_conv_layer *next; +} ai2_bin_conv_layer; + +/** @brief Performs a binary convolution using XNOR and POPCOUNT between input and weights + * + * @param output A 2D real-valued plane to store the outputs + * @param input A 2D binary-valued plane that holds the inputs + * @param weights A 2D binary-valued plane that holds the weights + * @param ix the input's x dimension + * @param iy the input's y dimensions + * @param wx the weight's x dimension + * @param wy the weight's y dimension + * @param pad the amount of padding applied to input. (ix+2*pad is the x dimension of the input + * @param stride NOP. TODO: implement stride. the stride between sliding windows + * @return the count of all overlapping set bits between the two volumes. + */ +void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride); + +/** @brief Performs a binary dot product (XNOR and POPCOUNT) for two equal sized volumes. + * + * @param a A 3D binary tensor + * @param b A 3D binary tensor + * @param vdim the dimensionality of the data. Note: we pack 32 elements in the Z element. + * @return the count of all overlapping set bits between the two volumes. + */ +int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim); + +/** @brief Calculates the alpha plane given an alpha volume. + * + * Each point in the yz alpha plane + * is the average sum of the absolute value of all elements in the z-direction. + * + * Pre-conditions: + * alpha_volume is an array of size x*y*z. + * alpha_plane is an array of size x*y. + * alpha_volume (x,y,z) is transposed to (z,x,y). + * + * @param alpha_plane The 2D real-valued output plane + * @param alpha_volume The 3D real-valued output volume + * @param vdim the dimensionality of alpha_volume. + */ +void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim); + +/** @brief Wrapper function for generating the beta scaling factor */ +void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim); + +/** @brief Set the bit in a binary word */ +void ai2_bitset(BINARY_WORD *bword, unsigned int position); + +/** @brief Checks that the bit is set in a binary word */ +int ai2_is_set(BINARY_WORD bword, unsigned int position) ; + +/** @brief Converts a 3D float tensor into a 3D binary tensor. + * + * The value of the ith element in the binary tensor is the sign + * of the ith element in the floating tensor. + * + * @param binary_vol the binary tensor + * @param real_vol the real tensor + * @param vdim the size of the 3D tensor + */ +void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 vdim) ; + +/** @brief Converts a 3D binary tensor into a 3D float tensor. + * + * The ith float element will be '1' if the ith binary element is '1'. + * Otherwise, the float element will be '-1'. + * + * @param real_vol the output real tensor + * @param binary_vol the input binary tensor + * @param vdim the dimension of both binary_vol and real_vol + */ +void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 vdim); + +/** @brief Performs a pointwise matrix multication between two 2D tensors + * @param output A 2D real-valued plane to store the outputs + * @param input A 2D binary-valued plane that holds the inputs + * @param N the number of elements between the arrays + */ +void ai2_pointwise_mul_mm(float *output, const float *input, int N); + +/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors + * + * Pre-conditions: wx < ix, and wy < iy + * + * @param output A 2D real-valued plane of size ix, iy + * @param alpha A 2D binary-valued plane of size wx, wy + * @param ix the output's x dimension + * @param iy the output's y dimensions + * @param wx the alpha's x dimension + * @param wy the alpha's y dimension + * @param pad how many cells are padded, adds 2*pad to the borders of the image + */ +void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad); + +// -------------------------------------- +// SETTER FUNCTIONS +// -------------------------------------- +/** @brief Safe function to set the float input of a conv_layer + */ +void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input); + +/** @brief Safe function to set the binary input of a conv_layer + */ +void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input); + +/** @brief Safe function to set the binary weights of a conv_layer + */ +void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights); + +/** @brief Safe function to set the binary weights of a conv_layer + */ +void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights); + +/** @brief Safe function to set the binary outputs of a conv_layer + */ +void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output); + +/** @brief Safe function to set the binary outputs of a conv_layer + */ +void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output); + +/** @brief Safe function to set the alpha of a conv_layer + */ +void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha); + +/** @brief Safe function to set the beta of a conv_layer + */ +void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta); + +/** @brief Safe function to set the new_beta of a conv_layer + */ +void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta); + +// -------------------------------------- +// GETTER FUNCTIONS +// -------------------------------------- +/** @brief Safe function to get the float outputs of a conv_layer + */ +float * ai2_getFltOutput(ai2_bin_conv_layer *layer); + +/** @brief 3D tranpose from (x,y,z) to (z,y,x) + * @return a new pointer with the transposed matrix + */ +void ai2_transpose3D(float *data, dim3 d); + +/** @brief Checks if a float is a whole number (e.g., an int) + */ +int ai2_isFloatWhole(float f); + +/* @brief Allocates all memory objects in an ai2_bin_conv_layer + * b - batches (number of filter batches) + * c - input channels + * ix - input width + * iy - input height + * wx - weight/filter width + * wy - weight/filter height + * s - stride between sliding windows + * pad - the amount of padding + */ +ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad); + +/* @brief Safe deallocation of all memory objects in an ai2_bin_conv_layer + */ +void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer); + +/* @brief Given real-valued filter data and a conv layer, performs a forward pass + */ +void ai2_bin_forward(ai2_bin_conv_layer *layer); + +#endif diff --git a/src/common.c b/src/common.c new file mode 100644 index 00000000..9d59ee8a --- /dev/null +++ b/src/common.c @@ -0,0 +1,81 @@ +#include "common.h" + +// Returns the time in ms +double getElapsedTime(Timer *timer) { + // Calculate time it took in seconds + double accum_ms = ( timer->requestEnd.tv_sec - timer->requestStart.tv_sec ) + + ( timer->requestEnd.tv_nsec - timer->requestStart.tv_nsec ) + / 1e6; + return accum_ms; +} + +void start_timer(Timer *timer) { + clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestStart)); +} + +void stop_timer(Timer *timer) { + clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestEnd)); +} + + +BINARY_WORD * mallocBinaryVolume(dim3 vol) { + return (BINARY_WORD *) malloc (vol.x * vol.y * vol.z / BITS_PER_BINARY_WORD * sizeof(BINARY_WORD)); +} + +float * mallocFloatVolume(dim3 vol) { + return (float *) malloc (vol.x * vol.y * vol.z * sizeof(float)); +} + +// Returns the size (in bytes) of a binary array with dimensions stored in conv_args +double getSizeBytesBinaryArray(dim3 conv_args) { + return conv_args.x * conv_args.y * conv_args.z * sizeof(BINARY_WORD) / (BITS_PER_BINARY_WORD); +} + + +ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz) { + ConvolutionArgs conv_args; + // Input Volume + conv_args.input.x = ix; // x == y for a square face + conv_args.input.y = iy; + conv_args.input.z = iz; + conv_args.weights.x = wx; // x == y for square face + conv_args.weights.y = wy; + conv_args.weights.z = wz; + + // + // Intermediate Volumes + conv_args.alpha_plane.x = conv_args.weights.x; + conv_args.alpha_plane.y = conv_args.weights.y; + conv_args.alpha_plane.z = 1; + + conv_args.beta_plane.x = 1; + conv_args.beta_plane.y = conv_args.input.y; + conv_args.beta_plane.z = conv_args.input.z; + + conv_args.gamma_plane.x = conv_args.input.x * conv_args.weights.x; + conv_args.gamma_plane.y = conv_args.input.y * conv_args.weights.y; + conv_args.gamma_plane.z = 1; + + conv_args.zeta_plane.x = conv_args.gamma_plane.x; + conv_args.zeta_plane.y = conv_args.gamma_plane.y; + conv_args.zeta_plane.z = 1; + + // Output Volume + conv_args.output.x = conv_args.input.x; + conv_args.output.y = conv_args.input.y; + conv_args.output.z = 1; // Output should be a 2D plane + + // Verify dimensions + //assert(conv_args.weights.x % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints +// assert(conv_args.weights.y % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints + assert(conv_args.weights.z % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints + //assert(conv_args.input.x % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints +// assert(conv_args.input.y % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints + assert(conv_args.input.z % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints + assert(conv_args.weights.x <= conv_args.input.x); + assert(conv_args.weights.y <= conv_args.input.y); + assert(conv_args.weights.z <= conv_args.input.z); + // + + return conv_args; +} diff --git a/src/common.h b/src/common.h new file mode 100644 index 00000000..bad428d1 --- /dev/null +++ b/src/common.h @@ -0,0 +1,50 @@ +#ifndef AI2_COMMON_H +#define AI2_COMMON_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include +#include + +typedef uint32_t BINARY_WORD; +#define BITS_PER_BINARY_WORD (sizeof(BINARY_WORD) * CHAR_BIT) + +typedef struct{ + struct timespec requestStart; + struct timespec requestEnd; +} Timer; + +typedef struct { + size_t x; + size_t y; + size_t z; +} dim3; + +typedef struct { + dim3 weights; + dim3 input; + dim3 output; + dim3 alpha_plane; + dim3 beta_plane; + dim3 gamma_plane; + dim3 zeta_plane; +} ConvolutionArgs; + +// Timer stuff +double getElapsedTime(Timer *timer); // Returns the time in ms +void start_timer(Timer *timer); +void stop_timer(Timer *timer); + +BINARY_WORD * mallocBinaryVolume(dim3 vol); +float * mallocFloatVolume(dim3 vol); +ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz); +double getSizeBytesBinaryArray(dim3 conv_args); + +#endif diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index c377802f..f0c312c9 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -8,6 +8,10 @@ #include #include +#ifndef AI2 +#define AI2 0 +#endif + void swap_binary(convolutional_layer *l) { float *swap = l->filters; @@ -21,24 +25,6 @@ void swap_binary(convolutional_layer *l) #endif } -void binarize_filters2(float *filters, int n, int size, char *binary, float *scales) -{ - int i, k, f; - for(f = 0; f < n; ++f){ - float mean = 0; - for(i = 0; i < size; ++i){ - mean += fabs(filters[f*size + i]); - } - mean = mean / size; - scales[f] = mean; - for(i = 0; i < size/8; ++i){ - binary[f*size + i] = (filters[f*size + i] > 0) ? 1 : 0; - for(k = 0; k < 8; ++k){ - } - } - } -} - void binarize_filters(float *filters, int n, int size, float *binary) { int i, f; @@ -54,6 +40,21 @@ void binarize_filters(float *filters, int n, int size, float *binary) } } +void binarize_input(float *input, int n, int size, float *binary) +{ + int i, s; + for(s = 0; s < size; ++s){ + float mean = 0; + for(i = 0; i < n; ++i){ + mean += fabs(input[i*size + s]); + } + mean = mean / n; + for(i = 0; i < n; ++i){ + binary[i*size + s] = (input[i*size + s] > 0) ? mean : -mean; + } + } +} + int convolutional_out_height(convolutional_layer l) { int h = l.h; @@ -89,7 +90,7 @@ image get_convolutional_delta(convolutional_layer l) } size_t get_workspace_size(layer l){ - #ifdef CUDNN +#ifdef CUDNN size_t most = 0; size_t s = 0; cudnnGetConvolutionForwardWorkspaceSize(cudnn_handle(), @@ -117,9 +118,9 @@ size_t get_workspace_size(layer l){ &s); if (s > most) most = s; return most; - #else +#else return (size_t)l.out_h*l.out_w*l.size*l.size*l.c*sizeof(float); - #endif +#endif } convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize, int binary, int xnor) @@ -133,6 +134,7 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.c = c; l.n = n; l.binary = binary; + l.xnor = xnor; l.batch = batch; l.stride = stride; l.size = size; @@ -164,6 +166,10 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.cfilters = calloc(c*n*size*size, sizeof(char)); l.scales = calloc(n, sizeof(float)); } + if(xnor){ + l.binary_filters = calloc(c*n*size*size, sizeof(float)); + l.binary_input = calloc(l.inputs*l.batch, sizeof(float)); + } if(batch_normalize){ l.scales = calloc(n, sizeof(float)); @@ -199,7 +205,6 @@ convolutional_layer make_convolutional_layer(int batch, int h, int w, int c, int l.binary_filters_gpu = cuda_make_array(l.filters, c*n*size*size); l.binary_input_gpu = cuda_make_array(0, l.inputs*l.batch); } - l.xnor = xnor; if(batch_normalize){ l.mean_gpu = cuda_make_array(l.mean, n); @@ -325,7 +330,7 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n); l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n); - #ifdef CUDNN +#ifdef CUDNN cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w); cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w); cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size); @@ -359,7 +364,7 @@ void resize_convolutional_layer(convolutional_layer *l, int w, int h) CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST, 0, &l->bf_algo); - #endif +#endif #endif l->workspace_size = get_workspace_size(*l); } @@ -404,7 +409,9 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) int out_w = convolutional_out_width(l); int i; + fill_cpu(l.outputs*l.batch, 0, l.output, 1); + /* if(l.binary){ binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters); @@ -413,44 +420,59 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) } */ -/* - if(l.binary){ - int m = l.n; - int k = l.size*l.size*l.c; - int n = out_h*out_w; + /* + if(l.binary){ + int m = l.n; + int k = l.size*l.size*l.c; + int n = out_h*out_w; - char *a = l.cfilters; + char *a = l.cfilters; + float *b = state.workspace; + float *c = l.output; + + for(i = 0; i < l.batch; ++i){ + im2col_cpu(state.input, l.c, l.h, l.w, + l.size, l.stride, l.pad, b); + gemm_bin(m,n,k,1,a,k,b,n,c,n); + c += n*m; + state.input += l.c*l.h*l.w; + } + scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w); + add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); + activate_array(l.output, m*n*l.batch, l.activation); + return; + } + */ + + if(l.xnor && (l.c%32 != 0 || !AI2)){ + binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters); + swap_binary(&l); + for(i = 0; i < l.batch; ++i){ + binarize_input(state.input + i*l.inputs, l.c, l.h*l.w, l.binary_input + i*l.inputs); + } + state.input = l.binary_input; + } + + int m = l.n; + int k = l.size*l.size*l.c; + int n = out_h*out_w; + + if (l.xnor && l.c%32 == 0 && AI2) { + forward_xnor_layer(l, state); + printf("xnor\n"); + } else { + + float *a = l.filters; float *b = state.workspace; float *c = l.output; for(i = 0; i < l.batch; ++i){ im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b); - gemm_bin(m,n,k,1,a,k,b,n,c,n); + gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); c += n*m; state.input += l.c*l.h*l.w; } - scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w); - add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); - activate_array(l.output, m*n*l.batch, l.activation); - return; - } - */ - - int m = l.n; - int k = l.size*l.size*l.c; - int n = out_h*out_w; - - float *a = l.filters; - float *b = state.workspace; - float *c = l.output; - - for(i = 0; i < l.batch; ++i){ - im2col_cpu(state.input, l.c, l.h, l.w, - l.size, l.stride, l.pad, b); - gemm(0,0,m,n,k,1,a,k,b,n,1,c,n); - c += n*m; - state.input += l.c*l.h*l.w; } if(l.batch_normalize){ @@ -459,6 +481,7 @@ void forward_convolutional_layer(convolutional_layer l, network_state state) add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w); activate_array(l.output, m*n*l.batch, l.activation); + if(l.binary || l.xnor) swap_binary(&l); } void backward_convolutional_layer(convolutional_layer l, network_state state) diff --git a/src/layer.h b/src/layer.h index d2250a6d..0c457f7e 100644 --- a/src/layer.h +++ b/src/layer.h @@ -167,6 +167,8 @@ struct layer{ float *r_cpu; float *h_cpu; + float *binary_input; + size_t workspace_size; #ifdef GPU diff --git a/src/parser.c b/src/parser.c index d12b5c18..71f54cc0 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1021,7 +1021,6 @@ void load_convolutional_weights_binary(layer l, FILE *fp) } } } - binarize_filters2(l.filters, l.n, l.c*l.size*l.size, l.cfilters, l.scales); #ifdef GPU if(gpu_index >= 0){ push_convolutional_layer(l); @@ -1046,7 +1045,7 @@ void load_convolutional_weights(layer l, FILE *fp) if (l.flipped) { transpose_matrix(l.filters, l.c*l.size*l.size, l.n); } - if (l.binary) binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.filters); + //if (l.binary) binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.filters); #ifdef GPU if(gpu_index >= 0){ push_convolutional_layer(l); diff --git a/src/xnor_layer.c b/src/xnor_layer.c new file mode 100644 index 00000000..6c5e9b8a --- /dev/null +++ b/src/xnor_layer.c @@ -0,0 +1,86 @@ +#include "xnor_layer.h" +#include "binary_convolution.h" +#include "convolutional_layer.h" + +layer make_xnor_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalize) +{ + int i; + layer l = {0}; + l.type = XNOR; + + l.h = h; + l.w = w; + l.c = c; + l.n = n; + l.batch = batch; + l.stride = stride; + l.size = size; + l.pad = pad; + l.batch_normalize = batch_normalize; + + l.filters = calloc(c*n*size*size, sizeof(float)); + l.biases = calloc(n, sizeof(float)); + + int out_h = convolutional_out_height(l); + int out_w = convolutional_out_width(l); + l.out_h = out_h; + l.out_w = out_w; + l.out_c = n; + l.outputs = l.out_h * l.out_w * l.out_c; + l.inputs = l.w * l.h * l.c; + + l.output = calloc(l.batch*out_h * out_w * n, sizeof(float)); + + if(batch_normalize){ + l.scales = calloc(n, sizeof(float)); + for(i = 0; i < n; ++i){ + l.scales[i] = 1; + } + + l.mean = calloc(n, sizeof(float)); + l.variance = calloc(n, sizeof(float)); + + l.rolling_mean = calloc(n, sizeof(float)); + l.rolling_variance = calloc(n, sizeof(float)); + } + + l.activation = activation; + + fprintf(stderr, "XNOR Layer: %d x %d x %d image, %d filters -> %d x %d x %d image\n", h,w,c,n, out_h, out_w, n); + + return l; +} + +void forward_xnor_layer(const layer l, network_state state) +{ + int b = l.n; + int c = l.c; + int ix = l.w; + int iy = l.h; + int wx = l.size; + int wy = l.size; + int s = l.stride; + int pad = l.pad * (l.size/2); + + // MANDATORY: Make the binary layer + ai2_bin_conv_layer al = ai2_make_bin_conv_layer(b, c, ix, iy, wx, wy, s, pad); + + // OPTIONAL: You need to set the real-valued input like: + ai2_setFltInput(&al, state.input); + // The above function will automatically binarize the input for the layer (channel wise). + // If commented: using the default 0-valued input. + + ai2_setFltWeights(&al, l.filters); + // The above function will automatically binarize the input for the layer (channel wise). + // If commented: using the default 0-valued weights. + + // MANDATORY: Call forward + ai2_bin_forward(&al); + + // OPTIONAL: Inspect outputs + float *output = ai2_getFltOutput(&al); // output is of size l.px * l.py where px and py are the padded outputs + + memcpy(l.output, output, l.outputs*sizeof(float)); + // MANDATORY: Free layer + ai2_free_bin_conv_layer(&al); +} diff --git a/src/xnor_layer.h b/src/xnor_layer.h new file mode 100644 index 00000000..f1c5b687 --- /dev/null +++ b/src/xnor_layer.h @@ -0,0 +1,11 @@ +#ifndef XNOR_LAYER_H +#define XNOR_LAYER_H + +#include "layer.h" +#include "network.h" + +layer make_xnor_layer(int batch, int h, int w, int c, int n, int size, int stride, int pad, ACTIVATION activation, int batch_normalization); +void forward_xnor_layer(const layer l, network_state state); + +#endif + diff --git a/src/yolo.c b/src/yolo.c index 9c3999ea..2b999356 100644 --- a/src/yolo.c +++ b/src/yolo.c @@ -346,8 +346,8 @@ void test_yolo(char *cfgfile, char *weightfile, char *filename, float thresh) if (nms) do_nms_sort(boxes, probs, l.side*l.side*l.n, l.classes, nms); //draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, voc_labels, 20); draw_detections(im, l.side*l.side*l.n, thresh, boxes, probs, voc_names, voc_labels, 20); - show_image(im, "predictions"); save_image(im, "predictions"); + show_image(im, "predictions"); show_image(sized, "resized"); free_image(im);