diff --git a/Makefile b/Makefile index f3e4b792..28a0d17a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ -GPU=1 -CUDNN=1 -OPENCV=1 +GPU=0 +CUDNN=0 +OPENCV=0 DEBUG=0 ARCH= --gpu-architecture=compute_52 --gpu-code=compute_52 diff --git a/ai2.mk b/ai2.mk deleted file mode 100644 index 57edc893..00000000 --- a/ai2.mk +++ /dev/null @@ -1,79 +0,0 @@ -GPU=0 -CUDNN=0 -OPENCV=0 -DEBUG=1 -AI2=1 - -ARCH= --gpu-architecture=compute_52 --gpu-code=compute_52 - -VPATH=./src/ -EXEC=darknet -OBJDIR=./obj/ - -CC=gcc -std=gnu11 -NVCC=nvcc -OPTS=-Ofast -LDFLAGS= -lm -pthread -COMMON= -CFLAGS=-Wall -Wfatal-errors - -ifeq ($(DEBUG), 1) -OPTS=-O0 -g -endif - -CFLAGS+=$(OPTS) - -ifeq ($(OPENCV), 1) -COMMON+= -DOPENCV -CFLAGS+= -DOPENCV -LDFLAGS+= `pkg-config --libs opencv` -COMMON+= `pkg-config --cflags opencv` -endif - -ifeq ($(AI2), 1) -COMMON+= -DAI2 -CFLAGS+= -DAI2 -endif - -ifeq ($(GPU), 1) -COMMON+= -DGPU -I/usr/local/cuda/include/ -CFLAGS+= -DGPU -LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand -endif - -ifeq ($(CUDNN), 1) -COMMON+= -DCUDNN -CFLAGS+= -DCUDNN -LDFLAGS+= -lcudnn -endif - -OBJ=gemm.o utils.o cuda.o deconvolutional_layer.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o imagenet.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o coco_demo.o tag.o cifar.o yolo_demo.o go.o batchnorm_layer.o art.o xnor_layer.o common.o binary_convolution.o -ifeq ($(GPU), 1) -LDFLAGS+= -lstdc++ -OBJ+=convolutional_kernels.o deconvolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o avgpool_layer_kernels.o -endif - -OBJS = $(addprefix $(OBJDIR), $(OBJ)) -DEPS = $(wildcard src/*.h) Makefile - -all: obj results $(EXEC) - -$(EXEC): $(OBJS) - $(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) - -$(OBJDIR)%.o: %.c $(DEPS) - $(CC) $(COMMON) $(CFLAGS) -c $< -o $@ - -$(OBJDIR)%.o: %.cu $(DEPS) - $(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@ - -obj: - mkdir -p obj -results: - mkdir -p results - -.PHONY: clean - -clean: - rm -rf $(OBJS) $(EXEC) - diff --git a/src/binary_convolution.c b/src/binary_convolution.c deleted file mode 100644 index dfededa6..00000000 --- a/src/binary_convolution.c +++ /dev/null @@ -1,598 +0,0 @@ -#include "binary_convolution.h" - -int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim) { // TODO unroll - int accumulator = 0; - for (int z = 0; z < vdim.z / BITS_PER_BINARY_WORD; z++) { - for (int y = 0; y < vdim.y; y++) { - for (int x = 0; x < vdim.x; x++) { - int idx = z*vdim.y*vdim.x + y*vdim.x + x; - accumulator += __builtin_popcount(~(a[idx] ^ b[idx])); // count the XNOR of the two bit vectors - } - } - } - - return accumulator; -} - -/** - * Pre-conditions: - * alpha_volume is an array of size x*y*z. - * alpha_plane is an array of size x*y. - * alpha_volume (x,y,z) is transposed to (z,x,y). - */ -void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim) { - for (int y = 0; y < vdim.y; ++y) { - for (int x = 0; x < vdim.x; ++x) { - int out = y * vdim.x + x; - double accum = 0.0; - for (int z = 0; z < vdim.z; ++z) { - accum += alpha_volume[out * vdim.z + z]; - } - - alpha_plane[out] = accum / vdim.z; - } - } -} - -/** @brief Wrapper function for generating the beta scaling factor */ -void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim) { - ai2_calc_alpha(beta_plane, beta_volume, vdim); -} - -/** @brief Set the bit in a binary word */ -void ai2_bitset(BINARY_WORD *bword, unsigned int position) { - BINARY_WORD mask = (1 << position); - *bword = *bword | mask; -} - -/** @brief Checks that the bit is set in a binary word */ -int ai2_is_set(BINARY_WORD bword, unsigned int position) { - unsigned int position_complement = (BITS_PER_BINARY_WORD - 1) - position; // number of leading bits before the bit position of interest - bword = (bword << position_complement); // zero out leading bits - bword = (bword >> (BITS_PER_BINARY_WORD - 1)); // shift bit position of interest to the 0th position - return (bword & 0x1); // test if bit position of interest is set -} - -void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 dim) { - ai2_transpose3D(real_vol, dim); // (x,y,z) -> (z,x,y) - - int sz = dim.x * dim.y * dim.z; - for (int i = 0; i < sz; i += BITS_PER_BINARY_WORD) { - BINARY_WORD tmp = 0x00000000; - for (int x = 0; x < BITS_PER_BINARY_WORD; ++x) { - int waddr = x + i; - if (signbit(real_vol[waddr]) == 0) - ai2_bitset(&tmp, (BITS_PER_BINARY_WORD - 1) - x); - } - binary_vol[i / BITS_PER_BINARY_WORD] = tmp; - } -} - -void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 dim) { // TODO unit tests - for (int z = 0; z < dim.z; z++) { - for (int y = 0; y < dim.y; y++) { - for (int x = 0; x < dim.x / BITS_PER_BINARY_WORD; x++) { // TODO boundary checks, for uneven input - BINARY_WORD word = binary_vol[z*dim.y*dim.x + y*dim.x + x]; - for (int t = 0; t < BITS_PER_BINARY_WORD; ++t) { - int oidx = z*dim.y*dim.x + y*dim.x + x * BITS_PER_BINARY_WORD + t; - if (ai2_is_set(word, t)) - real_vol[oidx] = 1.f; - else - real_vol[oidx] = -1.f; - } - } - } - } - - // Transpose channels back to output - ai2_transpose3D(real_vol, dim); // (z,y,x) -> (x,y,z) -} - -/* @brief: input is padded. - */ -void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride) { - - int r, rd, c, cd; - int wx_2 = wx / 2; - int wy_2 = wy / 2; - - // Indexing for output pixels. x = [wx_2, ix + wx_2 - 1], y = [wy_2, iy + wy_2 - 1] - int sx = pad; // start x - int ex = ix + pad - 1; // end x - int sy = pad; // start y - int ey = iy + pad - 1; // end y - - // Indexing for weights - int wsx, wex, wsy, wey; - if (wx % 2 == 1) { // odd weights - wsx = -wx_2; wex = wx_2 + 1; - wsy = -wy_2; wey = wy_2 + 1; - } - else { - wsx = -wx_2; wex = wx_2; - wsy = -wy_2; wey = wy_2; - } - - int px = ix + 2*pad; - //int py = iy + 2*pad; - - for (r = sy; r <= ey; ++r) { - for (c = sx; c <= ex; ++c) { - int accumulator = 0; - for (rd = wsy; rd < wey; ++rd) { - for (cd = wsx; cd < wex; ++cd) { - int iidx = (r+rd)*px + (c+cd); - BINARY_WORD pixel = input[iidx]; - //BINARY_WORD pixel = 0xFFFFFFFF; - //BINARY_WORD weight = 0xFFFFFFFF; - int widx = (rd + wy_2)*wx + (cd+wx_2); - BINARY_WORD weight = weights[widx]; - accumulator += __builtin_popcount(~(pixel ^ weight)); - } - } - - // Padded space - int oidx = r*px + c; - output[oidx] += (float) accumulator; - } - } - - //for (r = sy; r <= ey; ++r) { - // for (c = sx; c <= ex; ++c) { - // int accumulator = 0; - // for (rd = -wy_2; rd < wy_2; ++rd) { - // for (cd = -wx_2; cd < wx_2; ++cd) { - // int iidx = (r+rd)*px + (c+cd); - // BINARY_WORD pixel = input[iidx]; - // //BINARY_WORD pixel = 0xFFFFFFFF; - // //BINARY_WORD weight = 0xFFFFFFFF; - // int widx = (rd + wy_2)*wx + (cd+wx_2); - // BINARY_WORD weight = weights[widx]; - // accumulator += __builtin_popcount(~(pixel ^ weight)); - // } - // } - - // // Padded space - // int oidx = r*px + c; - // output[oidx] += (float) accumulator; - // } - //} - - //ai2_bin_conv_within_boundary(output, input, weights, ix, iy, wx, wy, stride); - //ai2_bin_conv_borders(output, input, weights, ix, iy, wx, wy, stride); -} - -void ai2_pointwise_mul_mm(float *output, const float *input, int N) { - int i = 0; - - while (i + 8 <= N) { - output[i+0] *= input[i+0]; - output[i+1] *= input[i+1]; - output[i+2] *= input[i+2]; - output[i+3] *= input[i+3]; - output[i+4] *= input[i+4]; - output[i+5] *= input[i+5]; - output[i+6] *= input[i+6]; - output[i+7] *= input[i+7]; - - i += 8; - } - - while (++i < N) // Finish iteration that's leftover (e.g., last batch not divisible by 8 exactly) - output[i] *= input[i]; -} - -/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors - * Pre-conditions: wx < ix, and wy < iy - */ -void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad) { - // Slower version -// for (int y = 0; y < iy; ++y) -// for (int x = 0; x < ix; x++) -// output[y*ix+x] *= input[(y % wy)*wx + (x % wx)]; - - // Stride prefetch optimized - for (int s = 0; s < wy; ++s) { // for each strip - const float *strip_ptr = &alpha[s*wx]; - for (int y = pad; y < pad + (iy / wy); ++y) { // - int stride = y*((ix+2*pad)*wy) + s*(ix+2*pad); - float *output_ptr = &output[stride]; - - for (int x = 0; x < ix; ++x) { - output_ptr[x] *= strip_ptr[x % wx]; - } - } - } -} - -void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input) { - if (new_input != NULL) { - if (layer->input != NULL) - free(layer->input); - layer->input = new_input; - - dim3 dim; - dim.x = layer->px; - dim.y = layer->py; - dim.z = layer->c; - - // Binarize input - ai2_flt_to_bin(layer->binary_input, layer->input, dim); - - float *new_beta = (float *) calloc (dim.x * dim.y, sizeof(float)); - ai2_setFltBeta(layer, new_beta); - - // layer->input is transposed to (z,x,y) already - ai2_calc_beta(layer->beta, layer->input, dim); - } -} - -void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input) { - if (new_input != NULL) { - if (layer->binary_input != NULL) - free(layer->binary_input); - layer->binary_input = new_input; - } -} - -void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights) { - if (new_weights != NULL) { - if (layer->weights != NULL) - free(layer->weights); - layer->weights = new_weights; - - dim3 dim; - dim.x = layer->wx; - dim.y = layer->wy; - dim.z = layer->c; - - ai2_flt_to_bin(layer->binary_weights, layer->weights, dim); - - // Calculate alpha - if (layer->alpha != NULL) - free(layer->alpha); - - layer->alpha = (float *) calloc (dim.x * dim.y, sizeof(float)); - // layer->weights is already transposed to (z,x,y) from ai2_flt_to_bin() - ai2_calc_alpha(layer->alpha, layer->weights, dim); - } -} - -void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights) { - if (new_weights != NULL) { - if (layer->binary_weights != NULL) - free(layer->binary_weights); - layer->binary_weights = new_weights; - } -} - -void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output) { - if (new_output != NULL) { - if (layer->output != NULL) - free(layer->output); - layer->output = new_output; - } -} - -void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output) { - if (new_output != NULL) { - if (layer->binary_output != NULL) - free(layer->binary_output); - layer->binary_output = new_output; - } -} - -void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha) { - if (new_alpha != NULL) { - if (layer->alpha != NULL) - free(layer->alpha); - layer->alpha = new_alpha; - } -} - -void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta) { - if (new_beta != NULL) { - if (layer->beta != NULL) - free(layer->beta); - layer->beta = new_beta; - } -} - -void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta) { - if (new_new_beta != NULL) { - if (layer->new_beta != NULL) - free(layer->new_beta); - layer->new_beta = new_new_beta; - } -} - -float* ai2_getFltOutput(ai2_bin_conv_layer *layer) { - //if (layer->output != NULL && layer->binary_output != NULL) { - if (layer->output != NULL) { - - // The idea here was that all intermediate states are stored in the binary output. - // Whenever the user needs the real-valued output, the conversion happens at this function call. - //dim3 dim; - //dim.x = layer->px; - //dim.y = layer->py; - //dim.z = layer->batch; - //ai2_bin_to_flt(layer->output, layer->binary_output, dim); - - return layer->output; - } - else - return NULL; -} - -void ai2_transpose3D(float *data, dim3 d) { - // Slow transpose for correctness - - // (x,y,z) becomes (z,x,y). Requires two transposes: - // (x,y,z) -> (x,z,y). - // (x,z,y) -> (z,x,y). - - // Intermediate buffer - float *new_data = (float *) calloc (d.x * d.y * d.z, sizeof(float)); - - // Transpose y and z axis. - // (x,y,z) -> (x,z,y); - for (int y = 0; y < d.y; ++y) { - for (int z = 0; z < d.z; ++z) { - for (int x = 0; x < d.x; ++x) { - new_data[y*d.x*d.z + z*d.x + x] = data[z*d.x*d.y + y*d.x + x]; - //new_data[z*d.y*d.x + y*d.x + x] = data[y*d.x*d.z + z*d.x + x]; - } - } - } - - // Transpose x and z axis. - // (x,z,y) -> (z,x,y) - for (int y = 0; y < d.y; ++y) { - for (int x = 0; x < d.x; ++x) { - for (int z = 0; z < d.z; ++z) { - data[y*d.z*d.x + x*d.z + z] = new_data[y*d.x*d.z + x + z*d.x]; - } - } - } - - free(new_data); -} - -int ai2_isFloatWhole(float f) { // TODO unit test - return (ceilf(f) == f) ? 1 : 0; -} - -/* @brief Initialize and create all memory arrays for this layer - * b - batches (number of filter batches) - * c - input channels - * ix - input width - * iy - input height - * wx - weight/filter width - * wy - weight/filter height - * s - stride between sliding windows - * pad - the amount of padding - */ -ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad) { - // http://cs231n.github.io/convolutional-networks/ - // See: spatial arrangement section for determining what the output size will be - float output_size = ((ix - wx + 2 * pad) / s) + 1; - if (ai2_isFloatWhole(output_size) == 0) { - fprintf(stderr, "ERROR! conv layer of (b,c,ix,iy,s,pad) = (%d, %d, %d, %d, %d, %d) will give " - " invalid output dimension: %fx%f\n", b, c, ix, iy, s, pad, output_size, output_size); - exit(1); - } - - // TODO: Support strided output - if (s != 1) { - fprintf(stderr, "ERROR! Only stride values of 1 is supported\n"); - exit(1); - } - - // padded input size - int px = (int) ix + 2*pad; - int py = (int) iy + 2*pad; - - ai2_bin_conv_layer l = {0}; // initialize all to 0 - l.input = (float *) calloc (c * px * py, sizeof(float)); // is padded - l.binary_input = (BINARY_WORD *) calloc (c * px * py / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD)); // is padded - - dim3 dim; - dim.x = px; - dim.y = py; - dim.z = c; - ai2_flt_to_bin(l.binary_input, l.input, dim); - - l.weights = (float *) calloc (b * c * wx * wy, sizeof(float)); - l.binary_weights = (BINARY_WORD *) calloc (b * c * wx * wy / BITS_PER_BINARY_WORD, sizeof(BINARY_WORD)); - - l.output = (float *) calloc (c * px * py, sizeof(float)); // is padded - l.new_beta = (float *) calloc(px * py, sizeof(float)); // is padded - - l.batch = b; - l.c = c; - l.h = iy; - l.w = ix; - l.stride = s; - l.pad = pad; - l.px = px; - l.py = py; - l.wx = wx; - l.wy = wy; - - // The following parameters are uninitialized and should be set elsewhere: - // l.beta - padded - // l.alpha - not padded - - return l; -} - -void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer) { - if (layer->input) free (layer->input); - if (layer->binary_input) free(layer->binary_input); - if (layer->weights) free (layer->weights); - if (layer->binary_weights) free(layer->binary_weights); - if (layer->output) free(layer->output); - if (layer->binary_output) free (layer->binary_output); - if (layer->alpha) free(layer->alpha); - if (layer->beta) free(layer->beta); - if (layer->new_beta) free(layer->new_beta); -} - -void ai2_throw_error(char *str) { - fprintf(stderr, "ERROR: %s\n", str); - exit(1); -} - -void ai2_bin_forward(ai2_bin_conv_layer *l) { - if (l->input == NULL) ai2_throw_error("Input was not allocated and set in this layer"); - if (l->weights == NULL) ai2_throw_error("Weights was not allocated and set in this layer"); - if (l->output == NULL) ai2_throw_error("Output was not allocated and set in this layer"); - if (l->alpha == NULL) ai2_throw_error("Alpha was not allocated and set in this layer"); - if (l->beta == NULL) ai2_throw_error("Beta was not allocated and set in this layer"); - - if (l->c % 32 != 0) ai2_throw_error("Channel is not divisible by 32. Need to implement mask " - "before supporting arbitrary channel size. For now, " - "set the channel size to the nearest multiple of 32 " - "and ignore any ''extra'' channels unused."); - - l->c /= BITS_PER_BINARY_WORD; // For compensating with doing more work per word - - float *output = l->output; - float *alpha = l->alpha; - float *beta = l->beta; - int px = l->px; - int py = l->py; - BINARY_WORD *binary_weights = l->binary_weights; - - for (int z = 0; z < l->batch; ++z) { // for each filter map - BINARY_WORD *binary_input = l->binary_input; - for (int c = 0; c < l->c; ++c) { // for each input channel - ai2_bin_conv2D(output, binary_input, binary_weights, l->w, l->h, l->wx, l->wy, l->pad, l->stride); - binary_input += px*py; // increment with next 2D plane - binary_weights += l->wx*l->wy; // increment with next 2D plane - - ai2_pointwise_mul_mm(output, beta, px*py); - ai2_pointwise_mul_mm_2d(output, alpha, l->w, l->h, l->wx, l->wy, l->pad); - } - } -} - -// Deprecated -//double ai2_bin_conv_benchmark(ConvolutionArgs conv_args) { -// printf("Running Binary Convolution test!\n"); -// -// size_t ix, iy, iz, wx, wy, wz, L, stride; -// ix = conv_args.input.x; -// iy = conv_args.input.y; -// iz = conv_args.input.z; -// wx = conv_args.weights.x; -// wy = conv_args.weights.y; -// wz = conv_args.weights.z; -// L = BITS_PER_BINARY_WORD; -// stride = 1; -// -// printf("Input size (num elements, xyz): %zu %zu %zu\n", ix, iy, iz); -// printf("Weights size (num elements. xyz): %zu %zu %zu\n", wx, wy, wz); -// -// double sz_input_elements = ix * iy * iz; -// double sz_input_bytes = getSizeBytesBinaryArray(conv_args.input); -// double sz_weight_bytes = getSizeBytesBinaryArray(conv_args.weights); -// -// printf("Input Size (MB): %f\n", sz_input_bytes / (1 << 20)); -// printf("Weight Size (MB): %f\n", sz_weight_bytes / (1 << 20)); -// -// BINARY_WORD *binary_input = mallocBinaryVolume(conv_args.input); -// BINARY_WORD *binary_weights = mallocBinaryVolume(conv_args.weights); -// BINARY_WORD *b_input = binary_input; // alias -// BINARY_WORD *b_weight = binary_weights; // alias -// float *output = mallocFloatVolume(conv_args.output); -// float *output_ptr = output; -// float *beta = (float *) malloc(sizeof(float) * ix * iy); // we assume beta is given to us -// float *alpha = (float *) malloc(sizeof(float) * wx * wy); // we assume alpha is given to us -// float *new_output = mallocFloatVolume(conv_args.output); -// //float *new_output_ptr = new_output; -// float *new_beta = (float *) malloc(sizeof(float) * ix * iy); -// //float *new_beta_ptr = new_beta; -// -// // Scale number of computations because we're packing. -// // After this point, you should not have to reason about input dimensions for input and weights. -// iz /= BITS_PER_BINARY_WORD; -// wz /= BITS_PER_BINARY_WORD; -// -// // Calculate time taken by a request -// struct timeval start_time; -// gettimeofday(&start_time, NULL); -// -// // Preprocessing -// int pad = wx/2; -// -// for (int z = 0; z < iz; ++z) { // number of channels -// ai2_bin_conv2D(output_ptr, b_input, b_weight, ix, iy, wx, wy, pad, stride); -// b_input += ix*iy; // increment with next 2D plane -// b_weight += wx*wy; // increment with next 2D plane -// -// ai2_pointwise_mul_mm(output_ptr, beta, ix*iy); -// ai2_pointwise_mul_mm_2d(output_ptr, alpha, ix, iy, wx, wy, pad); -// } -// -// // copy to new array (need to wrap this around); TODO. -// struct timeval end_time; -// gettimeofday(&end_time, NULL); -// -// struct timeval diff_time; -// timersub(&end_time, &start_time, &diff_time); -// double time_conv_s = diff_time.tv_sec + diff_time.tv_usec * 1e-6; -// double time_conv_ms = time_conv_s * 1000.0; -// -// double model_ops = (3*ix*iy*wx*wy*wz/L) + 2*ix*iy + ix*iy*iz; -// double conv_ops_s = 1e-9 * model_ops / time_conv_s; -// double conv_bandwidth_gb_s = 1e-9 * sz_input_bytes / (time_conv_ms / 1000.0); -// double conv_bandwidth_gelement_s = 1e-9 * sz_input_elements / (time_conv_ms / 1000.0); -// -// printf("Execution Time (ms): %f\n", time_conv_ms); -// printf("Binary Convolution OPS/s (GOPS/s): %f\n", conv_ops_s); -// printf("Binary Convolution Bandwidth (GB/s): %f\n", conv_bandwidth_gb_s); -// printf("Binary Convolution Bandwidth (GElements/s): %f\n\n", conv_bandwidth_gelement_s); -// -// free(binary_input); -// free(binary_weights); -// free(output); -// free(beta); -// free(alpha); -// free(new_output); -// free(new_beta); -// -// return time_conv_ms; -//} - -// double ai2_bin_conv_benchmark(ConvolutionArgs conv_args); - -//void benchmark() { -// int ix, iy, iz, wx, wy, wz; -// iz = (1 << 9) * BITS_PER_BINARY_WORD; -// ix = 227; // x == y for square face -// iy = 227; -// wx = 3; // x == y for a square face -// wy = 3; -// wz = iz; -// -// int runs = 1; -// double accum_binary = 0; -// double accum_real = 0; -// ConvolutionArgs conv_args = initArgs(ix, iy, iz, wx, wy, wz); -// for (int i = 0; i < runs; ++i) { -// double t_binary_convolve = ai2_bin_conv_benchmark(conv_args); -// double t_real_convolve = run_convolve2D_real(conv_args); -// printf("t binary = %lf\n", t_binary_convolve); -// printf("t real = %lf\n", t_real_convolve); -// accum_binary += t_binary_convolve; -// accum_real += t_real_convolve; -// } -// -// accum_binary /= runs; -// accum_real /= runs; -// printf("Average convolution pass binary (ms): %lf\n", accum_binary); -// printf("Average convolution pass flt (ms): %lf\n", accum_real); -// printf("Speedup (Binary over Real): %lfx\n", accum_real / accum_binary); -// exit(1); -//} diff --git a/src/binary_convolution.h b/src/binary_convolution.h deleted file mode 100644 index 602677e8..00000000 --- a/src/binary_convolution.h +++ /dev/null @@ -1,218 +0,0 @@ -#ifndef AI2_BINARY_CONVOLUTION_H -#define AI2_BINARY_CONVOLUTION_H - -/** @file binary_convolution.h - * @brief Routines related for approximating convolutions using binary operations - * - * @author Carlo C. del Mundo (carlom) - * @date 05/23/2016 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "common.h" - -typedef struct { - int batch; // number of filter batches - int c; // channels, z - int h; // height, y - int w; // width, x - int stride; - int pad; - - int px; // padded x (use this for striding in padded input and output arrays) - int py; // padded y (use this for striding in padded input and output arrays) - int wx; - int wy; - - float *input; // input values - BINARY_WORD *binary_input; - - float *weights; // weight or filter values - BINARY_WORD *binary_weights; - - float *output; // output values - BINARY_WORD *binary_output; - - float *alpha; // we assume alpha is calculated at the beginning of initialization - float *beta; // we assume beta is given to us - float *new_beta; // we calculate the new beta for the next layer - - struct ai2_bin_conv_layer *next; -} ai2_bin_conv_layer; - -/** @brief Performs a binary convolution using XNOR and POPCOUNT between input and weights - * - * @param output A 2D real-valued plane to store the outputs - * @param input A 2D binary-valued plane that holds the inputs - * @param weights A 2D binary-valued plane that holds the weights - * @param ix the input's x dimension - * @param iy the input's y dimensions - * @param wx the weight's x dimension - * @param wy the weight's y dimension - * @param pad the amount of padding applied to input. (ix+2*pad is the x dimension of the input - * @param stride NOP. TODO: implement stride. the stride between sliding windows - * @return the count of all overlapping set bits between the two volumes. - */ -void ai2_bin_conv2D(float *output, const BINARY_WORD *input, const BINARY_WORD *weights, int ix, int iy, int wx, int wy, int pad, int stride); - -/** @brief Performs a binary dot product (XNOR and POPCOUNT) for two equal sized volumes. - * - * @param a A 3D binary tensor - * @param b A 3D binary tensor - * @param vdim the dimensionality of the data. Note: we pack 32 elements in the Z element. - * @return the count of all overlapping set bits between the two volumes. - */ -int ai2_bin_dp(BINARY_WORD *a, BINARY_WORD *b, dim3 vdim); - -/** @brief Calculates the alpha plane given an alpha volume. - * - * Each point in the yz alpha plane - * is the average sum of the absolute value of all elements in the z-direction. - * - * Pre-conditions: - * alpha_volume is an array of size x*y*z. - * alpha_plane is an array of size x*y. - * alpha_volume (x,y,z) is transposed to (z,x,y). - * - * @param alpha_plane The 2D real-valued output plane - * @param alpha_volume The 3D real-valued output volume - * @param vdim the dimensionality of alpha_volume. - */ -void ai2_calc_alpha(float *alpha_plane, float *alpha_volume, dim3 vdim); - -/** @brief Wrapper function for generating the beta scaling factor */ -void ai2_calc_beta(float *beta_plane, float *beta_volume, dim3 vdim); - -/** @brief Set the bit in a binary word */ -void ai2_bitset(BINARY_WORD *bword, unsigned int position); - -/** @brief Checks that the bit is set in a binary word */ -int ai2_is_set(BINARY_WORD bword, unsigned int position) ; - -/** @brief Converts a 3D float tensor into a 3D binary tensor. - * - * The value of the ith element in the binary tensor is the sign - * of the ith element in the floating tensor. - * - * @param binary_vol the binary tensor - * @param real_vol the real tensor - * @param vdim the size of the 3D tensor - */ -void ai2_flt_to_bin(BINARY_WORD *binary_vol, float *real_vol, dim3 vdim) ; - -/** @brief Converts a 3D binary tensor into a 3D float tensor. - * - * The ith float element will be '1' if the ith binary element is '1'. - * Otherwise, the float element will be '-1'. - * - * @param real_vol the output real tensor - * @param binary_vol the input binary tensor - * @param vdim the dimension of both binary_vol and real_vol - */ -void ai2_bin_to_flt(float *real_vol, BINARY_WORD *binary_vol, dim3 vdim); - -/** @brief Performs a pointwise matrix multication between two 2D tensors - * @param output A 2D real-valued plane to store the outputs - * @param input A 2D binary-valued plane that holds the inputs - * @param N the number of elements between the arrays - */ -void ai2_pointwise_mul_mm(float *output, const float *input, int N); - -/** @brief Performs a tiled pointwise matrix multiplication between two 2D tensors - * - * Pre-conditions: wx < ix, and wy < iy - * - * @param output A 2D real-valued plane of size ix, iy - * @param alpha A 2D binary-valued plane of size wx, wy - * @param ix the output's x dimension - * @param iy the output's y dimensions - * @param wx the alpha's x dimension - * @param wy the alpha's y dimension - * @param pad how many cells are padded, adds 2*pad to the borders of the image - */ -void ai2_pointwise_mul_mm_2d(float *output, const float *alpha, int ix, int iy, int wx, int wy, int pad); - -// -------------------------------------- -// SETTER FUNCTIONS -// -------------------------------------- -/** @brief Safe function to set the float input of a conv_layer - */ -void ai2_setFltInput(ai2_bin_conv_layer *layer, float *new_input); - -/** @brief Safe function to set the binary input of a conv_layer - */ -void ai2_setBinInput(ai2_bin_conv_layer *layer, BINARY_WORD *new_input); - -/** @brief Safe function to set the binary weights of a conv_layer - */ -void ai2_setFltWeights(ai2_bin_conv_layer *layer, float *new_weights); - -/** @brief Safe function to set the binary weights of a conv_layer - */ -void ai2_setBinWeights(ai2_bin_conv_layer *layer, BINARY_WORD *new_weights); - -/** @brief Safe function to set the binary outputs of a conv_layer - */ -void ai2_setFltOutput(ai2_bin_conv_layer *layer, float *new_output); - -/** @brief Safe function to set the binary outputs of a conv_layer - */ -void ai2_setBinOutput(ai2_bin_conv_layer *layer, BINARY_WORD *new_output); - -/** @brief Safe function to set the alpha of a conv_layer - */ -void ai2_setFltAlpha(ai2_bin_conv_layer *layer, float *new_alpha); - -/** @brief Safe function to set the beta of a conv_layer - */ -void ai2_setFltBeta(ai2_bin_conv_layer *layer, float *new_beta); - -/** @brief Safe function to set the new_beta of a conv_layer - */ -void ai2_setFltNewBeta(ai2_bin_conv_layer *layer, float *new_new_beta); - -// -------------------------------------- -// GETTER FUNCTIONS -// -------------------------------------- -/** @brief Safe function to get the float outputs of a conv_layer - */ -float * ai2_getFltOutput(ai2_bin_conv_layer *layer); - -/** @brief 3D tranpose from (x,y,z) to (z,y,x) - * @return a new pointer with the transposed matrix - */ -void ai2_transpose3D(float *data, dim3 d); - -/** @brief Checks if a float is a whole number (e.g., an int) - */ -int ai2_isFloatWhole(float f); - -/* @brief Allocates all memory objects in an ai2_bin_conv_layer - * b - batches (number of filter batches) - * c - input channels - * ix - input width - * iy - input height - * wx - weight/filter width - * wy - weight/filter height - * s - stride between sliding windows - * pad - the amount of padding - */ -ai2_bin_conv_layer ai2_make_bin_conv_layer(int b, int c, int ix, int iy, int wx, int wy, int s, int pad); - -/* @brief Safe deallocation of all memory objects in an ai2_bin_conv_layer - */ -void ai2_free_bin_conv_layer(ai2_bin_conv_layer *layer); - -/* @brief Given real-valued filter data and a conv layer, performs a forward pass - */ -void ai2_bin_forward(ai2_bin_conv_layer *layer); - -#endif diff --git a/src/common.c b/src/common.c deleted file mode 100644 index 9d59ee8a..00000000 --- a/src/common.c +++ /dev/null @@ -1,81 +0,0 @@ -#include "common.h" - -// Returns the time in ms -double getElapsedTime(Timer *timer) { - // Calculate time it took in seconds - double accum_ms = ( timer->requestEnd.tv_sec - timer->requestStart.tv_sec ) - + ( timer->requestEnd.tv_nsec - timer->requestStart.tv_nsec ) - / 1e6; - return accum_ms; -} - -void start_timer(Timer *timer) { - clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestStart)); -} - -void stop_timer(Timer *timer) { - clock_gettime(CLOCK_MONOTONIC_RAW, &(timer->requestEnd)); -} - - -BINARY_WORD * mallocBinaryVolume(dim3 vol) { - return (BINARY_WORD *) malloc (vol.x * vol.y * vol.z / BITS_PER_BINARY_WORD * sizeof(BINARY_WORD)); -} - -float * mallocFloatVolume(dim3 vol) { - return (float *) malloc (vol.x * vol.y * vol.z * sizeof(float)); -} - -// Returns the size (in bytes) of a binary array with dimensions stored in conv_args -double getSizeBytesBinaryArray(dim3 conv_args) { - return conv_args.x * conv_args.y * conv_args.z * sizeof(BINARY_WORD) / (BITS_PER_BINARY_WORD); -} - - -ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz) { - ConvolutionArgs conv_args; - // Input Volume - conv_args.input.x = ix; // x == y for a square face - conv_args.input.y = iy; - conv_args.input.z = iz; - conv_args.weights.x = wx; // x == y for square face - conv_args.weights.y = wy; - conv_args.weights.z = wz; - - // - // Intermediate Volumes - conv_args.alpha_plane.x = conv_args.weights.x; - conv_args.alpha_plane.y = conv_args.weights.y; - conv_args.alpha_plane.z = 1; - - conv_args.beta_plane.x = 1; - conv_args.beta_plane.y = conv_args.input.y; - conv_args.beta_plane.z = conv_args.input.z; - - conv_args.gamma_plane.x = conv_args.input.x * conv_args.weights.x; - conv_args.gamma_plane.y = conv_args.input.y * conv_args.weights.y; - conv_args.gamma_plane.z = 1; - - conv_args.zeta_plane.x = conv_args.gamma_plane.x; - conv_args.zeta_plane.y = conv_args.gamma_plane.y; - conv_args.zeta_plane.z = 1; - - // Output Volume - conv_args.output.x = conv_args.input.x; - conv_args.output.y = conv_args.input.y; - conv_args.output.z = 1; // Output should be a 2D plane - - // Verify dimensions - //assert(conv_args.weights.x % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints -// assert(conv_args.weights.y % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints - assert(conv_args.weights.z % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints - //assert(conv_args.input.x % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints -// assert(conv_args.input.y % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints - assert(conv_args.input.z % 32 == 0); // must be divisble by 32 for efficient alignment to unsigned 32-bit ints - assert(conv_args.weights.x <= conv_args.input.x); - assert(conv_args.weights.y <= conv_args.input.y); - assert(conv_args.weights.z <= conv_args.input.z); - // - - return conv_args; -} diff --git a/src/common.h b/src/common.h deleted file mode 100644 index bad428d1..00000000 --- a/src/common.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef AI2_COMMON_H -#define AI2_COMMON_H - -#include -#include -#include -#include -#include -#include -#include -#include -#include -//#include -#include - -typedef uint32_t BINARY_WORD; -#define BITS_PER_BINARY_WORD (sizeof(BINARY_WORD) * CHAR_BIT) - -typedef struct{ - struct timespec requestStart; - struct timespec requestEnd; -} Timer; - -typedef struct { - size_t x; - size_t y; - size_t z; -} dim3; - -typedef struct { - dim3 weights; - dim3 input; - dim3 output; - dim3 alpha_plane; - dim3 beta_plane; - dim3 gamma_plane; - dim3 zeta_plane; -} ConvolutionArgs; - -// Timer stuff -double getElapsedTime(Timer *timer); // Returns the time in ms -void start_timer(Timer *timer); -void stop_timer(Timer *timer); - -BINARY_WORD * mallocBinaryVolume(dim3 vol); -float * mallocFloatVolume(dim3 vol); -ConvolutionArgs initArgs(size_t ix, size_t iy, size_t iz, size_t wx, size_t wy, size_t wz); -double getSizeBytesBinaryArray(dim3 conv_args); - -#endif