From f13954ac4c8bd01c1663e0709abcf4d3ce412ff7 Mon Sep 17 00:00:00 2001 From: Anup Date: Mon, 12 Dec 2016 13:07:34 +0530 Subject: [PATCH] Compiles for OpenCV 3.1 --- Makefile | 72 +++++++++++++++++++++++++--------- src/activation_kernels.cu | 13 ++++-- src/activations.h | 6 +++ src/avgpool_layer.h | 6 +++ src/avgpool_layer_kernels.cu | 12 ++++-- src/batchnorm_layer.h | 6 +++ src/blas.h | 23 +++++++++++ src/blas_kernels.cu | 70 ++++++++++++++++++--------------- src/col2im.h | 6 +++ src/col2im_kernels.cu | 8 +++- src/convolutional_kernels.cu | 8 +++- src/convolutional_layer.h | 13 ++++++ src/crop_layer.h | 6 +++ src/crop_layer_kernels.cu | 10 ++++- src/cuda.h | 9 +++++ src/data.h | 6 +++ src/deconvolutional_kernels.cu | 18 ++++++--- src/dropout_layer.h | 6 +++ src/dropout_layer_kernels.cu | 8 +++- src/gemm.h | 6 +++ src/im2col.h | 6 +++ src/im2col_kernels.cu | 8 +++- src/maxpool_layer.h | 6 +++ src/maxpool_layer_kernels.cu | 12 ++++-- src/network.h | 26 +++++++++++- src/network_kernels.cu | 8 +++- src/stb_image.h | 4 +- src/stb_image_write.h | 4 +- src/yolo_kernels.cu | 20 ++++++---- 29 files changed, 321 insertions(+), 85 deletions(-) diff --git a/Makefile b/Makefile index 3d3d5e43..5258ecd5 100644 --- a/Makefile +++ b/Makefile @@ -1,28 +1,45 @@ -GPU=0 -CUDNN=0 -OPENCV=0 -DEBUG=0 +# +# Default make builds both original darknet, and its CPP equivalent darknet-cpp +# make darknet - only darknet (original code) +# make darknet-cpp - only the CPP version +# +# CPP version supports OpenCV3. Tested on Ubuntu 16.04 +# +# OPENCV=1 (C++ && CV3, or C && CV2 only - check with pkg-config --modversion opencv) +# When building CV3 and C version, will get errors like +# ./obj/image.o: In function `cvPointFrom32f': +# /usr/local/include/opencv2/core/types_c.h:929: undefined reference to `cvRound' +# +# -ARCH= -gencode arch=compute_20,code=[sm_20,sm_21] \ - -gencode arch=compute_30,code=sm_30 \ - -gencode arch=compute_35,code=sm_35 \ - -gencode arch=compute_50,code=[sm_50,compute_50] \ - -gencode arch=compute_52,code=[sm_52,compute_52] +GPU=1 +CUDNN=1 +OPENCV=1 +DEBUG=1 -# This is what I use, uncomment if you know your arch and want to specify -# ARCH= -gencode arch=compute_52,code=compute_52 +ARCH= --gpu-architecture=compute_52 --gpu-code=compute_52 + +# C Definitions VPATH=./src/ EXEC=darknet OBJDIR=./obj/ - CC=gcc -NVCC=nvcc + +# C++ Definitions +EXEC_CPP=darknet-cpp +OBJDIR_CPP=./obj-cpp/ +CC_CPP=g++ +CFLAGS_CPP=-Wno-write-strings + +NVCC=nvcc -ccbin /usr/bin/g++ OPTS=-Ofast LDFLAGS= -lm -pthread COMMON= CFLAGS=-Wall -Wfatal-errors + + ifeq ($(DEBUG), 1) OPTS=-O0 -g endif @@ -36,6 +53,9 @@ LDFLAGS+= `pkg-config --libs opencv` COMMON+= `pkg-config --cflags opencv` endif +# Place the IPP .a file from OpenCV here for easy linking +LDFLAGS += -L/usr/local/share/OpenCV/3rdparty/lib + ifeq ($(GPU), 1) COMMON+= -DGPU -I/usr/local/cuda/include/ CFLAGS+= -DGPU @@ -57,21 +77,35 @@ endif OBJS = $(addprefix $(OBJDIR), $(OBJ)) DEPS = $(wildcard src/*.h) Makefile -all: obj backup results $(EXEC) +OBJS_CPP = $(addprefix $(OBJDIR_CPP), $(OBJ)) -$(EXEC): $(OBJS) - $(CC) $(COMMON) $(CFLAGS) $^ -o $@ $(LDFLAGS) +all: obj obj-cpp results $(EXEC) $(EXEC_CPP) + +$(EXEC): obj clean $(OBJS) + $(CC) $(COMMON) $(CFLAGS) $(OBJS) -o $@ $(LDFLAGS) $(OBJDIR)%.o: %.c $(DEPS) $(CC) $(COMMON) $(CFLAGS) -c $< -o $@ +$(EXEC_CPP): obj-cpp clean-cpp $(OBJS_CPP) + $(CC_CPP) $(COMMON) $(CFLAGS) $(OBJS_CPP) -o $@ $(LDFLAGS) + +$(OBJDIR_CPP)%.o: %.c $(DEPS_CPP) + $(CC_CPP) $(COMMON) $(CFLAGS_CPP) $(CFLAGS) -c $< -o $@ + + $(OBJDIR)%.o: %.cu $(DEPS) $(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@ +$(OBJDIR_CPP)%.o: %.cu $(DEPS) + $(NVCC) $(ARCH) $(COMMON) --compiler-options "$(CFLAGS)" -c $< -o $@ + + obj: mkdir -p obj -backup: - mkdir -p backup +obj-cpp: + mkdir -p obj-cpp + results: mkdir -p results @@ -79,4 +113,6 @@ results: clean: rm -rf $(OBJS) $(EXEC) +clean-cpp: + rm -rf $(OBJS_CPP) $(EXEC_CPP) diff --git a/src/activation_kernels.cu b/src/activation_kernels.cu index 994e2068..50ddc7d0 100644 --- a/src/activation_kernels.cu +++ b/src/activation_kernels.cu @@ -2,12 +2,17 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "activations.h" #include "cuda.h" } - __device__ float lhtan_activate_kernel(float x) { if(x < 0) return .001*x; @@ -152,13 +157,13 @@ __global__ void gradient_array_kernel(float *x, int n, ACTIVATION a, float *delt if(i < n) delta[i] *= gradient_kernel(x[i], a); } -extern "C" void activate_array_ongpu(float *x, int n, ACTIVATION a) +EXTERNC void activate_array_ongpu(float *x, int n, ACTIVATION a) { activate_array_kernel<<>>(x, n, a); check_error(cudaPeekAtLastError()); } -extern "C" void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta) +EXTERNC void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta) { gradient_array_kernel<<>>(x, n, a, delta); check_error(cudaPeekAtLastError()); diff --git a/src/activations.h b/src/activations.h index 1c36ff52..d6aa25dd 100644 --- a/src/activations.h +++ b/src/activations.h @@ -15,8 +15,14 @@ float gradient(float x, ACTIVATION a); void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta); void activate_array(float *x, const int n, const ACTIVATION a); #ifdef GPU +#ifdef __cplusplus +extern "C" { +#endif void activate_array_ongpu(float *x, int n, ACTIVATION a); void gradient_array_ongpu(float *x, int n, ACTIVATION a, float *delta); +#ifdef __cplusplus +} +#endif #endif static inline float stair_activate(float x) diff --git a/src/avgpool_layer.h b/src/avgpool_layer.h index f8329aea..f476b960 100644 --- a/src/avgpool_layer.h +++ b/src/avgpool_layer.h @@ -15,8 +15,14 @@ void forward_avgpool_layer(const avgpool_layer l, network_state state); void backward_avgpool_layer(const avgpool_layer l, network_state state); #ifdef GPU +#ifdef __cplusplus +extern "C" { +#endif void forward_avgpool_layer_gpu(avgpool_layer l, network_state state); void backward_avgpool_layer_gpu(avgpool_layer l, network_state state); +#ifdef __cplusplus +} +#endif #endif #endif diff --git a/src/avgpool_layer_kernels.cu b/src/avgpool_layer_kernels.cu index b7e2770e..905b6e34 100644 --- a/src/avgpool_layer_kernels.cu +++ b/src/avgpool_layer_kernels.cu @@ -2,7 +2,13 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "avgpool_layer.h" #include "cuda.h" } @@ -43,7 +49,7 @@ __global__ void backward_avgpool_layer_kernel(int n, int w, int h, int c, float } } -extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network_state state) +EXTERNC void forward_avgpool_layer_gpu(avgpool_layer layer, network_state state) { size_t n = layer.c*layer.batch; @@ -51,7 +57,7 @@ extern "C" void forward_avgpool_layer_gpu(avgpool_layer layer, network_state sta check_error(cudaPeekAtLastError()); } -extern "C" void backward_avgpool_layer_gpu(avgpool_layer layer, network_state state) +EXTERNC void backward_avgpool_layer_gpu(avgpool_layer layer, network_state state) { size_t n = layer.c*layer.batch; diff --git a/src/batchnorm_layer.h b/src/batchnorm_layer.h index 99d1d0fe..5d73ffd1 100644 --- a/src/batchnorm_layer.h +++ b/src/batchnorm_layer.h @@ -10,8 +10,14 @@ void forward_batchnorm_layer(layer l, network_state state); void backward_batchnorm_layer(layer l, network_state state); #ifdef GPU +#ifdef __cplusplus +extern "C" { +#endif void forward_batchnorm_layer_gpu(layer l, network_state state); void backward_batchnorm_layer_gpu(layer l, network_state state); +#ifdef __cplusplus +} +#endif void pull_batchnorm_layer(layer l); void push_batchnorm_layer(layer l); #endif diff --git a/src/blas.h b/src/blas.h index 3d6ee7d3..48ce7fb1 100644 --- a/src/blas.h +++ b/src/blas.h @@ -1,5 +1,9 @@ #ifndef BLAS_H #define BLAS_H + +#ifdef __cplusplus +extern "C" { +#endif void flatten(float *x, int size, int layers, int batch, int forward); void pm(int M, int N, float *A); float *random_matrix(int rows, int cols); @@ -36,10 +40,16 @@ void l2_cpu(int n, float *pred, float *truth, float *delta, float *error); void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c); void softmax(float *input, int n, float temp, float *output); +#ifdef __cplusplus +} +#endif #ifdef GPU #include "cuda.h" +#ifdef __cplusplus +extern "C" { +#endif void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY); void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY); void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY); @@ -67,9 +77,17 @@ void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, void scale_bias_gpu(float *output, float *biases, int batch, int n, int size); void backward_scale_gpu(float *x_norm, float *delta, int batch, int n, int size, float *scale_updates); void scale_bias_gpu(float *output, float *biases, int batch, int n, int size); +#ifdef __cplusplus +} +#endif + void add_bias_gpu(float *output, float *biases, int batch, int n, int size); void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size); +#ifdef __cplusplus +extern "C" { +#endif + void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error); void l2_gpu(int n, float *pred, float *truth, float *delta, float *error); void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc); @@ -84,4 +102,9 @@ void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rat void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out); #endif + +#ifdef __cplusplus +} +#endif + #endif diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu index d9401766..b0eb4ea7 100644 --- a/src/blas_kernels.cu +++ b/src/blas_kernels.cu @@ -3,7 +3,13 @@ #include "cublas_v2.h" #include -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "blas.h" #include "cuda.h" #include "utils.h" @@ -149,7 +155,7 @@ __global__ void adam_kernel(int N, float *x, float *m, float *v, float B1, float //if(index == 0) printf("%f %f %f %f\n", m[index], v[index], (rate * sqrt(1.-pow(B2, t)) / (1.-pow(B1, t)) * m[index] / (sqrt(v[index]) + eps))); } -extern "C" void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t) +EXTERNC void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t) { adam_kernel<<>>(n, x, m, v, B1, B2, rate, eps, t); check_error(cudaPeekAtLastError()); @@ -173,7 +179,7 @@ __global__ void normalize_delta_kernel(int N, float *x, float *mean, float *vari delta[index] = delta[index] * 1./(sqrt(variance[f]) + .000001f) + variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + mean_delta[f]/(spatial*batch); } -extern "C" void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta) +EXTERNC void normalize_delta_gpu(float *x, float *mean, float *variance, float *mean_delta, float *variance_delta, int batch, int filters, int spatial, float *delta) { size_t N = batch*filters*spatial; normalize_delta_kernel<<>>(N, x, mean, variance, mean_delta, variance_delta, batch, filters, spatial, delta); @@ -277,19 +283,19 @@ __global__ void mean_delta_kernel(float *delta, float *variance, int batch, int mean_delta[i] *= (-1./sqrt(variance[i] + .000001f)); } -extern "C" void mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) +EXTERNC void mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) { mean_delta_kernel<<>>(delta, variance, batch, filters, spatial, mean_delta); check_error(cudaPeekAtLastError()); } -extern "C" void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) +EXTERNC void fast_mean_delta_gpu(float *delta, float *variance, int batch, int filters, int spatial, float *mean_delta) { fast_mean_delta_kernel<<>>(delta, variance, batch, filters, spatial, mean_delta); check_error(cudaPeekAtLastError()); } -extern "C" void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta) +EXTERNC void fast_variance_delta_gpu(float *x, float *delta, float *mean, float *variance, int batch, int filters, int spatial, float *variance_delta) { fast_variance_delta_kernel<<>>(x, delta, mean, variance, batch, filters, spatial, variance_delta); check_error(cudaPeekAtLastError()); @@ -422,7 +428,7 @@ __global__ void mul_kernel(int N, float *X, int INCX, float *Y, int INCY) } -extern "C" void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial) +EXTERNC void normalize_gpu(float *x, float *mean, float *variance, int batch, int filters, int spatial) { size_t N = batch*filters*spatial; normalize_kernel<<>>(N, x, mean, variance, batch, filters, spatial); @@ -484,60 +490,60 @@ __global__ void fast_variance_kernel(float *x, float *mean, int batch, int filt } } -extern "C" void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean) +EXTERNC void fast_mean_gpu(float *x, int batch, int filters, int spatial, float *mean) { fast_mean_kernel<<>>(x, batch, filters, spatial, mean); check_error(cudaPeekAtLastError()); } -extern "C" void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance) +EXTERNC void fast_variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance) { fast_variance_kernel<<>>(x, mean, batch, filters, spatial, variance); check_error(cudaPeekAtLastError()); } -extern "C" void mean_gpu(float *x, int batch, int filters, int spatial, float *mean) +EXTERNC void mean_gpu(float *x, int batch, int filters, int spatial, float *mean) { mean_kernel<<>>(x, batch, filters, spatial, mean); check_error(cudaPeekAtLastError()); } -extern "C" void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance) +EXTERNC void variance_gpu(float *x, float *mean, int batch, int filters, int spatial, float *variance) { variance_kernel<<>>(x, mean, batch, filters, spatial, variance); check_error(cudaPeekAtLastError()); } -extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) +EXTERNC void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) { axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY); } -extern "C" void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) +EXTERNC void pow_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) { pow_kernel<<>>(N, ALPHA, X, INCX, Y, INCY); check_error(cudaPeekAtLastError()); } -extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) +EXTERNC void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) { axpy_kernel<<>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY); check_error(cudaPeekAtLastError()); } -extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY) +EXTERNC void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY) { copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY); } -extern "C" void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY) +EXTERNC void mul_ongpu(int N, float * X, int INCX, float * Y, int INCY) { mul_kernel<<>>(N, X, INCX, Y, INCY); check_error(cudaPeekAtLastError()); } -extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) +EXTERNC void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) { copy_kernel<<>>(N, X, OFFX, INCX, Y, OFFY, INCY); check_error(cudaPeekAtLastError()); @@ -560,52 +566,52 @@ __global__ void flatten_kernel(int N, float *x, int spatial, int layers, int bat else out[i1] = x[i2]; } -extern "C" void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out) +EXTERNC void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out) { int size = spatial*batch*layers; flatten_kernel<<>>(size, x, spatial, layers, batch, forward, out); check_error(cudaPeekAtLastError()); } -extern "C" void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out) +EXTERNC void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out) { int size = w*h*c*batch; reorg_kernel<<>>(size, x, w, h, c, batch, stride, forward, out); check_error(cudaPeekAtLastError()); } -extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask) +EXTERNC void mask_ongpu(int N, float * X, float mask_num, float * mask) { mask_kernel<<>>(N, X, mask_num, mask); check_error(cudaPeekAtLastError()); } -extern "C" void const_ongpu(int N, float ALPHA, float * X, int INCX) +EXTERNC void const_ongpu(int N, float ALPHA, float * X, int INCX) { const_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } -extern "C" void constrain_ongpu(int N, float ALPHA, float * X, int INCX) +EXTERNC void constrain_ongpu(int N, float ALPHA, float * X, int INCX) { constrain_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } -extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX) +EXTERNC void scal_ongpu(int N, float ALPHA, float * X, int INCX) { scal_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } -extern "C" void supp_ongpu(int N, float ALPHA, float * X, int INCX) +EXTERNC void supp_ongpu(int N, float ALPHA, float * X, int INCX) { supp_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); } -extern "C" void fill_ongpu(int N, float ALPHA, float * X, int INCX) +EXTERNC void fill_ongpu(int N, float ALPHA, float * X, int INCX) { fill_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); @@ -628,7 +634,7 @@ __global__ void shortcut_kernel(int size, int minw, int minh, int minc, int stri out[out_index] += add[add_index]; } -extern "C" void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out) +EXTERNC void shortcut_gpu(int batch, int w1, int h1, int c1, float *add, int w2, int h2, int c2, float *out) { int minw = (w1 < w2) ? w1 : w2; int minh = (h1 < h2) ? h1 : h2; @@ -663,7 +669,7 @@ __global__ void smooth_l1_kernel(int n, float *pred, float *truth, float *delta, } } -extern "C" void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error) +EXTERNC void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error) { smooth_l1_kernel<<>>(n, pred, truth, delta, error); check_error(cudaPeekAtLastError()); @@ -679,7 +685,7 @@ __global__ void l2_kernel(int n, float *pred, float *truth, float *delta, float } } -extern "C" void l2_gpu(int n, float *pred, float *truth, float *delta, float *error) +EXTERNC void l2_gpu(int n, float *pred, float *truth, float *delta, float *error) { l2_kernel<<>>(n, pred, truth, delta, error); check_error(cudaPeekAtLastError()); @@ -695,7 +701,7 @@ __global__ void weighted_sum_kernel(int n, float *a, float *b, float *s, float * } } -extern "C" void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c) +EXTERNC void weighted_sum_gpu(float *a, float *b, float *s, int num, float *c) { weighted_sum_kernel<<>>(num, a, b, s, c); check_error(cudaPeekAtLastError()); @@ -711,7 +717,7 @@ __global__ void weighted_delta_kernel(int n, float *a, float *b, float *s, float } } -extern "C" void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc) +EXTERNC void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc) { weighted_delta_kernel<<>>(num, a, b, s, da, db, ds, dc); check_error(cudaPeekAtLastError()); @@ -725,7 +731,7 @@ __global__ void mult_add_into_kernel(int n, float *a, float *b, float *c) } } -extern "C" void mult_add_into_gpu(int num, float *a, float *b, float *c) +EXTERNC void mult_add_into_gpu(int num, float *a, float *b, float *c) { mult_add_into_kernel<<>>(num, a, b, c); check_error(cudaPeekAtLastError()); @@ -758,7 +764,7 @@ __global__ void softmax_kernel(int n, int offset, int batch, float *input, float softmax_device(n, input + b*offset, temp, output + b*offset); } -extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output) +EXTERNC void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output) { int inputs = n; int batch = groups; diff --git a/src/col2im.h b/src/col2im.h index 02374972..790c0adc 100644 --- a/src/col2im.h +++ b/src/col2im.h @@ -6,8 +6,14 @@ void col2im_cpu(float* data_col, int ksize, int stride, int pad, float* data_im); #ifdef GPU +#ifdef __cplusplus +extern "C" { +#endif void col2im_ongpu(float *data_col, int channels, int height, int width, int ksize, int stride, int pad, float *data_im); +#ifdef __cplusplus +} +#endif #endif #endif diff --git a/src/col2im_kernels.cu b/src/col2im_kernels.cu index aed2df9b..26608b42 100644 --- a/src/col2im_kernels.cu +++ b/src/col2im_kernels.cu @@ -2,7 +2,13 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "col2im.h" #include "cuda.h" } diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index ae9df8f0..e7a2c409 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -2,7 +2,13 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "convolutional_layer.h" #include "batchnorm_layer.h" #include "gemm.h" diff --git a/src/convolutional_layer.h b/src/convolutional_layer.h index 970aa101..4d5eacad 100644 --- a/src/convolutional_layer.h +++ b/src/convolutional_layer.h @@ -10,6 +10,9 @@ typedef layer convolutional_layer; #ifdef GPU +#ifdef __cplusplus +extern "C" { +#endif void forward_convolutional_layer_gpu(convolutional_layer layer, network_state state); void backward_convolutional_layer_gpu(convolutional_layer layer, network_state state); void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay); @@ -17,6 +20,10 @@ void update_convolutional_layer_gpu(convolutional_layer layer, int batch, float void push_convolutional_layer(convolutional_layer layer); void pull_convolutional_layer(convolutional_layer layer); +#ifdef __cplusplus +} +#endif + void add_bias_gpu(float *output, float *biases, int batch, int n, int size); void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size); #ifdef CUDNN @@ -31,7 +38,13 @@ void forward_convolutional_layer(const convolutional_layer layer, network_state void update_convolutional_layer(convolutional_layer layer, int batch, float learning_rate, float momentum, float decay); image *visualize_convolutional_layer(convolutional_layer layer, char *window, image *prev_weights); void binarize_weights(float *weights, int n, int size, float *binary); +#ifdef __cplusplus +extern "C" { +#endif void swap_binary(convolutional_layer *l); +#ifdef __cplusplus +} +#endif void binarize_weights2(float *weights, int n, int size, char *binary, float *scales); void backward_convolutional_layer(convolutional_layer layer, network_state state); diff --git a/src/crop_layer.h b/src/crop_layer.h index 3aa2d3dd..b76961c7 100644 --- a/src/crop_layer.h +++ b/src/crop_layer.h @@ -13,7 +13,13 @@ void forward_crop_layer(const crop_layer l, network_state state); void resize_crop_layer(layer *l, int w, int h); #ifdef GPU +#ifdef __cplusplus +extern "C" { +#endif void forward_crop_layer_gpu(crop_layer l, network_state state); +#ifdef __cplusplus +} +#endif #endif #endif diff --git a/src/crop_layer_kernels.cu b/src/crop_layer_kernels.cu index 8a086305..dbfec5f0 100644 --- a/src/crop_layer_kernels.cu +++ b/src/crop_layer_kernels.cu @@ -2,7 +2,13 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "crop_layer.h" #include "utils.h" #include "cuda.h" @@ -180,7 +186,7 @@ __global__ void forward_crop_layer_kernel(float *input, float *rand, int size, i output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k); } -extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state) +EXTERNC void forward_crop_layer_gpu(crop_layer layer, network_state state) { cuda_random(layer.rand_gpu, layer.batch*8); diff --git a/src/cuda.h b/src/cuda.h index 29b1eefc..c533bdc2 100644 --- a/src/cuda.h +++ b/src/cuda.h @@ -15,7 +15,12 @@ extern int gpu_index; #include "cudnn.h" #endif +#ifdef __cplusplus +extern "C" { +#endif + void check_error(cudaError_t status); +void check_cublas_error(cublasStatus_t status); cublasHandle_t blas_handle(); float *cuda_make_array(float *x, size_t n); int *cuda_make_int_array(size_t n); @@ -31,5 +36,9 @@ dim3 cuda_gridsize(size_t n); cudnnHandle_t cudnn_handle(); #endif +#ifdef __cplusplus +} +#endif + #endif #endif diff --git a/src/data.h b/src/data.h index 3f6ef610..2ce16ffa 100644 --- a/src/data.h +++ b/src/data.h @@ -94,7 +94,13 @@ data load_data_writing(char **paths, int n, int m, int w, int h, int out_w, int list *get_paths(char *filename); char **get_labels(char *filename); void get_random_batch(data d, int n, float *X, float *y); +#ifdef __cplusplus +extern "C" { +#endif data get_data_part(data d, int part, int total); +#ifdef __cplusplus +} +#endif data get_random_data(data d, int num); void get_next_batch(data d, int n, int offset, float *X, float *y); data load_categorical_data_csv(char *filename, int target, int k); diff --git a/src/deconvolutional_kernels.cu b/src/deconvolutional_kernels.cu index d6259fb3..6ae19d1f 100644 --- a/src/deconvolutional_kernels.cu +++ b/src/deconvolutional_kernels.cu @@ -2,7 +2,13 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "convolutional_layer.h" #include "deconvolutional_layer.h" #include "gemm.h" @@ -13,7 +19,7 @@ extern "C" { #include "cuda.h" } -extern "C" void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state) +EXTERNC void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state) { int i; int out_h = deconvolutional_out_height(layer); @@ -39,7 +45,7 @@ extern "C" void forward_deconvolutional_layer_gpu(deconvolutional_layer layer, n activate_array(layer.output_gpu, layer.batch*layer.n*size, layer.activation); } -extern "C" void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state) +EXTERNC void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, network_state state) { float alpha = 1./layer.batch; int out_h = deconvolutional_out_height(layer); @@ -79,7 +85,7 @@ extern "C" void backward_deconvolutional_layer_gpu(deconvolutional_layer layer, } } -extern "C" void pull_deconvolutional_layer(deconvolutional_layer layer) +EXTERNC void pull_deconvolutional_layer(deconvolutional_layer layer) { cuda_pull_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size); cuda_pull_array(layer.biases_gpu, layer.biases, layer.n); @@ -87,7 +93,7 @@ extern "C" void pull_deconvolutional_layer(deconvolutional_layer layer) cuda_pull_array(layer.bias_updates_gpu, layer.bias_updates, layer.n); } -extern "C" void push_deconvolutional_layer(deconvolutional_layer layer) +EXTERNC void push_deconvolutional_layer(deconvolutional_layer layer) { cuda_push_array(layer.weights_gpu, layer.weights, layer.c*layer.n*layer.size*layer.size); cuda_push_array(layer.biases_gpu, layer.biases, layer.n); @@ -95,7 +101,7 @@ extern "C" void push_deconvolutional_layer(deconvolutional_layer layer) cuda_push_array(layer.bias_updates_gpu, layer.bias_updates, layer.n); } -extern "C" void update_deconvolutional_layer_gpu(deconvolutional_layer layer, float learning_rate, float momentum, float decay) +EXTERNC void update_deconvolutional_layer_gpu(deconvolutional_layer layer, float learning_rate, float momentum, float decay) { int size = layer.size*layer.size*layer.c*layer.n; diff --git a/src/dropout_layer.h b/src/dropout_layer.h index 691cfc5b..035e64b8 100644 --- a/src/dropout_layer.h +++ b/src/dropout_layer.h @@ -13,8 +13,14 @@ void backward_dropout_layer(dropout_layer l, network_state state); void resize_dropout_layer(dropout_layer *l, int inputs); #ifdef GPU +#ifdef __cplusplus +extern "C" { +#endif void forward_dropout_layer_gpu(dropout_layer l, network_state state); void backward_dropout_layer_gpu(dropout_layer l, network_state state); +#ifdef __cplusplus +} +#endif #endif #endif diff --git a/src/dropout_layer_kernels.cu b/src/dropout_layer_kernels.cu index 7e51bd55..0040e55c 100644 --- a/src/dropout_layer_kernels.cu +++ b/src/dropout_layer_kernels.cu @@ -2,7 +2,13 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "dropout_layer.h" #include "cuda.h" #include "utils.h" diff --git a/src/gemm.h b/src/gemm.h index f0231bfa..d1153731 100644 --- a/src/gemm.h +++ b/src/gemm.h @@ -19,6 +19,9 @@ void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, float *C, int ldc); #ifdef GPU +#ifndef __cplusplus +extern "C" { +#endif void gemm_ongpu(int TA, int TB, int M, int N, int K, float ALPHA, float *A_gpu, int lda, float *B_gpu, int ldb, @@ -30,5 +33,8 @@ void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA, float *B, int ldb, float BETA, float *C, int ldc); +#ifndef __cplusplus +} +#endif #endif #endif diff --git a/src/im2col.h b/src/im2col.h index f0ddeeeb..bfc8fa3d 100644 --- a/src/im2col.h +++ b/src/im2col.h @@ -7,9 +7,15 @@ void im2col_cpu(float* data_im, #ifdef GPU +#ifdef __cplusplus +extern "C" { +#endif void im2col_ongpu(float *im, int channels, int height, int width, int ksize, int stride, int pad,float *data_col); +#ifdef __cplusplus +} +#endif #endif #endif diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu index d42d600b..8e1e0b50 100644 --- a/src/im2col_kernels.cu +++ b/src/im2col_kernels.cu @@ -2,7 +2,13 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "im2col.h" #include "cuda.h" } diff --git a/src/maxpool_layer.h b/src/maxpool_layer.h index ce56dd88..990bb800 100644 --- a/src/maxpool_layer.h +++ b/src/maxpool_layer.h @@ -15,8 +15,14 @@ void forward_maxpool_layer(const maxpool_layer l, network_state state); void backward_maxpool_layer(const maxpool_layer l, network_state state); #ifdef GPU +#ifdef __cplusplus +extern "C" { +#endif void forward_maxpool_layer_gpu(maxpool_layer l, network_state state); void backward_maxpool_layer_gpu(maxpool_layer l, network_state state); +#ifdef __cplusplus +} +#endif #endif #endif diff --git a/src/maxpool_layer_kernels.cu b/src/maxpool_layer_kernels.cu index 6381cc1e..9f47224c 100644 --- a/src/maxpool_layer_kernels.cu +++ b/src/maxpool_layer_kernels.cu @@ -2,7 +2,13 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "maxpool_layer.h" #include "cuda.h" } @@ -84,7 +90,7 @@ __global__ void backward_maxpool_layer_kernel(int n, int in_h, int in_w, int in_ prev_delta[index] += d; } -extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state) +EXTERNC void forward_maxpool_layer_gpu(maxpool_layer layer, network_state state) { int h = layer.out_h; int w = layer.out_w; @@ -96,7 +102,7 @@ extern "C" void forward_maxpool_layer_gpu(maxpool_layer layer, network_state sta check_error(cudaPeekAtLastError()); } -extern "C" void backward_maxpool_layer_gpu(maxpool_layer layer, network_state state) +EXTERNC void backward_maxpool_layer_gpu(maxpool_layer layer, network_state state) { size_t n = layer.h*layer.w*layer.c*layer.batch; diff --git a/src/network.h b/src/network.h index e48cbc28..f63b9eae 100644 --- a/src/network.h +++ b/src/network.h @@ -72,6 +72,9 @@ typedef struct network_state { } network_state; #ifdef GPU +#ifdef __cplusplus +extern "C" { +#endif float train_networks(network *nets, int n, data d, int interval); void sync_nets(network *nets, int n, int interval); float train_network_datum_gpu(network net, float *x, float *y); @@ -82,23 +85,38 @@ float *get_network_output_gpu(network net); void forward_network_gpu(network net, network_state state); void backward_network_gpu(network net, network_state state); void update_network_gpu(network net); +#ifdef __cplusplus +} +#endif #endif +#ifdef __cplusplus +extern "C" { +#endif float get_current_rate(network net); int get_current_batch(network net); void free_network(network net); void compare_networks(network n1, network n2, data d); char *get_layer_string(LAYER_TYPE a); +#ifdef __cplusplus +} +#endif network make_network(int n); void forward_network(network net, network_state state); void backward_network(network net, network_state state); void update_network(network net); +#ifdef __cplusplus +extern "C" { +#endif float train_network(network net, data d); float train_network_batch(network net, data d, int n); float train_network_sgd(network net, data d, int n); float train_network_datum(network net, float *x, float *y); +#ifdef __cplusplus +} +#endif matrix network_predict_data(network net, data test); float *network_predict(network net, float *input); @@ -111,7 +129,6 @@ float *get_network_output_layer(network net, int i); float *get_network_delta_layer(network net, int i); float *get_network_delta(network net); int get_network_output_size_layer(network net, int i); -int get_network_output_size(network net); image get_network_image(network net); image get_network_image_layer(network net, int i); int get_predicted_class_network(network net); @@ -119,8 +136,15 @@ void print_network(network net); void visualize_network(network net); int resize_network(network *net, int w, int h); void set_batch_network(network *net, int b); +#ifdef __cplusplus +extern "C" { +#endif +int get_network_output_size(network net); int get_network_input_size(network net); float get_network_cost(network net); +#ifdef __cplusplus +} +#endif int get_network_nuisance(network net); int get_network_background(network net); diff --git a/src/network_kernels.cu b/src/network_kernels.cu index 313cd6d1..e216d868 100644 --- a/src/network_kernels.cu +++ b/src/network_kernels.cu @@ -2,7 +2,13 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include #include #include diff --git a/src/stb_image.h b/src/stb_image.h index d0fa9c21..a1628471 100644 --- a/src/stb_image.h +++ b/src/stb_image.h @@ -402,7 +402,7 @@ enum typedef unsigned char stbi_uc; -#ifdef __cplusplus +#ifndef __cplusplus extern "C" { #endif @@ -508,7 +508,7 @@ STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); -#ifdef __cplusplus +#ifndef __cplusplus } #endif diff --git a/src/stb_image_write.h b/src/stb_image_write.h index f5250b31..44d91a04 100644 --- a/src/stb_image_write.h +++ b/src/stb_image_write.h @@ -80,7 +80,7 @@ CREDITS: #ifndef INCLUDE_STB_IMAGE_WRITE_H #define INCLUDE_STB_IMAGE_WRITE_H -#ifdef __cplusplus +#ifndef __cplusplus extern "C" { #endif @@ -89,7 +89,7 @@ extern int stbi_write_bmp(char const *filename, int w, int h, int comp, const vo extern int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data); extern int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data); -#ifdef __cplusplus +#ifndef __cplusplus } #endif diff --git a/src/yolo_kernels.cu b/src/yolo_kernels.cu index 09f3961f..614e45b5 100644 --- a/src/yolo_kernels.cu +++ b/src/yolo_kernels.cu @@ -2,7 +2,13 @@ #include "curand.h" #include "cublas_v2.h" -extern "C" { +#ifdef __cplusplus +#define EXTERNC extern "C" +#else +#define EXTERNC +#endif + +EXTERNC { #include "network.h" #include "detection_layer.h" #include "cost_layer.h" @@ -16,11 +22,11 @@ extern "C" { #ifdef OPENCV #include "opencv2/highgui/highgui.hpp" #include "opencv2/imgproc/imgproc.hpp" -extern "C" image ipl_to_image(IplImage* src); -extern "C" void convert_yolo_detections(float *predictions, int classes, int num, int square, int side, int w, int h, float thresh, float **probs, box *boxes, int only_objectness); +EXTERNC image ipl_to_image(IplImage* src); +EXTERNC void convert_yolo_detections(float *predictions, int classes, int num, int square, int side, int w, int h, float thresh, float **probs, box *boxes, int only_objectness); -extern "C" char *voc_names[]; -extern "C" image voc_labels[]; +EXTERNC char *voc_names[]; +EXTERNC image voc_labels[]; static float **probs; static box *boxes; @@ -63,7 +69,7 @@ void *detect_in_thread(void *ptr) return 0; } -extern "C" void demo_yolo(char *cfgfile, char *weightfile, float thresh, int cam_index) +EXTERNC void demo_yolo(char *cfgfile, char *weightfile, float thresh, int cam_index) { demo_thresh = thresh; printf("YOLO demo\n"); @@ -125,7 +131,7 @@ extern "C" void demo_yolo(char *cfgfile, char *weightfile, float thresh, int cam } } #else -extern "C" void demo_yolo(char *cfgfile, char *weightfile, float thresh, int cam_index){ +EXTERNC void demo_yolo(char *cfgfile, char *weightfile, float thresh, int cam_index){ fprintf(stderr, "YOLO demo needs OpenCV for webcam images.\n"); } #endif