From 00de023601aa25713074b41801ffbbd7a7b1dfe7 Mon Sep 17 00:00:00 2001 From: Stefano Sinigardi Date: Tue, 19 Feb 2019 15:57:18 +0100 Subject: [PATCH] fully separate C-API from CPP-API --- include/darknet.h | 2 ++ include/yolo_v2_class.hpp | 28 +++++++++++++++++++++------- src/convolutional_layer.c | 8 +++++++- src/cpu_gemm.c | 11 ++++++++--- src/gemm.c | 37 +++++++++++++++++++++---------------- src/image.c | 3 +++ src/utils.c | 3 +++ src/yolo_v2_class.cpp | 1 + 8 files changed, 66 insertions(+), 27 deletions(-) diff --git a/include/darknet.h b/include/darknet.h index 40683712..23be5b98 100644 --- a/include/darknet.h +++ b/include/darknet.h @@ -12,6 +12,7 @@ #include #include +#ifndef LIB_API #ifdef LIB_EXPORTS #if defined(_MSC_VER) #define LIB_API __declspec(dllexport) @@ -25,6 +26,7 @@ #define LIB_API #endif #endif +#endif #define NFRAMES 3 #define SECRET_NUM -1234 diff --git a/include/yolo_v2_class.hpp b/include/yolo_v2_class.hpp index e687b1e8..52206251 100644 --- a/include/yolo_v2_class.hpp +++ b/include/yolo_v2_class.hpp @@ -1,23 +1,37 @@ #ifndef YOLO_V2_CLASS_HPP #define YOLO_V2_CLASS_HPP -#include "darknet.h" +#ifndef LIB_API +#ifdef LIB_EXPORTS +#if defined(_MSC_VER) +#define LIB_API __declspec(dllexport) +#else +#define LIB_API __attribute__((visibility("default"))) +#endif +#else +#if defined(_MSC_VER) +#define LIB_API +#else +#define LIB_API +#endif +#endif +#endif #define C_SHARP_MAX_OBJECTS 1000 struct bbox_t { - unsigned int x, y, w, h; // (x,y) - top-left corner, (w, h) - width & height of bounded box + unsigned int x, y, w, h; // (x,y) - top-left corner, (w, h) - width & height of bounded box float prob; // confidence - probability that the object was found correctly - unsigned int obj_id; // class of object - from range [0, classes-1] - unsigned int track_id; // tracking id for video (0 - untracked, 1 - inf - tracked object) - unsigned int frames_counter;// counter of frames on which the object was detected + unsigned int obj_id; // class of object - from range [0, classes-1] + unsigned int track_id; // tracking id for video (0 - untracked, 1 - inf - tracked object) + unsigned int frames_counter; // counter of frames on which the object was detected }; struct image_t { int h; // height int w; // width int c; // number of chanels (3 - for RGB) - float *data; // pointer to the image data + float *data; // pointer to the image data }; struct bbox_t_container { @@ -34,7 +48,7 @@ struct bbox_t_container { #include // C++ #include // C #include // C -#endif // OPENCV +#endif extern "C" LIB_API int init(const char *configurationFilename, const char *weightsFilename, int gpu); extern "C" LIB_API int detect_image(const char *filename, bbox_t_container &container); diff --git a/src/convolutional_layer.c b/src/convolutional_layer.c index 9e2d106e..07f69159 100644 --- a/src/convolutional_layer.c +++ b/src/convolutional_layer.c @@ -18,6 +18,12 @@ #include "xnor_layer.h" #endif +#ifdef __cplusplus +#define PUT_IN_REGISTER +#else +#define PUT_IN_REGISTER register +#endif + #ifndef AI2 #define AI2 0 void forward_xnor_layer(layer l, network_state state); @@ -644,7 +650,7 @@ void gemm_nn_custom(int M, int N, int K, float ALPHA, int i, j, k; for (i = 0; i < M; ++i) { for (k = 0; k < K; ++k) { - float A_PART = ALPHA * A[i * lda + k]; + PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k]; //printf("\n weight = %f \n", A_PART); for (j = 0; j < N; ++j) { C[i*ldc + j] += A_PART*B[k*ldb + j]; diff --git a/src/cpu_gemm.c b/src/cpu_gemm.c index 8305bb53..ca1a8e42 100644 --- a/src/cpu_gemm.c +++ b/src/cpu_gemm.c @@ -1,4 +1,9 @@ //#include "mini_blas.h" +#ifdef __cplusplus +#define PUT_IN_REGISTER +#else +#define PUT_IN_REGISTER register +#endif void cpu_gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, @@ -9,7 +14,7 @@ void cpu_gemm_nn(int TA, int TB, int M, int N, int K, float ALPHA, int i,j,k; for(i = 0; i < M; ++i){ for(k = 0; k < K; ++k){ - float A_PART = ALPHA * A[i * lda + k]; + PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k]; for(j = 0; j < N; ++j){ C[i*ldc+j] += A_PART*B[k*ldb+j]; } @@ -26,7 +31,7 @@ void cpu_gemm_nt(int TA, int TB, int M, int N, int K, float ALPHA, int i,j,k; for(i = 0; i < M; ++i){ for(j = 0; j < N; ++j){ - float sum = 0; + PUT_IN_REGISTER float sum = 0; for(k = 0; k < K; ++k){ sum += ALPHA*A[i*lda+k]*B[k+j*ldb]; } @@ -44,7 +49,7 @@ void cpu_gemm_tn(int TA, int TB, int M, int N, int K, float ALPHA, int i,j,k; for(i = 0; i < M; ++i){ for(k = 0; k < K; ++k){ - float A_PART = ALPHA * A[k * lda + i]; + PUT_IN_REGISTER float A_PART = ALPHA * A[k * lda + i]; for(j = 0; j < N; ++j){ C[i*ldc+j] += A_PART*B[k*ldb+j]; } diff --git a/src/gemm.c b/src/gemm.c index 2fc9af02..112bed94 100644 --- a/src/gemm.c +++ b/src/gemm.c @@ -18,6 +18,11 @@ #define TILE_M 4 // 4 ops #define TILE_N 16 // AVX2 = 2 ops * 8 floats #define TILE_K 16 // loop +#ifdef __cplusplus +#define PUT_IN_REGISTER +#else +#define PUT_IN_REGISTER register +#endif void gemm_bin(int M, int N, int K, float ALPHA, char *A, int lda, @@ -713,7 +718,7 @@ void gemm_nn(int M, int N, int K, float ALPHA, else { for (i = 0; i < M; ++i) { for (k = 0; k < K; ++k) { - float A_PART = ALPHA * A[i * lda + k]; + PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k]; for (j = 0; j < N; ++j) { C[i*ldc + j] += A_PART*B[k*ldb + j]; } @@ -845,7 +850,7 @@ void gemm_nn_fast(int M, int N, int K, float ALPHA, { for (k_d = k; k_d < (k + TILE_K); ++k_d) { - register float A_PART = ALPHA*A[i_d*lda + k_d]; + PUT_IN_REGISTER float A_PART = ALPHA*A[i_d*lda + k_d]; C[i_d*ldc + j] += A_PART*B[k_d*ldb + j]; } } @@ -856,7 +861,7 @@ void gemm_nn_fast(int M, int N, int K, float ALPHA, { for (i_d = i; i_d < (i + TILE_M); ++i_d) { - register float A_PART = ALPHA*A[i_d*lda + k]; + PUT_IN_REGISTER float A_PART = ALPHA*A[i_d*lda + k]; for (j = 0; j < N; ++j) { C[i_d*ldc + j] += A_PART*B[k*ldb + j]; } @@ -867,7 +872,7 @@ void gemm_nn_fast(int M, int N, int K, float ALPHA, for (i = (M / TILE_M)*TILE_M; i < M; ++i) { int j, k; for (k = 0; k < K; ++k) { - register float A_PART = ALPHA*A[i*lda + k]; + PUT_IN_REGISTER float A_PART = ALPHA*A[i*lda + k]; for (j = 0; j < N; ++j) { C[i*ldc + j] += A_PART*B[k*ldb + j]; } @@ -890,7 +895,7 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA, //printf(" l.mean_arr[i] = %d \n ", l.mean_arr[i]); for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c) { - register uint32_t A_PART = A[i*lda + s]; + PUT_IN_REGISTER uint32_t A_PART = A[i*lda + s]; __m256i a256 = _mm256_set1_epi32(A_PART); for (j = 0; j < N - 8; j += 8) @@ -927,7 +932,7 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA, for (; j < N; ++j) // out_h*out_w; { - register uint32_t B_PART = B[s*ldb + j]; + PUT_IN_REGISTER uint32_t B_PART = B[s*ldb + j]; uint32_t xnor_result = ~(A_PART ^ B_PART); int32_t count = popcnt_32(xnor_result); // must be Signed int @@ -1950,7 +1955,7 @@ void gemm_nn(int M, int N, int K, float ALPHA, int i, j, k; for (i = 0; i < M; ++i) { for (k = 0; k < K; ++k) { - float A_PART = ALPHA * A[i * lda + k]; + PUT_IN_REGISTER float A_PART = ALPHA * A[i * lda + k]; for (j = 0; j < N; ++j) { C[i*ldc + j] += A_PART*B[k*ldb + j]; } @@ -1967,7 +1972,7 @@ void gemm_nn_fast(int M, int N, int K, float ALPHA, #pragma omp parallel for for (i = 0; i < M; ++i) { for (k = 0; k < K; ++k) { - register float A_PART = ALPHA*A[i*lda + k]; + PUT_IN_REGISTER float A_PART = ALPHA*A[i*lda + k]; for (j = 0; j < N; ++j) { C[i*ldc + j] += A_PART*B[k*ldb + j]; } @@ -1988,12 +1993,12 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA, //printf(" l.mean_arr[i] = %d \n ", l.mean_arr[i]); for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c) { - //register float A_PART = 1*a[i*k + s]; - register uint32_t A_PART = A[i*lda + s]; + //PUT_IN_REGISTER float A_PART = 1*a[i*k + s]; + PUT_IN_REGISTER uint32_t A_PART = A[i * lda + s]; for (j = 0; j < N; ++j) // out_h*out_w; { //c[i*n + j] += A_PART*b[s*n + j]; - register uint32_t B_PART = B[s*ldb + j]; + PUT_IN_REGISTER uint32_t B_PART = B[s * ldb + j]; uint32_t xnor_result = ~(A_PART ^ B_PART); //printf(" xnor_result = %d, ", xnor_result); int32_t count = popcnt_32(xnor_result); // must be Signed int @@ -2490,8 +2495,8 @@ void gemm_nn_bin_transposed_32bit_packed(int M, int N, int K, float ALPHA, float val = 0; for (s = 0; s < K; ++s) // l.size*l.size*l.c/32 or (l.size*l.size*l.c) { - register uint32_t A_PART = ((uint32_t*)A)[i*lda + s]; - register uint32_t B_PART = ((uint32_t*)B)[j*ldb + s]; + PUT_IN_REGISTER uint32_t A_PART = ((uint32_t*)A)[i*lda + s]; + PUT_IN_REGISTER uint32_t B_PART = ((uint32_t*)B)[j * ldb + s]; uint32_t xnor_result = ~(A_PART ^ B_PART); int32_t count = popcnt_32(xnor_result); // must be Signed int @@ -2576,7 +2581,7 @@ void gemm_nt(int M, int N, int K, float ALPHA, int i,j,k; for(i = 0; i < M; ++i){ for(j = 0; j < N; ++j){ - float sum = 0; + PUT_IN_REGISTER float sum = 0; for(k = 0; k < K; ++k){ sum += ALPHA*A[i*lda+k]*B[j*ldb + k]; } @@ -2593,7 +2598,7 @@ void gemm_tn(int M, int N, int K, float ALPHA, int i,j,k; for(i = 0; i < M; ++i){ for(k = 0; k < K; ++k){ - float A_PART = ALPHA * A[k * lda + i]; + PUT_IN_REGISTER float A_PART = ALPHA * A[k * lda + i]; for(j = 0; j < N; ++j){ C[i*ldc+j] += A_PART*B[k*ldb+j]; } @@ -2609,7 +2614,7 @@ void gemm_tt(int M, int N, int K, float ALPHA, int i,j,k; for(i = 0; i < M; ++i){ for(j = 0; j < N; ++j){ - float sum = 0; + PUT_IN_REGISTER float sum = 0; for(k = 0; k < K; ++k){ sum += ALPHA*A[i+k*lda]*B[k+j*ldb]; } diff --git a/src/image.c b/src/image.c index c3a24d37..76deefe5 100644 --- a/src/image.c +++ b/src/image.c @@ -3,6 +3,9 @@ #include "blas.h" #include "cuda.h" #include +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif #include #ifndef STB_IMAGE_IMPLEMENTATION diff --git a/src/utils.c b/src/utils.c index 149a07ff..22633125 100644 --- a/src/utils.c +++ b/src/utils.c @@ -2,6 +2,9 @@ #include #include #include +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif #include #include #include diff --git a/src/yolo_v2_class.cpp b/src/yolo_v2_class.cpp index 668ef5cd..1b07b43b 100644 --- a/src/yolo_v2_class.cpp +++ b/src/yolo_v2_class.cpp @@ -1,3 +1,4 @@ +#include "darknet.h" #include "yolo_v2_class.hpp" #include "network.h"