diff --git a/.gitignore b/.gitignore index b415cea3..68a7a13b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,12 @@ *.o +*.a *.dSYM *.csv *.out *.png *.so *.exe +*.dll mnist/ data/ caffe/ @@ -28,7 +30,10 @@ Icon? Thumbs.db *.swp +# IDE generated # +.vs/ +.vscode/ + lib/ include/ share/ - diff --git a/CMakeLists.txt b/CMakeLists.txt index 5668b934..f22ba83a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.4) +cmake_minimum_required(VERSION 3.8) set(Darknet_MAJOR_VERSION 1) set(Darknet_MINOR_VERSION 0) @@ -174,14 +174,14 @@ if(ENABLE_CUDA) include_directories(${ZED_INCLUDE_DIRS}) link_directories(${ZED_LIBRARY_DIR}) add_definitions(-DZED_STEREO) - message("ZED Camera support enabled") + message(STATUS "ZED SDK enabled") else() - message("ZED SDK not found !") + message(STATUS "ZED SDK not found") set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE) endif() endif() else() - message(WARNING "ZED SDK requires CUDA !") + message(STATUS "ZED SDK not enabled, since it requires CUDA") set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE) endif() @@ -247,7 +247,7 @@ endif() set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX) -add_library(darklib SHARED ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources}) +add_library(darklib ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources}) set_target_properties(darklib PROPERTIES POSITION_INDEPENDENT_CODE ON) if(ENABLE_CUDA) set_target_properties(darklib PROPERTIES CUDA_SEPARABLE_COMPILATION ON) @@ -323,21 +323,19 @@ endif() set_target_properties(darklib PROPERTIES PUBLIC_HEADER "${exported_headers};${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp") -install(TARGETS darklib uselib darknet EXPORT DarknetTargets +install(TARGETS darklib EXPORT DarknetTargets RUNTIME DESTINATION "${INSTALL_BIN_DIR}" LIBRARY DESTINATION "${INSTALL_LIB_DIR}" ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}" COMPONENT dev ) +install(TARGETS uselib darknet + DESTINATION "${INSTALL_BIN_DIR}" +) if(OpenCV_VERSION VERSION_GREATER "3.0" AND NOT SKIP_USELIB_TRACK) install(TARGETS uselib_track - EXPORT DarknetTargets - RUNTIME DESTINATION "${INSTALL_BIN_DIR}" - LIBRARY DESTINATION "${INSTALL_LIB_DIR}" - ARCHIVE DESTINATION "${INSTALL_LIB_DIR}" - PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}" - COMPONENT dev + DESTINATION "${INSTALL_BIN_DIR}" ) endif() diff --git a/build.ps1 b/build.ps1 index 7063098c..0fd60a93 100755 --- a/build.ps1 +++ b/build.ps1 @@ -1,7 +1,7 @@ #!/usr/bin/env pwsh $number_of_build_workers=8 -#$shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON" +$create_shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON" $force_using_include_libs=$false #$my_cuda_compute_model=75 #Compute capability for Tesla T4, RTX 2080 @@ -88,31 +88,37 @@ if ($vcpkg_path) { # DEBUG New-Item -Path .\build_win_debug -ItemType directory -Force Set-Location build_win_debug - cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Debug" $shared_lib $additional_build_setup .. - cmake --build . --config Debug --parallel ${number_of_build_workers} --target install + cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Debug" $create_shared_lib $additional_build_setup .. + cmake --build . --config Debug --target install + #cmake --build . --config Debug --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+ Remove-Item DarknetConfig.cmake Remove-Item DarknetConfigVersion.cmake Set-Location .. + Copy-Item cmake\Modules\*.cmake share\darknet\ # RELEASE New-Item -Path .\build_win_release -ItemType directory -Force Set-Location build_win_release - cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Release" $shared_lib $additional_build_setup .. - cmake --build . --config Release --parallel ${number_of_build_workers} --target install + cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Release" $create_shared_lib $additional_build_setup .. + cmake --build . --config Release --target install + #cmake --build . --config Release --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+ Remove-Item DarknetConfig.cmake Remove-Item DarknetConfigVersion.cmake Copy-Item *.dll .. Set-Location .. + Copy-Item cmake\Modules\*.cmake share\darknet\ } else { # USE LOCAL PTHREAD LIB AND LOCAL STB HEADER, NO VCPKG, ONLY RELEASE MODE SUPPORTED # if you want to manually force this case, remove VCPKG_ROOT env variable and remember to use "vcpkg integrate remove" in case you had enabled user-wide vcpkg integration New-Item -Path .\build_win_release_novcpkg -ItemType directory -Force Set-Location build_win_release_novcpkg - cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" $shared_lib $additional_build_setup .. - cmake --build . --config Release --parallel ${number_of_build_workers} --target install + cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" $create_shared_lib $additional_build_setup .. + cmake --build . --config Release --target install + #cmake --build . --config Release --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+ Remove-Item DarknetConfig.cmake Remove-Item DarknetConfigVersion.cmake Copy-Item ..\3rdparty\pthreads\bin\pthreadVC2.dll .. Set-Location .. + Copy-Item cmake\Modules\*.cmake share\darknet\ } diff --git a/build.sh b/build.sh index 7ff54d91..f74351d1 100755 --- a/build.sh +++ b/build.sh @@ -1,6 +1,19 @@ #!/usr/bin/env bash number_of_build_workers=8 +create_shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON" + +#my_cuda_compute_model=75 #Compute capability for Tesla T4, RTX 2080 +#my_cuda_compute_model=72 #Compute capability for Jetson Xavier +#my_cuda_compute_model=70 #Compute capability for Tesla V100 +#my_cuda_compute_model=62 #Compute capability for Jetson TX2 +#my_cuda_compute_model=61 #Compute capability for Tesla P40 +#my_cuda_compute_model=60 #Compute capability for Tesla P100 +#my_cuda_compute_model=53 #Compute capability for Jetson TX1 +#my_cuda_compute_model=52 #Compute capability for Tesla M40/M60 +#my_cuda_compute_model=37 #Compute capability for Tesla K80 +#my_cuda_compute_model=35 #Compute capability for Tesla K20/K40 +#my_cuda_compute_model=30 #Compute capability for Tesla K10, Quadro K4000 if [[ "$OSTYPE" == "darwin"* ]]; then OpenCV_DIR="/usr/local/Cellar/opencv@3/3.4.5" @@ -11,11 +24,14 @@ if [[ "$OSTYPE" == "darwin"* ]]; then fi fi +if [[ ! -z "$my_cuda_compute_model" ]]; then + additional_build_setup="-DCUDA_COMPUTE_MODEL=${my_cuda_compute_model}" +fi # RELEASE mkdir -p build_release cd build_release -cmake .. -DCMAKE_BUILD_TYPE=Release ${additional_defines} +cmake .. -DCMAKE_BUILD_TYPE=Release ${additional_defines} ${create_shared_lib} ${additional_build_setup} cmake --build . --target install -- -j${number_of_build_workers} #cmake --build . --target install --parallel ${number_of_build_workers} #valid only for CMake 3.12+ rm -f DarknetConfig.cmake @@ -26,7 +42,7 @@ cp cmake/Modules/*.cmake share/darknet # DEBUG mkdir -p build_debug cd build_debug -cmake .. -DCMAKE_BUILD_TYPE=Debug ${additional_defines} +cmake .. -DCMAKE_BUILD_TYPE=Debug ${additional_defines} ${create_shared_lib} ${additional_build_setup} cmake --build . --target install -- -j${number_of_build_workers} #cmake --build . --target install --parallel ${number_of_build_workers} #valid only for CMake 3.12+ rm -f DarknetConfig.cmake diff --git a/cmake/Modules/FindStb.cmake b/cmake/Modules/FindStb.cmake old mode 100755 new mode 100644 diff --git a/pthreadVC2.dll b/pthreadVC2.dll deleted file mode 100644 index 165b4d26..00000000 Binary files a/pthreadVC2.dll and /dev/null differ diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu index 8ae99710..cfa4fe7e 100644 --- a/src/convolutional_kernels.cu +++ b/src/convolutional_kernels.cu @@ -184,18 +184,18 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) if (l.align_bit_weights_gpu && !state.train && l.c >= 32) { //return; - cudaError_t status = cudaSuccess; - int input_size = l.c*l.h*l.w*l.batch; + //cudaError_t status = cudaSuccess; + //int input_size = l.c*l.h*l.w*l.batch; int m = l.n; int k = l.size*l.size*l.c; int n = l.out_w*l.out_h; - float * a = l.weights_gpu; + //float * a = l.weights_gpu; int ldb_align = l.lda_align; size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; - size_t t_intput_size = new_ldb * n; - size_t t_bit_input_size = t_intput_size / 8;// +1; + //size_t t_intput_size = new_ldb * n; + //size_t t_bit_input_size = t_intput_size / 8;// +1; if (l.c % 32 == 0) { @@ -208,8 +208,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) int ldb_align = l.lda_align; size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8; - size_t t_intput_size = new_ldb * l.bit_align;// n; - size_t t_bit_input_size = t_intput_size / 8;// +1; + //size_t t_intput_size = new_ldb * l.bit_align;// n; + //size_t t_bit_input_size = t_intput_size / 8;// +1; const int new_c = l.c / 32; @@ -408,7 +408,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state) //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1); #ifdef CUDNN - float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT + //float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT float alpha = 1, beta = 0; //#ifdef CUDNN_HALF @@ -609,7 +609,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state if(l.xnor) state.input = l.binary_input_gpu; #ifdef CUDNN - float one = 1; + float one = 1.f; float alpha = 1, beta = 0; //#ifdef CUDNN_HALF diff --git a/src/gemm.c b/src/gemm.c index 861a1906..336f3bdd 100644 --- a/src/gemm.c +++ b/src/gemm.c @@ -900,7 +900,7 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA, { __m256i b256 = *((__m256i*)&B[s*ldb + j]); __m256i xor256 = _mm256_xor_si256(a256, b256); // xnor = xor(a,b) - __m256i all_1 = _mm256_set1_epi8(255); + __m256i all_1 = _mm256_set1_epi8((char)255); __m256i xnor256 = _mm256_andnot_si256(xor256, all_1); // xnor = not(xor(a,b)) // waiting for - CPUID Flags: AVX512VPOPCNTDQ: __m512i _mm512_popcnt_epi32(__m512i a) @@ -1162,7 +1162,7 @@ static inline int popcnt256_custom(__m256i n) { } static inline void xnor_avx2_popcnt(__m256i a_bit256, __m256i b_bit256, __m256i *count_sum) { - __m256i c_bit256 = _mm256_set1_epi8(255); + __m256i c_bit256 = _mm256_set1_epi8((char)255); __m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b)) c_bit256 = _mm256_andnot_si256(xor256, c_bit256); // can be optimized - we can do other NOT for wegihts once and do not do this NOT diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu index 539824a9..954a4694 100644 --- a/src/im2col_kernels.cu +++ b/src/im2col_kernels.cu @@ -228,8 +228,8 @@ __global__ void im2col_align_bin_gpu_kernel(const int n, const float* data_im, const int height_col, const int width_col, float *data_col, const int bit_align) { - __shared__ float tmp_s[1]; - __shared__ ulonglong4 tmp256_s[1]; + //__shared__ float tmp_s[1]; + //__shared__ ulonglong4 tmp256_s[1]; //#define SHRED_VALS ((BLOCK / 169) * ) @@ -414,7 +414,7 @@ __global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t s if ((index + i * 1024) < size) src_val = src[index + i*1024]; else src_val = 0; //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0); - const int num_of_warps = blockDim.x / WARP_SIZE; + //const int num_of_warps = blockDim.x / WARP_SIZE; const int warp_id = threadIdx.x / WARP_SIZE; const int lane_id = threadIdx.x % WARP_SIZE; @@ -436,7 +436,7 @@ void float_to_bit_gpu(float *src, unsigned char *dst, size_t size) } // -------------------------------- - +/* __device__ __host__ static inline void remove_bit(unsigned char *const dst, size_t index) { size_t dst_i = index / 8; int dst_shift = index % 8; @@ -449,6 +449,7 @@ __device__ __host__ static inline void set_bit(unsigned char *const dst, size_t dst[dst_i] |= 1 << dst_shift; //dst[dst_i] |= 1 << (8 - dst_shift); } +*/ __device__ __host__ static inline unsigned char get_bit(unsigned char const*const src, size_t index) { size_t src_i = index / 8; @@ -643,9 +644,9 @@ __global__ void transpose_bin_gpu_kernel_32(uint32_t *A, uint32_t *B, const int void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m, const int lda, const int ldb, const int block_size) { - int size = n*m/ (8*8) + 1; + //int size = n*m/ (8*8) + 1; int size32 = n*m / (32*32) + 1; - const int num_blocks = size / BLOCK + 1; + //const int num_blocks = size / BLOCK + 1; const int num_blocks32 = size32 / BLOCK_TRANSPOSE32 + 1; transpose_bin_gpu_kernel_32 << > >((uint32_t *)A, (uint32_t *)B, n, m, lda, ldb, block_size); //transpose_bin_gpu_kernel << > >(A, B, n, m, lda, ldb, block_size); @@ -692,10 +693,10 @@ __global__ void transpose_uint32_kernel_2(uint32_t *src, uint32_t *dst, int src_ //new_ldb - aligned (k) by 256 const int src_w_align = src_w + (32 - src_w % 32); - const int src_h_align = src_h + (32 - src_h % 32); + //const int src_h_align = src_h + (32 - src_h % 32); const int warps_in_width = src_w_align / 32; - const int warps_in_height = src_h_align / 32; + //const int warps_in_height = src_h_align / 32; @@ -789,7 +790,7 @@ void repack_input_gpu(float *input, float *re_packed_input, int w, int h, int c) // 256 channels -> 8 channels (with 32 floats) __global__ void repack_input_kernel_2(float *input, float *re_packed_input, int w, int h, int c) { - __shared__ uint32_t tmp[33 * 32]; // 33x32 is misaligned 32 x 32 to avoid bank conflicts + //__shared__ uint32_t tmp[33 * 32]; // 33x32 is misaligned 32 x 32 to avoid bank conflicts int index = blockIdx.x*blockDim.x + threadIdx.x; @@ -943,15 +944,15 @@ void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size) { //typedef unsigned int uint32_t; //typedef unsigned char uint8_t; //typedef char int8_t; - +/* __device__ __host__ static inline uint64_t broadcast_bit_1_to_64(uint8_t src) { return (src > 0) ? 0xFFFFFFFFFFFFFFFF : 0; } - +*/ __device__ __host__ static inline uint8_t xnor_bit1(uint8_t a, uint8_t b) { return ~(a^b) & 0b1; } - +/* __device__ __host__ static inline uint32_t xnor_int32(uint32_t a, uint32_t b) { return ~(a^b); } @@ -977,13 +978,13 @@ __device__ __host__ static inline ulonglong4 xnor_int256(ulonglong4 a, ulonglong res.z = ~(a.z^b.z); return res; } - +*/ //------- - +/* __device__ __host__ static inline uint8_t xor_bit1(uint8_t a, uint8_t b) { return (a^b) & 0b1; } - +*/ __device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) { return (a^b); } @@ -991,7 +992,7 @@ __device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) { __device__ __host__ static inline uint64_t xor_int64(uint64_t a, uint64_t b) { return (a^b); } - +/* __device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) { uint4 res; res.w = (a.w^b.w); @@ -1000,7 +1001,7 @@ __device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) { res.z = (a.z^b.z); return res; } - +*/ __device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4 b) { ulonglong4 res; res.w = (a.w^b.w); @@ -1010,12 +1011,11 @@ __device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4 return res; } - +/* __device__ static inline int popcnt_256(ulonglong4 a) { return __popcll(a.w) + __popcll(a.x) + __popcll(a.y) + __popcll(a.z); } -/* __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K, unsigned char *A, int lda, unsigned char *B, int ldb, @@ -1269,7 +1269,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i */ - int i, j, k, h; + int i, j, k;//, h; // 47% = 29 + 10 + 8 j = global_warp_id % (N_aligned / WMMA_Nx2); j = j * WMMA_Nx2; @@ -1277,7 +1277,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i i = global_warp_id / (N_aligned / WMMA_Nx2); i = i * WMMA_M; - int count = 0; + //int count = 0; k = 0; if (i < M) //if (i < M) // l.n - filters [16 - 55 - 1024] @@ -1323,7 +1323,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i // Custom XOR-GEMM int k_d = lane_id % 4; int i_d = lane_id / 4; - int j_d = lane_id / 4; + //int j_d = lane_id / 4; int32_t accum_c_val[8*2]; // wmma::fill_fragment(c_frag, 0); for (int local_j = 0; local_j < 8*2; ++local_j) { @@ -1333,9 +1333,9 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit) for (; k < K; k += 128) // l.size*l.size*l.c - one filter size [27 - 144 - 9216] { - int64_t A_cur_index = (i*lda + k) / 8; + //int64_t A_cur_index = (i*lda + k) / 8; //int64_t A_cur_index = (local_i*lda + k) / 8; - int64_t B_cur_index = (j*ldb + k) / 8; + //int64_t B_cur_index = (j*ldb + k) / 8; // lda, ldb - are in bits // 8*4 = 32 @@ -1593,7 +1593,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int } __syncthreads(); - int i, j, k, h; + int i, j, k; //, h; // 47% = 29 + 10 + 8 j = index % N; { // out_h*out_w - one channel output size [169 - 173056] @@ -2115,7 +2115,7 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out for (chan = 0; chan < in_c; ++chan) { //int const weights_pre_index = fil*in_c*size*size + chan*size*size; - int const weights_pre_index = fil*new_lda + chan*size*size; + //int const weights_pre_index = fil*new_lda + chan*size*size; int const input_pre_index = chan*in_w*in_h; __shared__ uint32_t input_shared[416*416/32 + 1]; // 21.2 KB bytes (for input size 832x832) @@ -2139,8 +2139,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out } __syncthreads(); */ - int src_index = -1; - uint32_t input_byte; + //int src_index = -1; + //uint32_t input_byte; if (fil < n) // (1-6 for one BLOCK) { @@ -2154,8 +2154,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out int input_x = x + f_x - pad; if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue; - int input_index = input_pre_index + input_y*in_w + input_x; - int weights_index = weights_pre_index + f_y*size + f_x; + //int input_index = input_pre_index + input_y*in_w + input_x; + //int weights_index = weights_pre_index + f_y*size + f_x; //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x; //int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;