cleanup in preparation for opencv-4 work

Stefano Sinigardi
2019-03-28 17:54:27 +01:00
parent f5fd086407
commit da74882fe1
9 changed files with 88 additions and 63 deletions

.gitignore

@@ -1,10 +1,12 @@
*.o
*.a
*.dSYM
*.csv
*.out
*.png
*.so
*.exe
*.dll
mnist/
data/
caffe/
@@ -28,7 +30,10 @@ Icon?
Thumbs.db
*.swp
# IDE generated #
.vs/
.vscode/
lib/
include/
share/

CMakeLists.txt

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.4)
cmake_minimum_required(VERSION 3.8)
set(Darknet_MAJOR_VERSION 1)
set(Darknet_MINOR_VERSION 0)
@@ -174,14 +174,14 @@ if(ENABLE_CUDA)
include_directories(${ZED_INCLUDE_DIRS})
link_directories(${ZED_LIBRARY_DIR})
add_definitions(-DZED_STEREO)
message("ZED Camera support enabled")
message(STATUS "ZED SDK enabled")
else()
message("ZED SDK not found !")
message(STATUS "ZED SDK not found")
set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE)
endif()
endif()
else()
message(WARNING "ZED SDK requires CUDA !")
message(STATUS "ZED SDK not enabled, since it requires CUDA")
set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE)
endif()
@@ -247,7 +247,7 @@ endif()
set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
add_library(darklib SHARED ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources})
add_library(darklib ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources})
set_target_properties(darklib PROPERTIES POSITION_INDEPENDENT_CODE ON)
if(ENABLE_CUDA)
set_target_properties(darklib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
@@ -323,21 +323,19 @@ endif()
set_target_properties(darklib PROPERTIES PUBLIC_HEADER "${exported_headers};${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp")
install(TARGETS darklib uselib darknet EXPORT DarknetTargets
install(TARGETS darklib EXPORT DarknetTargets
RUNTIME DESTINATION "${INSTALL_BIN_DIR}"
LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}"
COMPONENT dev
)
install(TARGETS uselib darknet
DESTINATION "${INSTALL_BIN_DIR}"
)
if(OpenCV_VERSION VERSION_GREATER "3.0" AND NOT SKIP_USELIB_TRACK)
install(TARGETS uselib_track
EXPORT DarknetTargets
RUNTIME DESTINATION "${INSTALL_BIN_DIR}"
LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}"
COMPONENT dev
DESTINATION "${INSTALL_BIN_DIR}"
)
endif()

build.ps1

@@ -1,7 +1,7 @@
#!/usr/bin/env pwsh
$number_of_build_workers=8
#$shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
$create_shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
$force_using_include_libs=$false
#$my_cuda_compute_model=75 #Compute capability for Tesla T4, RTX 2080
@@ -88,31 +88,37 @@ if ($vcpkg_path) {
# DEBUG
New-Item -Path .\build_win_debug -ItemType directory -Force
Set-Location build_win_debug
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Debug" $shared_lib $additional_build_setup ..
cmake --build . --config Debug --parallel ${number_of_build_workers} --target install
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Debug" $create_shared_lib $additional_build_setup ..
cmake --build . --config Debug --target install
#cmake --build . --config Debug --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+
Remove-Item DarknetConfig.cmake
Remove-Item DarknetConfigVersion.cmake
Set-Location ..
Copy-Item cmake\Modules\*.cmake share\darknet\
# RELEASE
New-Item -Path .\build_win_release -ItemType directory -Force
Set-Location build_win_release
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Release" $shared_lib $additional_build_setup ..
cmake --build . --config Release --parallel ${number_of_build_workers} --target install
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Release" $create_shared_lib $additional_build_setup ..
cmake --build . --config Release --target install
#cmake --build . --config Release --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+
Remove-Item DarknetConfig.cmake
Remove-Item DarknetConfigVersion.cmake
Copy-Item *.dll ..
Set-Location ..
Copy-Item cmake\Modules\*.cmake share\darknet\
}
else {
# USE LOCAL PTHREAD LIB AND LOCAL STB HEADER, NO VCPKG, ONLY RELEASE MODE SUPPORTED
# if you want to manually force this case, remove VCPKG_ROOT env variable and remember to use "vcpkg integrate remove" in case you had enabled user-wide vcpkg integration
New-Item -Path .\build_win_release_novcpkg -ItemType directory -Force
Set-Location build_win_release_novcpkg
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" $shared_lib $additional_build_setup ..
cmake --build . --config Release --parallel ${number_of_build_workers} --target install
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" $create_shared_lib $additional_build_setup ..
cmake --build . --config Release --target install
#cmake --build . --config Release --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+
Remove-Item DarknetConfig.cmake
Remove-Item DarknetConfigVersion.cmake
Copy-Item ..\3rdparty\pthreads\bin\pthreadVC2.dll ..
Set-Location ..
Copy-Item cmake\Modules\*.cmake share\darknet\
}

build.sh

@@ -1,6 +1,19 @@
#!/usr/bin/env bash
number_of_build_workers=8
create_shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
#my_cuda_compute_model=75 #Compute capability for Tesla T4, RTX 2080
#my_cuda_compute_model=72 #Compute capability for Jetson Xavier
#my_cuda_compute_model=70 #Compute capability for Tesla V100
#my_cuda_compute_model=62 #Compute capability for Jetson TX2
#my_cuda_compute_model=61 #Compute capability for Tesla P40
#my_cuda_compute_model=60 #Compute capability for Tesla P100
#my_cuda_compute_model=53 #Compute capability for Jetson TX1
#my_cuda_compute_model=52 #Compute capability for Tesla M40/M60
#my_cuda_compute_model=37 #Compute capability for Tesla K80
#my_cuda_compute_model=35 #Compute capability for Tesla K20/K40
#my_cuda_compute_model=30 #Compute capability for Tesla K10, Quadro K4000
if [[ "$OSTYPE" == "darwin"* ]]; then
OpenCV_DIR="/usr/local/Cellar/opencv@3/3.4.5"
@@ -11,11 +24,14 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
fi
fi
if [[ ! -z "$my_cuda_compute_model" ]]; then
additional_build_setup="-DCUDA_COMPUTE_MODEL=${my_cuda_compute_model}"
fi
# RELEASE
mkdir -p build_release
cd build_release
cmake .. -DCMAKE_BUILD_TYPE=Release ${additional_defines}
cmake .. -DCMAKE_BUILD_TYPE=Release ${additional_defines} ${create_shared_lib} ${additional_build_setup}
cmake --build . --target install -- -j${number_of_build_workers}
#cmake --build . --target install --parallel ${number_of_build_workers} #valid only for CMake 3.12+
rm -f DarknetConfig.cmake
@@ -26,7 +42,7 @@ cp cmake/Modules/*.cmake share/darknet
# DEBUG
mkdir -p build_debug
cd build_debug
cmake .. -DCMAKE_BUILD_TYPE=Debug ${additional_defines}
cmake .. -DCMAKE_BUILD_TYPE=Debug ${additional_defines} ${create_shared_lib} ${additional_build_setup}
cmake --build . --target install -- -j${number_of_build_workers}
#cmake --build . --target install --parallel ${number_of_build_workers} #valid only for CMake 3.12+
rm -f DarknetConfig.cmake

cmake/Modules/FindStb.cmake Executable file → Normal file

Binary file not shown.

src/convolutional_kernels.cu

@@ -184,18 +184,18 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
if (l.align_bit_weights_gpu && !state.train && l.c >= 32)
{
//return;
cudaError_t status = cudaSuccess;
int input_size = l.c*l.h*l.w*l.batch;
//cudaError_t status = cudaSuccess;
//int input_size = l.c*l.h*l.w*l.batch;
int m = l.n;
int k = l.size*l.size*l.c;
int n = l.out_w*l.out_h;
float * a = l.weights_gpu;
//float * a = l.weights_gpu;
int ldb_align = l.lda_align;
size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
size_t t_intput_size = new_ldb * n;
size_t t_bit_input_size = t_intput_size / 8;// +1;
//size_t t_intput_size = new_ldb * n;
//size_t t_bit_input_size = t_intput_size / 8;// +1;
if (l.c % 32 == 0)
{
@@ -208,8 +208,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
int ldb_align = l.lda_align;
size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
size_t t_intput_size = new_ldb * l.bit_align;// n;
size_t t_bit_input_size = t_intput_size / 8;// +1;
//size_t t_intput_size = new_ldb * l.bit_align;// n;
//size_t t_bit_input_size = t_intput_size / 8;// +1;
const int new_c = l.c / 32;
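Both hunks above keep darknet's leading-dimension round-up, new_ldb = k + (ldb_align - k % ldb_align). Note that this form always adds at least one alignment block: when k is already a multiple of ldb_align it yields k + ldb_align, not k. That is harmless for sizing the transposed-input buffer, but it differs from the exact round-up, sketched below in plain C (helper name is illustrative, not from darknet):

#include <stddef.h>

/* Exact round-up of k to the next multiple of align (align > 0). */
static size_t round_up(size_t k, size_t align) {
    return (k + align - 1) / align * align;
}

/* darknet's variant, k + (align - k % align), returns k + align when
   k % align == 0, i.e. it over-allocates by exactly one block. */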
@@ -408,7 +408,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
//fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
#ifdef CUDNN
float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT
//float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT
float alpha = 1, beta = 0;
//#ifdef CUDNN_HALF
@@ -609,7 +609,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
if(l.xnor) state.input = l.binary_input_gpu;
#ifdef CUDNN
float one = 1;
float one = 1.f;
float alpha = 1, beta = 0;
//#ifdef CUDNN_HALF

src/gemm.c

@@ -900,7 +900,7 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA,
{
__m256i b256 = *((__m256i*)&B[s*ldb + j]);
__m256i xor256 = _mm256_xor_si256(a256, b256); // xnor = xor(a,b)
__m256i all_1 = _mm256_set1_epi8(255);
__m256i all_1 = _mm256_set1_epi8((char)255);
__m256i xnor256 = _mm256_andnot_si256(xor256, all_1); // xnor = not(xor(a,b))
// waiting for - CPUID Flags: AVX512VPOPCNTDQ: __m512i _mm512_popcnt_epi32(__m512i a)
@@ -1162,7 +1162,7 @@ static inline int popcnt256_custom(__m256i n) {
}
static inline void xnor_avx2_popcnt(__m256i a_bit256, __m256i b_bit256, __m256i *count_sum) {
__m256i c_bit256 = _mm256_set1_epi8(255);
__m256i c_bit256 = _mm256_set1_epi8((char)255);
__m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
c_bit256 = _mm256_andnot_si256(xor256, c_bit256); // can be optimized - we can do other NOT for weights once and do not do this NOT
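The two gemm.c hunks above make the same pair of changes: the all-ones constant gains a (char)255 cast, since _mm256_set1_epi8 takes a signed char and a bare 255 triggers narrowing-conversion warnings on some compilers, and the comments document the XNOR trick: AVX2 has no vector NOT, so xnor(a, b) is computed as andnot(xor(a, b), all_ones). A self-contained sketch of that step in plain C with AVX2 intrinsics, assuming GCC/Clang popcount builtins (function name is illustrative):

#include <immintrin.h>  /* compile with -mavx2 */
#include <stdint.h>

/* popcount(xnor(a, b)) over two 256-bit vectors: counts matching bits.
   andnot(x, all_1) = ~x & all_1 = ~x, which supplies the missing NOT. */
static inline int xnor_popcnt_256(__m256i a, __m256i b) {
    __m256i all_1   = _mm256_set1_epi8((char)255);
    __m256i xor256  = _mm256_xor_si256(a, b);
    __m256i xnor256 = _mm256_andnot_si256(xor256, all_1);
    /* AVX2 has no 256-bit popcount; fall back to four 64-bit popcounts */
    return __builtin_popcountll((uint64_t)_mm256_extract_epi64(xnor256, 0))
         + __builtin_popcountll((uint64_t)_mm256_extract_epi64(xnor256, 1))
         + __builtin_popcountll((uint64_t)_mm256_extract_epi64(xnor256, 2))
         + __builtin_popcountll((uint64_t)_mm256_extract_epi64(xnor256, 3));
}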

src/im2col_kernels.cu

@@ -228,8 +228,8 @@ __global__ void im2col_align_bin_gpu_kernel(const int n, const float* data_im,
const int height_col, const int width_col,
float *data_col, const int bit_align)
{
__shared__ float tmp_s[1];
__shared__ ulonglong4 tmp256_s[1];
//__shared__ float tmp_s[1];
//__shared__ ulonglong4 tmp256_s[1];
//#define SHRED_VALS ((BLOCK / 169) * )
@@ -414,7 +414,7 @@ __global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t s
if ((index + i * 1024) < size) src_val = src[index + i*1024];
else src_val = 0;
//unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
const int num_of_warps = blockDim.x / WARP_SIZE;
//const int num_of_warps = blockDim.x / WARP_SIZE;
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
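float_to_bit_gpu_kernel packs the sign of each float into one bit; the commented __ballot_sync line is the warp-vote variant, while the surviving path derives warp and lane ids by hand. The packing convention itself is simple. A plain-C reference under the same convention, bit i of dst set iff src[i] > 0 (function name is mine, not darknet's):

#include <stddef.h>

/* Pack float signs into a bit array: bit i of dst is 1 iff src[i] > 0.
   Both branches write, so dst does not need to be zeroed beforehand. */
static void float_to_bit_ref(const float *src, unsigned char *dst, size_t size) {
    for (size_t i = 0; i < size; ++i) {
        if (src[i] > 0) dst[i / 8] |= (unsigned char)(1u << (i % 8));
        else            dst[i / 8] &= (unsigned char)~(1u << (i % 8));
    }
}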
@@ -436,7 +436,7 @@ void float_to_bit_gpu(float *src, unsigned char *dst, size_t size)
}
// --------------------------------
/*
__device__ __host__ static inline void remove_bit(unsigned char *const dst, size_t index) {
size_t dst_i = index / 8;
int dst_shift = index % 8;
@@ -449,6 +449,7 @@ __device__ __host__ static inline void set_bit(unsigned char *const dst, size_t
dst[dst_i] |= 1 << dst_shift;
//dst[dst_i] |= 1 << (8 - dst_shift);
}
*/
__device__ __host__ static inline unsigned char get_bit(unsigned char const*const src, size_t index) {
size_t src_i = index / 8;
@@ -643,9 +644,9 @@ __global__ void transpose_bin_gpu_kernel_32(uint32_t *A, uint32_t *B, const int
void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m,
const int lda, const int ldb, const int block_size)
{
int size = n*m/ (8*8) + 1;
//int size = n*m/ (8*8) + 1;
int size32 = n*m / (32*32) + 1;
const int num_blocks = size / BLOCK + 1;
//const int num_blocks = size / BLOCK + 1;
const int num_blocks32 = size32 / BLOCK_TRANSPOSE32 + 1;
transpose_bin_gpu_kernel_32 << <num_blocks32, BLOCK_TRANSPOSE32, 0, get_cuda_stream() >> >((uint32_t *)A, (uint32_t *)B, n, m, lda, ldb, block_size);
//transpose_bin_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(A, B, n, m, lda, ldb, block_size);
@@ -692,10 +693,10 @@ __global__ void transpose_uint32_kernel_2(uint32_t *src, uint32_t *dst, int src_
//new_ldb - aligned (k) by 256
const int src_w_align = src_w + (32 - src_w % 32);
const int src_h_align = src_h + (32 - src_h % 32);
//const int src_h_align = src_h + (32 - src_h % 32);
const int warps_in_width = src_w_align / 32;
const int warps_in_height = src_h_align / 32;
//const int warps_in_height = src_h_align / 32;
@@ -789,7 +790,7 @@ void repack_input_gpu(float *input, float *re_packed_input, int w, int h, int c)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_2(float *input, float *re_packed_input, int w, int h, int c)
{
__shared__ uint32_t tmp[33 * 32]; // 33x32 is misaligned 32 x 32 to avoid bank conflicts
//__shared__ uint32_t tmp[33 * 32]; // 33x32 is misaligned 32 x 32 to avoid bank conflicts
int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -943,15 +944,15 @@ void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size) {
//typedef unsigned int uint32_t;
//typedef unsigned char uint8_t;
//typedef char int8_t;
/*
__device__ __host__ static inline uint64_t broadcast_bit_1_to_64(uint8_t src) {
return (src > 0) ? 0xFFFFFFFFFFFFFFFF : 0;
}
*/
__device__ __host__ static inline uint8_t xnor_bit1(uint8_t a, uint8_t b) {
return ~(a^b) & 0b1;
}
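xnor_bit1 is the scalar reference for the whole binary-GEMM scheme: with +1/-1 values stored as single bits, a dot product reduces to counting matching bit positions, matches = popcount(xnor(a, b)), then mapping the count back to signed arithmetic as dot = 2 * matches - n. A minimal plain-C illustration, assuming n_bits is a multiple of 64 (helper name is mine):

#include <stdint.h>

/* Dot product of two bit-packed +1/-1 vectors of length n_bits:
   matches - mismatches = 2 * matches - n_bits. */
static int binary_dot(const uint64_t *a, const uint64_t *b, int n_bits) {
    int matches = 0;
    for (int i = 0; i < n_bits / 64; ++i)
        matches += __builtin_popcountll(~(a[i] ^ b[i]));
    return 2 * matches - n_bits;
}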
/*
__device__ __host__ static inline uint32_t xnor_int32(uint32_t a, uint32_t b) {
return ~(a^b);
}
@@ -977,13 +978,13 @@ __device__ __host__ static inline ulonglong4 xnor_int256(ulonglong4 a, ulonglong
res.z = ~(a.z^b.z);
return res;
}
*/
//-------
/*
__device__ __host__ static inline uint8_t xor_bit1(uint8_t a, uint8_t b) {
return (a^b) & 0b1;
}
*/
__device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) {
return (a^b);
}
@@ -991,7 +992,7 @@ __device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) {
__device__ __host__ static inline uint64_t xor_int64(uint64_t a, uint64_t b) {
return (a^b);
}
/*
__device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) {
uint4 res;
res.w = (a.w^b.w);
@@ -1000,7 +1001,7 @@ __device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) {
res.z = (a.z^b.z);
return res;
}
*/
__device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4 b) {
ulonglong4 res;
res.w = (a.w^b.w);
@@ -1010,12 +1011,11 @@ __device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4
return res;
}
/*
__device__ static inline int popcnt_256(ulonglong4 a) {
return __popcll(a.w) + __popcll(a.x) + __popcll(a.y) + __popcll(a.z);
}
/*
__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
unsigned char *A, int lda,
unsigned char *B, int ldb,
@@ -1269,7 +1269,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
*/
int i, j, k, h;
int i, j, k;//, h;
// 47% = 29 + 10 + 8
j = global_warp_id % (N_aligned / WMMA_Nx2);
j = j * WMMA_Nx2;
@@ -1277,7 +1277,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
i = global_warp_id / (N_aligned / WMMA_Nx2);
i = i * WMMA_M;
int count = 0;
//int count = 0;
k = 0;
if (i < M) //if (i < M) // l.n - filters [16 - 55 - 1024]
@@ -1323,7 +1323,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
// Custom XOR-GEMM
int k_d = lane_id % 4;
int i_d = lane_id / 4;
int j_d = lane_id / 4;
//int j_d = lane_id / 4;
int32_t accum_c_val[8*2]; // wmma::fill_fragment(c_frag, 0);
for (int local_j = 0; local_j < 8*2; ++local_j) {
@@ -1333,9 +1333,9 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
// 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
for (; k < K; k += 128) // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
{
int64_t A_cur_index = (i*lda + k) / 8;
//int64_t A_cur_index = (i*lda + k) / 8;
//int64_t A_cur_index = (local_i*lda + k) / 8;
int64_t B_cur_index = (j*ldb + k) / 8;
//int64_t B_cur_index = (j*ldb + k) / 8;
// lda, ldb - are in bits
// 8*4 = 32
@@ -1593,7 +1593,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
}
__syncthreads();
int i, j, k, h;
int i, j, k; //, h;
// 47% = 29 + 10 + 8
j = index % N;
{ // out_h*out_w - one channel output size [169 - 173056]
@@ -2115,7 +2115,7 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
for (chan = 0; chan < in_c; ++chan)
{
//int const weights_pre_index = fil*in_c*size*size + chan*size*size;
int const weights_pre_index = fil*new_lda + chan*size*size;
//int const weights_pre_index = fil*new_lda + chan*size*size;
int const input_pre_index = chan*in_w*in_h;
__shared__ uint32_t input_shared[416*416/32 + 1]; // 21.2 KB bytes (for input size 832x832)
@@ -2139,8 +2139,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
}
__syncthreads();
*/
int src_index = -1;
uint32_t input_byte;
//int src_index = -1;
//uint32_t input_byte;
if (fil < n) // (1-6 for one BLOCK)
{
@@ -2154,8 +2154,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
int input_x = x + f_x - pad;
if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;
int input_index = input_pre_index + input_y*in_w + input_x;
int weights_index = weights_pre_index + f_y*size + f_x;
//int input_index = input_pre_index + input_y*in_w + input_x;
//int weights_index = weights_pre_index + f_y*size + f_x;
//int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
//int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;
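The commented-out index math above is plain row-major addressing into weights laid out as [filters][channels][size][size]; the bit-packed path replaces the channel stride in_c*size*size with the aligned new_lda. A sketch of the unpacked form (helper name is mine, not darknet's):

/* Row-major index into conv weights shaped [n][in_c][size][size]:
   fil*in_c*size*size + chan*size*size + f_y*size + f_x, factored. */
static inline int weight_index(int fil, int chan, int f_y, int f_x,
                               int in_c, int size) {
    return ((fil * in_c + chan) * size + f_y) * size + f_x;
}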