mirror of https://github.com/pjreddie/darknet.git
synced 2023-08-10 21:13:14 +03:00

Commit: cleanup in preparation to opencv-4 work
.gitignore (vendored): 7 changes

@@ -1,10 +1,12 @@
 *.o
 *.a
 *.dSYM
 *.csv
 *.out
 *.png
 *.so
+*.exe
+*.dll
 mnist/
 data/
 caffe/
@@ -28,7 +30,10 @@ Icon?
 Thumbs.db
 *.swp

 # IDE generated #
 .vs/
 .vscode/

+lib/
+include/
+share/
CMakeLists.txt

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.4)
+cmake_minimum_required(VERSION 3.8)

 set(Darknet_MAJOR_VERSION 1)
 set(Darknet_MINOR_VERSION 0)
@@ -174,14 +174,14 @@ if(ENABLE_CUDA)
 include_directories(${ZED_INCLUDE_DIRS})
 link_directories(${ZED_LIBRARY_DIR})
 add_definitions(-DZED_STEREO)
-message("ZED Camera support enabled")
+message(STATUS "ZED SDK enabled")
 else()
-message("ZED SDK not found !")
+message(STATUS "ZED SDK not found")
 set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE)
 endif()
 endif()
 else()
-message(WARNING "ZED SDK requires CUDA !")
+message(STATUS "ZED SDK not enabled, since it requires CUDA")
 set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE)
 endif()
@@ -247,7 +247,7 @@ endif()

 set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)

-add_library(darklib SHARED ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources})
+add_library(darklib ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources})
 set_target_properties(darklib PROPERTIES POSITION_INDEPENDENT_CODE ON)
 if(ENABLE_CUDA)
 set_target_properties(darklib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
@@ -323,21 +323,19 @@ endif()

 set_target_properties(darklib PROPERTIES PUBLIC_HEADER "${exported_headers};${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp")

-install(TARGETS darklib uselib darknet EXPORT DarknetTargets
+install(TARGETS darklib EXPORT DarknetTargets
 RUNTIME DESTINATION "${INSTALL_BIN_DIR}"
 LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
 ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
 PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}"
 COMPONENT dev
 )
+install(TARGETS uselib darknet
+DESTINATION "${INSTALL_BIN_DIR}"
+)
 if(OpenCV_VERSION VERSION_GREATER "3.0" AND NOT SKIP_USELIB_TRACK)
 install(TARGETS uselib_track
-EXPORT DarknetTargets
-RUNTIME DESTINATION "${INSTALL_BIN_DIR}"
-LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
-ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
-PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}"
-COMPONENT dev
+DESTINATION "${INSTALL_BIN_DIR}"
 )
 endif()
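Note on the install() split above: only darklib is exported as an importable target (library plus the yolo_v2_class.hpp public header), while uselib and darknet are installed as plain executables. A minimal consumer sketch against the installed library follows; Detector, image_t, and bbox_t are the names declared in yolo_v2_class.hpp at this point in the repo's history, but treat the exact signatures and the cfg/weights/image file names here as assumptions to verify against your checkout:

#include <yolo_v2_class.hpp> // installed via the PUBLIC_HEADER rule above
#include <cstdio>
#include <vector>

int main() {
    // cfg/weights/image paths are placeholders
    Detector detector("yolov3.cfg", "yolov3.weights");
    image_t img = detector.load_image("dog.jpg");
    std::vector<bbox_t> boxes = detector.detect(img, 0.25f /* threshold */);
    for (const bbox_t &b : boxes)
        std::printf("obj_id=%u prob=%.2f box=(%u,%u,%u,%u)\n",
                    b.obj_id, b.prob, b.x, b.y, b.w, b.h);
    detector.free_image(img);
    return 0;
}

Link against the installed darklib and add ${INSTALL_INCLUDE_DIR} to the include path.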
build.ps1: 20 changes

@@ -1,7 +1,7 @@
 #!/usr/bin/env pwsh

 $number_of_build_workers=8
-#$shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
+$create_shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
 $force_using_include_libs=$false

 #$my_cuda_compute_model=75 #Compute capability for Tesla T4, RTX 2080
@@ -88,31 +88,37 @@ if ($vcpkg_path) {
 # DEBUG
 New-Item -Path .\build_win_debug -ItemType directory -Force
 Set-Location build_win_debug
-cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Debug" $shared_lib $additional_build_setup ..
-cmake --build . --config Debug --parallel ${number_of_build_workers} --target install
+cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Debug" $create_shared_lib $additional_build_setup ..
+cmake --build . --config Debug --target install
+#cmake --build . --config Debug --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+
 Remove-Item DarknetConfig.cmake
 Remove-Item DarknetConfigVersion.cmake
 Set-Location ..
+Copy-Item cmake\Modules\*.cmake share\darknet\

 # RELEASE
 New-Item -Path .\build_win_release -ItemType directory -Force
 Set-Location build_win_release
-cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Release" $shared_lib $additional_build_setup ..
-cmake --build . --config Release --parallel ${number_of_build_workers} --target install
+cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Release" $create_shared_lib $additional_build_setup ..
+cmake --build . --config Release --target install
+#cmake --build . --config Release --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+
 Remove-Item DarknetConfig.cmake
 Remove-Item DarknetConfigVersion.cmake
 Copy-Item *.dll ..
 Set-Location ..
+Copy-Item cmake\Modules\*.cmake share\darknet\
 }
 else {
 # USE LOCAL PTHREAD LIB AND LOCAL STB HEADER, NO VCPKG, ONLY RELEASE MODE SUPPORTED
 # if you want to manually force this case, remove VCPKG_ROOT env variable and remember to use "vcpkg integrate remove" in case you had enabled user-wide vcpkg integration
 New-Item -Path .\build_win_release_novcpkg -ItemType directory -Force
 Set-Location build_win_release_novcpkg
-cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" $shared_lib $additional_build_setup ..
-cmake --build . --config Release --parallel ${number_of_build_workers} --target install
+cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" $create_shared_lib $additional_build_setup ..
+cmake --build . --config Release --target install
+#cmake --build . --config Release --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+
 Remove-Item DarknetConfig.cmake
 Remove-Item DarknetConfigVersion.cmake
 Copy-Item ..\3rdparty\pthreads\bin\pthreadVC2.dll ..
 Set-Location ..
+Copy-Item cmake\Modules\*.cmake share\darknet\
 }
build.sh: 20 changes

@@ -1,6 +1,19 @@
 #!/usr/bin/env bash

 number_of_build_workers=8
+create_shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
+
+#my_cuda_compute_model=75 #Compute capability for Tesla T4, RTX 2080
+#my_cuda_compute_model=72 #Compute capability for Jetson Xavier
+#my_cuda_compute_model=70 #Compute capability for Tesla V100
+#my_cuda_compute_model=62 #Compute capability for Jetson TX2
+#my_cuda_compute_model=61 #Compute capability for Tesla P40
+#my_cuda_compute_model=60 #Compute capability for Tesla P100
+#my_cuda_compute_model=53 #Compute capability for Jetson TX1
+#my_cuda_compute_model=52 #Compute capability for Tesla M40/M60
+#my_cuda_compute_model=37 #Compute capability for Tesla K80
+#my_cuda_compute_model=35 #Compute capability for Tesla K20/K40
+#my_cuda_compute_model=30 #Compute capability for Tesla K10, Quadro K4000

 if [[ "$OSTYPE" == "darwin"* ]]; then
 OpenCV_DIR="/usr/local/Cellar/opencv@3/3.4.5"
@@ -11,11 +24,14 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
 fi
 fi

+if [[ ! -z "$my_cuda_compute_model" ]]; then
+additional_build_setup="-DCUDA_COMPUTE_MODEL=${my_cuda_compute_model}"
+fi

 # RELEASE
 mkdir -p build_release
 cd build_release
-cmake .. -DCMAKE_BUILD_TYPE=Release ${additional_defines}
+cmake .. -DCMAKE_BUILD_TYPE=Release ${additional_defines} ${create_shared_lib} ${additional_build_setup}
 cmake --build . --target install -- -j${number_of_build_workers}
 #cmake --build . --target install --parallel ${number_of_build_workers} #valid only for CMake 3.12+
 rm -f DarknetConfig.cmake
@@ -26,7 +42,7 @@ cp cmake/Modules/*.cmake share/darknet
 # DEBUG
 mkdir -p build_debug
 cd build_debug
-cmake .. -DCMAKE_BUILD_TYPE=Debug ${additional_defines}
+cmake .. -DCMAKE_BUILD_TYPE=Debug ${additional_defines} ${create_shared_lib} ${additional_build_setup}
 cmake --build . --target install -- -j${number_of_build_workers}
 #cmake --build . --target install --parallel ${number_of_build_workers} #valid only for CMake 3.12+
 rm -f DarknetConfig.cmake
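The commented my_cuda_compute_model table above maps boards to CUDA compute capabilities. If you are unsure which value fits your GPU, the CUDA runtime can report it; a small standalone query, not part of the repo (assumes the CUDA toolkit; compile with nvcc query.cu):

// query.cu: print each visible GPU's compute capability in the
// two-digit form used by CUDA_COMPUTE_MODEL above (e.g. 7.5 -> 75).
#include <cstdio>
#include <cuda_runtime.h>

int main() {
    int count = 0;
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        std::fprintf(stderr, "no CUDA device found\n");
        return 1;
    }
    for (int i = 0; i < count; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        std::printf("GPU %d: %s -> my_cuda_compute_model=%d%d\n",
                    i, prop.name, prop.major, prop.minor);
    }
    return 0;
}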
cmake/Modules/FindStb.cmake: 0 changes (mode changed: Executable file → Normal file)
pthreadVC2.dll: BIN (binary file not shown)
src/convolutional_kernels.cu

@@ -184,18 +184,18 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 if (l.align_bit_weights_gpu && !state.train && l.c >= 32)
 {
 //return;
-cudaError_t status = cudaSuccess;
-int input_size = l.c*l.h*l.w*l.batch;
+//cudaError_t status = cudaSuccess;
+//int input_size = l.c*l.h*l.w*l.batch;

 int m = l.n;
 int k = l.size*l.size*l.c;
 int n = l.out_w*l.out_h;
-float * a = l.weights_gpu;
+//float * a = l.weights_gpu;

 int ldb_align = l.lda_align;
 size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
-size_t t_intput_size = new_ldb * n;
-size_t t_bit_input_size = t_intput_size / 8;// +1;
+//size_t t_intput_size = new_ldb * n;
+//size_t t_bit_input_size = t_intput_size / 8;// +1;

 if (l.c % 32 == 0)
 {
@@ -208,8 +208,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)

 int ldb_align = l.lda_align;
 size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
-size_t t_intput_size = new_ldb * l.bit_align;// n;
-size_t t_bit_input_size = t_intput_size / 8;// +1;
+//size_t t_intput_size = new_ldb * l.bit_align;// n;
+//size_t t_bit_input_size = t_intput_size / 8;// +1;

 const int new_c = l.c / 32;

@@ -408,7 +408,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);

 #ifdef CUDNN
-float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT
+//float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT
 float alpha = 1, beta = 0;

 //#ifdef CUDNN_HALF
@@ -609,7 +609,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state)

 if(l.xnor) state.input = l.binary_input_gpu;
 #ifdef CUDNN
-float one = 1;
+float one = 1.f;
 float alpha = 1, beta = 0;

 //#ifdef CUDNN_HALF
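A note on the new_ldb = k + (ldb_align - k%ldb_align) pattern that survives above: it rounds k up to the next multiple of ldb_align, but pads a full extra stride when k is already aligned. The same always-pad idiom shows up later as x / BLOCK + 1 grid sizing. A standalone sketch of both round-up variants (plain C++, names are mine, not from the repo):

#include <cassert>
#include <cstddef>

// Round-up as written in the kernel code: always pads, even when k
// is already a multiple of align (k=64, align=32 -> 96, not 64).
static std::size_t round_up_darknet(std::size_t k, std::size_t align) {
    return k + (align - k % align);
}

// The usual exact round-up, for comparison.
static std::size_t round_up_exact(std::size_t k, std::size_t align) {
    return (k + align - 1) / align * align;
}

int main() {
    assert(round_up_darknet(60, 32) == 64);
    assert(round_up_exact(60, 32) == 64);
    assert(round_up_darknet(64, 32) == 96); // over-allocates one stride
    assert(round_up_exact(64, 32) == 64);
    return 0;
}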
src/gemm.c

@@ -900,7 +900,7 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA,
 {
 __m256i b256 = *((__m256i*)&B[s*ldb + j]);
 __m256i xor256 = _mm256_xor_si256(a256, b256); // xnor = xor(a,b)
-__m256i all_1 = _mm256_set1_epi8(255);
+__m256i all_1 = _mm256_set1_epi8((char)255);
 __m256i xnor256 = _mm256_andnot_si256(xor256, all_1); // xnor = not(xor(a,b))

 // waiting for - CPUID Flags: AVX512VPOPCNTDQ: __m512i _mm512_popcnt_epi32(__m512i a)
@@ -1162,7 +1162,7 @@ static inline int popcnt256_custom(__m256i n) {
 }

 static inline void xnor_avx2_popcnt(__m256i a_bit256, __m256i b_bit256, __m256i *count_sum) {
-__m256i c_bit256 = _mm256_set1_epi8(255);
+__m256i c_bit256 = _mm256_set1_epi8((char)255);

 __m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
 c_bit256 = _mm256_andnot_si256(xor256, c_bit256); // can be optimized - we can do other NOT for wegihts once and do not do this NOT
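The (char)255 casts fix a C++ narrowing complaint: _mm256_set1_epi8 takes a char, and a bare 255 literal is an int that does not fit a signed 8-bit type once these sources are compiled as C++ (see the set_source_files_properties(... LANGUAGE CXX) line in CMakeLists.txt). A self-contained sketch of the same XNOR-via-andnot idiom, not taken from the repo (assumes AVX2 and GCC/Clang builtins; compile with -mavx2):

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// Count matching bits of two 256-bit vectors: popcount(xnor(a, b)),
// the same trick gemm_nn_bin_32bit_packed uses above.
static inline int xnor_popcnt_256(__m256i a, __m256i b) {
    const __m256i all_1 = _mm256_set1_epi8((char)255);
    const __m256i xor_ab = _mm256_xor_si256(a, b);
    const __m256i xnor_ab = _mm256_andnot_si256(xor_ab, all_1); // ~(a^b)
    alignas(32) std::uint64_t lanes[4];
    _mm256_store_si256((__m256i*)lanes, xnor_ab);
    return __builtin_popcountll(lanes[0]) + __builtin_popcountll(lanes[1])
         + __builtin_popcountll(lanes[2]) + __builtin_popcountll(lanes[3]);
}

int main() {
    __m256i a = _mm256_set1_epi8(0x55);
    __m256i b = _mm256_set1_epi8(0x57);   // differs in 1 bit per byte
    std::printf("%d\n", xnor_popcnt_256(a, b)); // 256 - 32 = 224
    return 0;
}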
src/im2col_kernels.cu

@@ -228,8 +228,8 @@ __global__ void im2col_align_bin_gpu_kernel(const int n, const float* data_im,
 const int height_col, const int width_col,
 float *data_col, const int bit_align)
 {
-__shared__ float tmp_s[1];
-__shared__ ulonglong4 tmp256_s[1];
+//__shared__ float tmp_s[1];
+//__shared__ ulonglong4 tmp256_s[1];


 //#define SHRED_VALS ((BLOCK / 169) * )
@@ -414,7 +414,7 @@ __global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t s
 if ((index + i * 1024) < size) src_val = src[index + i*1024];
 else src_val = 0;
 //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
-const int num_of_warps = blockDim.x / WARP_SIZE;
+//const int num_of_warps = blockDim.x / WARP_SIZE;
 const int warp_id = threadIdx.x / WARP_SIZE;
 const int lane_id = threadIdx.x % WARP_SIZE;

@@ -436,7 +436,7 @@ void float_to_bit_gpu(float *src, unsigned char *dst, size_t size)
 }
 // --------------------------------

+/*
 __device__ __host__ static inline void remove_bit(unsigned char *const dst, size_t index) {
 size_t dst_i = index / 8;
 int dst_shift = index % 8;
@@ -449,6 +449,7 @@ __device__ __host__ static inline void set_bit(unsigned char *const dst, size_t
 dst[dst_i] |= 1 << dst_shift;
 //dst[dst_i] |= 1 << (8 - dst_shift);
 }
+*/

 __device__ __host__ static inline unsigned char get_bit(unsigned char const*const src, size_t index) {
 size_t src_i = index / 8;
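For reference, the set_bit/get_bit helpers being commented in and out above implement a plain bitmap over a byte array; this is how float activations get squeezed to one bit each (sign test, as in the commented __ballot_sync(0xffffffff, src_val > 0) line) before the XNOR GEMM. A host-side sketch; the helpers mirror the ones in the diff, while the packing loop is my illustration, not a quote of the kernel:

#include <cassert>
#include <cstddef>
#include <vector>

// Same layout as the CUDA helpers above: bit i lives in byte i/8, position i%8.
static inline void set_bit(unsigned char *dst, std::size_t index) {
    dst[index / 8] |= (unsigned char)(1u << (index % 8));
}
static inline unsigned char get_bit(const unsigned char *src, std::size_t index) {
    return (src[index / 8] >> (index % 8)) & 1u;
}

int main() {
    const float activations[] = { 0.5f, -1.0f, 3.0f, -0.1f, 2.0f };
    const std::size_t n = 5;
    std::vector<unsigned char> bits((n + 7) / 8, 0);
    for (std::size_t i = 0; i < n; ++i)
        if (activations[i] > 0) set_bit(bits.data(), i); // sign -> 1 bit
    assert(get_bit(bits.data(), 0) == 1);
    assert(get_bit(bits.data(), 1) == 0);
    assert(get_bit(bits.data(), 4) == 1);
    return 0;
}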
@@ -643,9 +644,9 @@ __global__ void transpose_bin_gpu_kernel_32(uint32_t *A, uint32_t *B, const int
 void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m,
 const int lda, const int ldb, const int block_size)
 {
-int size = n*m/ (8*8) + 1;
+//int size = n*m/ (8*8) + 1;
 int size32 = n*m / (32*32) + 1;
-const int num_blocks = size / BLOCK + 1;
+//const int num_blocks = size / BLOCK + 1;
 const int num_blocks32 = size32 / BLOCK_TRANSPOSE32 + 1;
 transpose_bin_gpu_kernel_32 << <num_blocks32, BLOCK_TRANSPOSE32, 0, get_cuda_stream() >> >((uint32_t *)A, (uint32_t *)B, n, m, lda, ldb, block_size);
 //transpose_bin_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(A, B, n, m, lda, ldb, block_size);
@@ -692,10 +693,10 @@ __global__ void transpose_uint32_kernel_2(uint32_t *src, uint32_t *dst, int src_
 //new_ldb - aligned (k) by 256

 const int src_w_align = src_w + (32 - src_w % 32);
-const int src_h_align = src_h + (32 - src_h % 32);
+//const int src_h_align = src_h + (32 - src_h % 32);

 const int warps_in_width = src_w_align / 32;
-const int warps_in_height = src_h_align / 32;
+//const int warps_in_height = src_h_align / 32;


@@ -789,7 +790,7 @@ void repack_input_gpu(float *input, float *re_packed_input, int w, int h, int c)
 // 256 channels -> 8 channels (with 32 floats)
 __global__ void repack_input_kernel_2(float *input, float *re_packed_input, int w, int h, int c)
 {
-__shared__ uint32_t tmp[33 * 32]; // 33x32 is misaligned 32 x 32 to avoid bank conflicts
+//__shared__ uint32_t tmp[33 * 32]; // 33x32 is misaligned 32 x 32 to avoid bank conflicts

 int index = blockIdx.x*blockDim.x + threadIdx.x;

@@ -943,15 +944,15 @@ void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size) {
 //typedef unsigned int uint32_t;
 //typedef unsigned char uint8_t;
 //typedef char int8_t;

+/*
 __device__ __host__ static inline uint64_t broadcast_bit_1_to_64(uint8_t src) {
 return (src > 0) ? 0xFFFFFFFFFFFFFFFF : 0;
 }
+*/

 __device__ __host__ static inline uint8_t xnor_bit1(uint8_t a, uint8_t b) {
 return ~(a^b) & 0b1;
 }

+/*
 __device__ __host__ static inline uint32_t xnor_int32(uint32_t a, uint32_t b) {
 return ~(a^b);
 }
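Why XNOR plus popcount can replace multiply-accumulate in these kernels: with weights and inputs constrained to +/-1 and encoded as bits (1 for +1, 0 for -1), each product is +1 on matching bits and -1 otherwise, so a K-element dot product is 2*popcount(xnor(a,b)) - K. A scalar sketch built on the xnor_bit1 helper above (the loop and names beyond xnor_bit1 are mine):

#include <cassert>
#include <cstdint>

static inline std::uint8_t xnor_bit1(std::uint8_t a, std::uint8_t b) {
    return ~(a ^ b) & 0b1;
}

// Dot product of two K-element +/-1 vectors stored one bit per element.
static int binary_dot(std::uint32_t a, std::uint32_t b, int K) {
    int matches = 0;
    for (int i = 0; i < K; ++i)
        matches += xnor_bit1((std::uint8_t)(a >> i), (std::uint8_t)(b >> i));
    return 2 * matches - K; // +1 per match, -1 per mismatch
}

int main() {
    // a = (+1,+1,-1,+1), b = (+1,-1,-1,+1); bit 0 is the first element
    std::uint32_t a = 0b1011, b = 0b1001;
    assert(binary_dot(a, b, 4) == 2); // 3 matches, 1 mismatch
    return 0;
}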
@@ -977,13 +978,13 @@ __device__ __host__ static inline ulonglong4 xnor_int256(ulonglong4 a, ulonglong
 res.z = ~(a.z^b.z);
 return res;
 }
+*/
 //-------

+/*
 __device__ __host__ static inline uint8_t xor_bit1(uint8_t a, uint8_t b) {
 return (a^b) & 0b1;
 }
+*/
 __device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) {
 return (a^b);
 }
@@ -991,7 +992,7 @@ __device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) {
 __device__ __host__ static inline uint64_t xor_int64(uint64_t a, uint64_t b) {
 return (a^b);
 }

+/*
 __device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) {
 uint4 res;
 res.w = (a.w^b.w);
@@ -1000,7 +1001,7 @@ __device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) {
 res.z = (a.z^b.z);
 return res;
 }
+*/
 __device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4 b) {
 ulonglong4 res;
 res.w = (a.w^b.w);
@@ -1010,12 +1011,11 @@ __device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4
 return res;
 }

-/*
 __device__ static inline int popcnt_256(ulonglong4 a) {
 return __popcll(a.w) + __popcll(a.x) + __popcll(a.y) + __popcll(a.z);
 }

 /*
 __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
 unsigned char *A, int lda,
 unsigned char *B, int ldb,
@@ -1269,7 +1269,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
 */

-int i, j, k, h;
+int i, j, k;//, h;
 // 47% = 29 + 10 + 8
 j = global_warp_id % (N_aligned / WMMA_Nx2);
 j = j * WMMA_Nx2;
@@ -1277,7 +1277,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
 i = global_warp_id / (N_aligned / WMMA_Nx2);
 i = i * WMMA_M;

-int count = 0;
+//int count = 0;
 k = 0;

 if (i < M) //if (i < M) // l.n - filters [16 - 55 - 1024]
@@ -1323,7 +1323,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
 // Custom XOR-GEMM
 int k_d = lane_id % 4;
 int i_d = lane_id / 4;
-int j_d = lane_id / 4;
+//int j_d = lane_id / 4;

 int32_t accum_c_val[8*2]; // wmma::fill_fragment(c_frag, 0);
 for (int local_j = 0; local_j < 8*2; ++local_j) {
@@ -1333,9 +1333,9 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
 // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
 for (; k < K; k += 128) // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
 {
-int64_t A_cur_index = (i*lda + k) / 8;
+//int64_t A_cur_index = (i*lda + k) / 8;
 //int64_t A_cur_index = (local_i*lda + k) / 8;
-int64_t B_cur_index = (j*ldb + k) / 8;
+//int64_t B_cur_index = (j*ldb + k) / 8;

 // lda, ldb - are in bits
 // 8*4 = 32
@@ -1593,7 +1593,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
 }
 __syncthreads();

-int i, j, k, h;
+int i, j, k; //, h;
 // 47% = 29 + 10 + 8
 j = index % N;
 { // out_h*out_w - one channel output size [169 - 173056]
@@ -2115,7 +2115,7 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
 for (chan = 0; chan < in_c; ++chan)
 {
 //int const weights_pre_index = fil*in_c*size*size + chan*size*size;
-int const weights_pre_index = fil*new_lda + chan*size*size;
+//int const weights_pre_index = fil*new_lda + chan*size*size;
 int const input_pre_index = chan*in_w*in_h;

 __shared__ uint32_t input_shared[416*416/32 + 1]; // 21.2 KB bytes (for input size 832x832)
@@ -2139,8 +2139,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
 }
 __syncthreads();
 */
-int src_index = -1;
-uint32_t input_byte;
+//int src_index = -1;
+//uint32_t input_byte;

 if (fil < n) // (1-6 for one BLOCK)
 {
@@ -2154,8 +2154,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
 int input_x = x + f_x - pad;
 if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;

-int input_index = input_pre_index + input_y*in_w + input_x;
-int weights_index = weights_pre_index + f_y*size + f_x;
+//int input_index = input_pre_index + input_y*in_w + input_x;
+//int weights_index = weights_pre_index + f_y*size + f_x;
 //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
 //int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;