diff --git a/.gitignore b/.gitignore
index b415cea3..68a7a13b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,12 @@
 *.o
+*.a
 *.dSYM
 *.csv
 *.out
 *.png
 *.so
 *.exe
+*.dll
 mnist/
 data/
 caffe/
@@ -28,7 +30,10 @@ Icon?
 Thumbs.db
 *.swp
 
+# IDE generated #
+.vs/
+.vscode/
+
 lib/
 include/
 share/
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5668b934..f22ba83a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.4)
+cmake_minimum_required(VERSION 3.8)
 
 set(Darknet_MAJOR_VERSION 1)
 set(Darknet_MINOR_VERSION 0)
@@ -174,14 +174,14 @@ if(ENABLE_CUDA)
       include_directories(${ZED_INCLUDE_DIRS})
       link_directories(${ZED_LIBRARY_DIR})
       add_definitions(-DZED_STEREO)
-      message("ZED Camera support enabled")
+      message(STATUS "ZED SDK enabled")
     else()
-      message("ZED SDK not found !")
+      message(STATUS "ZED SDK not found")
       set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE)
     endif()
   endif()
 else()
-  message(WARNING "ZED SDK requires CUDA !")
+  message(STATUS "ZED SDK not enabled, since it requires CUDA")
   set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE)
 endif()
 
@@ -247,7 +247,7 @@ endif()
 
 set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
 
-add_library(darklib SHARED ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources})
+add_library(darklib ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources})
 set_target_properties(darklib PROPERTIES POSITION_INDEPENDENT_CODE ON)
 if(ENABLE_CUDA)
   set_target_properties(darklib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
@@ -323,21 +323,19 @@ endif()
 
 set_target_properties(darklib PROPERTIES PUBLIC_HEADER "${exported_headers};${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp")
 
-install(TARGETS darklib uselib darknet EXPORT DarknetTargets
+install(TARGETS darklib EXPORT DarknetTargets
   RUNTIME DESTINATION "${INSTALL_BIN_DIR}"
   LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
   ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
   PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}"
   COMPONENT dev
 )
+install(TARGETS uselib darknet
+  DESTINATION "${INSTALL_BIN_DIR}"
+)
 if(OpenCV_VERSION VERSION_GREATER "3.0" AND NOT SKIP_USELIB_TRACK)
   install(TARGETS uselib_track
-    EXPORT DarknetTargets
-    RUNTIME DESTINATION "${INSTALL_BIN_DIR}"
-    LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
-    ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
-    PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}"
-    COMPONENT dev
+    DESTINATION "${INSTALL_BIN_DIR}"
   )
 endif()
 
diff --git a/build.ps1 b/build.ps1
index 7063098c..0fd60a93 100755
--- a/build.ps1
+++ b/build.ps1
@@ -1,7 +1,7 @@
 #!/usr/bin/env pwsh
 
 $number_of_build_workers=8
-#$shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
+$create_shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
 $force_using_include_libs=$false
 
 #$my_cuda_compute_model=75    #Compute capability for Tesla T4, RTX 2080
@@ -88,31 +88,37 @@ if ($vcpkg_path) {
   # DEBUG
   New-Item -Path .\build_win_debug -ItemType directory -Force
   Set-Location build_win_debug
-  cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Debug" $shared_lib $additional_build_setup ..
-  cmake --build . --config Debug --parallel ${number_of_build_workers} --target install
+  cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Debug" $create_shared_lib $additional_build_setup ..
+  cmake --build . --config Debug --target install
+  #cmake --build . --config Debug --parallel ${number_of_build_workers} --target install  #valid only for CMake 3.12+
   Remove-Item DarknetConfig.cmake
   Remove-Item DarknetConfigVersion.cmake
   Set-Location ..
+  Copy-Item cmake\Modules\*.cmake share\darknet\
 
   # RELEASE
   New-Item -Path .\build_win_release -ItemType directory -Force
   Set-Location build_win_release
-  cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Release" $shared_lib $additional_build_setup ..
-  cmake --build . --config Release --parallel ${number_of_build_workers} --target install
+  cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Release" $create_shared_lib $additional_build_setup ..
+  cmake --build . --config Release --target install
+  #cmake --build . --config Release --parallel ${number_of_build_workers} --target install  #valid only for CMake 3.12+
   Remove-Item DarknetConfig.cmake
   Remove-Item DarknetConfigVersion.cmake
   Copy-Item *.dll ..
   Set-Location ..
+  Copy-Item cmake\Modules\*.cmake share\darknet\
 }
 else {
   # USE LOCAL PTHREAD LIB AND LOCAL STB HEADER, NO VCPKG, ONLY RELEASE MODE SUPPORTED
   # if you want to manually force this case, remove VCPKG_ROOT env variable and remember to use "vcpkg integrate remove" in case you had enabled user-wide vcpkg integration
   New-Item -Path .\build_win_release_novcpkg -ItemType directory -Force
   Set-Location build_win_release_novcpkg
-  cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" $shared_lib $additional_build_setup ..
-  cmake --build . --config Release --parallel ${number_of_build_workers} --target install
+  cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" $create_shared_lib $additional_build_setup ..
+  cmake --build . --config Release --target install
+  #cmake --build . --config Release --parallel ${number_of_build_workers} --target install  #valid only for CMake 3.12+
   Remove-Item DarknetConfig.cmake
   Remove-Item DarknetConfigVersion.cmake
   Copy-Item ..\3rdparty\pthreads\bin\pthreadVC2.dll ..
   Set-Location ..
+  Copy-Item cmake\Modules\*.cmake share\darknet\
 }
diff --git a/build.sh b/build.sh
index 7ff54d91..f74351d1 100755
--- a/build.sh
+++ b/build.sh
@@ -1,6 +1,19 @@
 #!/usr/bin/env bash
 
 number_of_build_workers=8
+create_shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
+
+#my_cuda_compute_model=75    #Compute capability for Tesla T4, RTX 2080
+#my_cuda_compute_model=72    #Compute capability for Jetson Xavier
+#my_cuda_compute_model=70    #Compute capability for Tesla V100
+#my_cuda_compute_model=62    #Compute capability for Jetson TX2
+#my_cuda_compute_model=61    #Compute capability for Tesla P40
+#my_cuda_compute_model=60    #Compute capability for Tesla P100
+#my_cuda_compute_model=53    #Compute capability for Jetson TX1
+#my_cuda_compute_model=52    #Compute capability for Tesla M40/M60
+#my_cuda_compute_model=37    #Compute capability for Tesla K80
+#my_cuda_compute_model=35    #Compute capability for Tesla K20/K40
+#my_cuda_compute_model=30    #Compute capability for Tesla K10, Quadro K4000
 
 if [[ "$OSTYPE" == "darwin"* ]]; then
   OpenCV_DIR="/usr/local/Cellar/opencv@3/3.4.5"
@@ -11,11 +24,14 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
   fi
 fi
 
+if [[ ! -z "$my_cuda_compute_model" ]]; then
+  additional_build_setup="-DCUDA_COMPUTE_MODEL=${my_cuda_compute_model}"
+fi
 
 # RELEASE
 mkdir -p build_release
 cd build_release
-cmake .. -DCMAKE_BUILD_TYPE=Release ${additional_defines}
+cmake .. -DCMAKE_BUILD_TYPE=Release ${additional_defines} ${create_shared_lib} ${additional_build_setup}
 cmake --build . --target install -- -j${number_of_build_workers}
 #cmake --build . --target install --parallel ${number_of_build_workers}  #valid only for CMake 3.12+
 rm -f DarknetConfig.cmake
@@ -26,7 +42,7 @@ cp cmake/Modules/*.cmake share/darknet
 # DEBUG
 mkdir -p build_debug
 cd build_debug
-cmake .. -DCMAKE_BUILD_TYPE=Debug ${additional_defines}
+cmake .. -DCMAKE_BUILD_TYPE=Debug ${additional_defines} ${create_shared_lib} ${additional_build_setup}
 cmake --build . --target install -- -j${number_of_build_workers}
 #cmake --build . --target install --parallel ${number_of_build_workers}  #valid only for CMake 3.12+
 rm -f DarknetConfig.cmake
diff --git a/cmake/Modules/FindStb.cmake b/cmake/Modules/FindStb.cmake
old mode 100755
new mode 100644
diff --git a/pthreadVC2.dll b/pthreadVC2.dll
deleted file mode 100644
index 165b4d26..00000000
Binary files a/pthreadVC2.dll and /dev/null differ
diff --git a/src/convolutional_kernels.cu b/src/convolutional_kernels.cu
index 8ae99710..cfa4fe7e 100644
--- a/src/convolutional_kernels.cu
+++ b/src/convolutional_kernels.cu
@@ -184,18 +184,18 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
         if (l.align_bit_weights_gpu && !state.train && l.c >= 32)
         {
             //return;
-            cudaError_t status = cudaSuccess;
-            int input_size = l.c*l.h*l.w*l.batch;
+            //cudaError_t status = cudaSuccess;
+            //int input_size = l.c*l.h*l.w*l.batch;
 
             int m = l.n;
             int k = l.size*l.size*l.c;
             int n = l.out_w*l.out_h;
-            float * a = l.weights_gpu;
+            //float * a = l.weights_gpu;
 
             int ldb_align = l.lda_align;
             size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
-            size_t t_intput_size = new_ldb * n;
-            size_t t_bit_input_size = t_intput_size / 8;// +1;
+            //size_t t_intput_size = new_ldb * n;
+            //size_t t_bit_input_size = t_intput_size / 8;// +1;
 
             if (l.c % 32 == 0)
             {
@@ -208,8 +208,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
 
                 int ldb_align = l.lda_align;
                 size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
-                size_t t_intput_size = new_ldb * l.bit_align;// n;
-                size_t t_bit_input_size = t_intput_size / 8;// +1;
+                //size_t t_intput_size = new_ldb * l.bit_align;// n;
+                //size_t t_bit_input_size = t_intput_size / 8;// +1;
 
                 const int new_c = l.c / 32;
 
@@ -408,7 +408,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
     //fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
 
 #ifdef CUDNN
-    float one = 1;    // alpha[0], beta[0] is float for HALF and FLOAT
+    //float one = 1;    // alpha[0], beta[0] is float for HALF and FLOAT
     float alpha = 1, beta = 0;
 
 //#ifdef CUDNN_HALF
@@ -609,7 +609,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
 
     if(l.xnor) state.input = l.binary_input_gpu;
 #ifdef CUDNN
-    float one = 1;
+    float one = 1.f;
     float alpha = 1, beta = 0;
 
 //#ifdef CUDNN_HALF
diff --git a/src/gemm.c b/src/gemm.c
index 861a1906..336f3bdd 100644
--- a/src/gemm.c
+++ b/src/gemm.c
@@ -900,7 +900,7 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA,
             {
                 __m256i b256 = *((__m256i*)&B[s*ldb + j]);
                 __m256i xor256 = _mm256_xor_si256(a256, b256);  // xnor = xor(a,b)
-                __m256i all_1 = _mm256_set1_epi8(255);
+                __m256i all_1 = _mm256_set1_epi8((char)255);
                 __m256i xnor256 = _mm256_andnot_si256(xor256, all_1); // xnor = not(xor(a,b))
 
                 // waiting for - CPUID Flags: AVX512VPOPCNTDQ: __m512i _mm512_popcnt_epi32(__m512i a)
@@ -1162,7 +1162,7 @@ static inline int popcnt256_custom(__m256i n) {
 }
 
 static inline void xnor_avx2_popcnt(__m256i a_bit256, __m256i b_bit256, __m256i *count_sum) {
-    __m256i c_bit256 = _mm256_set1_epi8(255);
+    __m256i c_bit256 = _mm256_set1_epi8((char)255);
 
     __m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256);  // xnor = not(xor(a,b))
     c_bit256 = _mm256_andnot_si256(xor256, c_bit256);  // can be optimized - we can do other NOT for wegihts once and do not do this NOT
diff --git a/src/im2col_kernels.cu b/src/im2col_kernels.cu
index 539824a9..954a4694 100644
--- a/src/im2col_kernels.cu
+++ b/src/im2col_kernels.cu
@@ -228,8 +228,8 @@ __global__ void im2col_align_bin_gpu_kernel(const int n, const float* data_im,
     const int height_col, const int width_col,
     float *data_col, const int bit_align)
 {
-    __shared__ float tmp_s[1];
-    __shared__ ulonglong4 tmp256_s[1];
+    //__shared__ float tmp_s[1];
+    //__shared__ ulonglong4 tmp256_s[1];
 
 
     //#define SHRED_VALS ((BLOCK / 169) * )
@@ -414,7 +414,7 @@ __global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t s
         if ((index + i * 1024) < size) src_val = src[index + i*1024];
         else src_val = 0;
         //unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
-        const int num_of_warps = blockDim.x / WARP_SIZE;
+        //const int num_of_warps = blockDim.x / WARP_SIZE;
         const int warp_id = threadIdx.x / WARP_SIZE;
         const int lane_id = threadIdx.x % WARP_SIZE;
 
@@ -436,7 +436,7 @@ void float_to_bit_gpu(float *src, unsigned char *dst, size_t size)
 }
 // --------------------------------
 
-
+/*
 __device__ __host__ static inline void remove_bit(unsigned char *const dst, size_t index) {
     size_t dst_i = index / 8;
     int dst_shift = index % 8;
@@ -449,6 +449,7 @@ __device__ __host__ static inline void set_bit(unsigned char *const dst, size_t
     dst[dst_i] |= 1 << dst_shift;
     //dst[dst_i] |= 1 << (8 - dst_shift);
 }
+*/
 
 __device__ __host__ static inline unsigned char get_bit(unsigned char const*const src, size_t index) {
     size_t src_i = index / 8;
@@ -643,9 +644,9 @@ __global__ void transpose_bin_gpu_kernel_32(uint32_t *A, uint32_t *B, const int
 void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m,
     const int lda, const int ldb, const int block_size)
 {
-    int size = n*m/ (8*8) + 1;
+    //int size = n*m/ (8*8) + 1;
     int size32 = n*m / (32*32) + 1;
-    const int num_blocks = size / BLOCK + 1;
+    //const int num_blocks = size / BLOCK + 1;
     const int num_blocks32 = size32 / BLOCK_TRANSPOSE32 + 1;
     transpose_bin_gpu_kernel_32 << <num_blocks32, BLOCK_TRANSPOSE32, 0, get_cuda_stream() >> >((uint32_t *)A, (uint32_t *)B, n, m, lda, ldb, block_size);
     //transpose_bin_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(A, B, n, m, lda, ldb, block_size);
@@ -692,10 +693,10 @@ __global__ void transpose_uint32_kernel_2(uint32_t *src, uint32_t *dst, int src_
     //new_ldb - aligned (k) by 256
 
     const int src_w_align = src_w + (32 - src_w % 32);
-    const int src_h_align = src_h + (32 - src_h % 32);
+    //const int src_h_align = src_h + (32 - src_h % 32);
 
     const int warps_in_width = src_w_align / 32;
-    const int warps_in_height = src_h_align / 32;
+    //const int warps_in_height = src_h_align / 32;
 
 
 
@@ -789,7 +790,7 @@ void repack_input_gpu(float *input, float *re_packed_input, int w, int h, int c)
 // 256 channels -> 8 channels (with 32 floats)
 __global__ void repack_input_kernel_2(float *input, float *re_packed_input, int w, int h, int c)
 {
-    __shared__ uint32_t tmp[33 * 32];  // 33x32 is misaligned 32 x 32 to avoid bank conflicts
+    //__shared__ uint32_t tmp[33 * 32];  // 33x32 is misaligned 32 x 32 to avoid bank conflicts
 
     int index = blockIdx.x*blockDim.x + threadIdx.x;
 
@@ -943,15 +944,15 @@ void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size) {
 //typedef unsigned int uint32_t;
 //typedef unsigned char uint8_t;
 //typedef char int8_t;
-
+/*
 __device__ __host__ static inline uint64_t broadcast_bit_1_to_64(uint8_t src) {
     return (src > 0) ? 0xFFFFFFFFFFFFFFFF : 0;
 }
-
+*/
 __device__ __host__ static inline uint8_t xnor_bit1(uint8_t a, uint8_t b) {
     return ~(a^b) & 0b1;
 }
-
+/*
 __device__ __host__ static inline uint32_t xnor_int32(uint32_t a, uint32_t b) {
     return ~(a^b);
 }
@@ -977,13 +978,13 @@ __device__ __host__ static inline ulonglong4 xnor_int256(ulonglong4 a, ulonglong
     res.z = ~(a.z^b.z);
     return res;
 }
-
+*/
 //-------
-
+/*
 __device__ __host__ static inline uint8_t xor_bit1(uint8_t a, uint8_t b) {
     return (a^b) & 0b1;
 }
-
+*/
 __device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) {
     return (a^b);
 }
@@ -991,7 +992,7 @@ __device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) {
 __device__ __host__ static inline uint64_t xor_int64(uint64_t a, uint64_t b) {
     return (a^b);
 }
-
+/*
 __device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) {
     uint4 res;
     res.w = (a.w^b.w);
@@ -1000,7 +1001,7 @@ __device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) {
     res.z = (a.z^b.z);
     return res;
 }
-
+*/
 __device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4 b) {
     ulonglong4 res;
     res.w = (a.w^b.w);
@@ -1010,12 +1011,11 @@ __device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4
     return res;
 }
 
-
+/*
 __device__ static inline int popcnt_256(ulonglong4 a) {
     return __popcll(a.w) + __popcll(a.x) + __popcll(a.y) + __popcll(a.z);
 }
 
-/*
 __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
     unsigned char *A, int lda,
     unsigned char *B, int ldb,
@@ -1269,7 +1269,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
     */
 
 
-    int i, j, k, h;
+    int i, j, k;//, h;
     // 47% = 29 + 10 + 8
     j = global_warp_id % (N_aligned / WMMA_Nx2);
     j = j * WMMA_Nx2;
@@ -1277,7 +1277,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
         i = global_warp_id / (N_aligned / WMMA_Nx2);
         i = i * WMMA_M;
 
-        int count = 0;
+        //int count = 0;
         k = 0;
 
         if (i < M)  //if (i < M)  // l.n - filters [16 - 55 - 1024]
@@ -1323,7 +1323,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
             // Custom XOR-GEMM
             int k_d = lane_id % 4;
             int i_d = lane_id / 4;
-            int j_d = lane_id / 4;
+            //int j_d = lane_id / 4;
 
             int32_t accum_c_val[8*2]; // wmma::fill_fragment(c_frag, 0);
             for (int local_j = 0; local_j < 8*2; ++local_j) {
@@ -1333,9 +1333,9 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
             // 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
             for (; k < K; k += 128)  // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
             {
-                int64_t A_cur_index = (i*lda + k) / 8;
+                //int64_t A_cur_index = (i*lda + k) / 8;
                 //int64_t A_cur_index = (local_i*lda + k) / 8;
-                int64_t B_cur_index = (j*ldb + k) / 8;
+                //int64_t B_cur_index = (j*ldb + k) / 8;
 
                 // lda, ldb - are in bits
                 // 8*4 = 32
@@ -1593,7 +1593,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
     }
     __syncthreads();
 
-    int i, j, k, h;
+    int i, j, k; //, h;
     // 47% = 29 + 10 + 8
     j = index % N;
     {    // out_h*out_w - one channel output size [169 - 173056]
@@ -2115,7 +2115,7 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
         for (chan = 0; chan < in_c; ++chan)
         {
             //int const weights_pre_index = fil*in_c*size*size + chan*size*size;
-            int const weights_pre_index = fil*new_lda + chan*size*size;
+            //int const weights_pre_index = fil*new_lda + chan*size*size;
             int const input_pre_index = chan*in_w*in_h;
 
             __shared__ uint32_t input_shared[416*416/32 + 1];   // 21.2 KB bytes (for input size 832x832)
@@ -2139,8 +2139,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
             }
             __syncthreads();
             */
-            int src_index = -1;
-            uint32_t input_byte;
+            //int src_index = -1;
+            //uint32_t input_byte;
 
             if (fil < n)    // (1-6 for one BLOCK)
             {
@@ -2154,8 +2154,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
                         int input_x = x + f_x - pad;
                         if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;
 
-                        int input_index = input_pre_index + input_y*in_w + input_x;
-                        int weights_index = weights_pre_index + f_y*size + f_x;
+                        //int input_index = input_pre_index + input_y*in_w + input_x;
+                        //int weights_index = weights_pre_index + f_y*size + f_x;
                         //int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
                         //int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;