cleanup in preparation for opencv-4 work

Stefano Sinigardi
2019-03-28 17:54:27 +01:00
parent f5fd086407
commit da74882fe1
9 changed files with 88 additions and 63 deletions

.gitignore

@@ -1,10 +1,12 @@
*.o
*.a
*.dSYM
*.csv
*.out
*.png
*.so
*.exe
*.dll
mnist/
data/
caffe/
@@ -28,7 +30,10 @@ Icon?
Thumbs.db
*.swp
# IDE generated #
.vs/
.vscode/
lib/
include/
share/

CMakeLists.txt

@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.4)
cmake_minimum_required(VERSION 3.8)
set(Darknet_MAJOR_VERSION 1)
set(Darknet_MINOR_VERSION 0)
@@ -174,14 +174,14 @@ if(ENABLE_CUDA)
include_directories(${ZED_INCLUDE_DIRS})
link_directories(${ZED_LIBRARY_DIR})
add_definitions(-DZED_STEREO)
message("ZED Camera support enabled")
message(STATUS "ZED SDK enabled")
else()
message("ZED SDK not found !")
message(STATUS "ZED SDK not found")
set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE)
endif()
endif()
else()
message(WARNING "ZED SDK requires CUDA !")
message(STATUS "ZED SDK not enabled, since it requires CUDA")
set(ENABLE_ZED_CAMERA "FALSE" CACHE BOOL "Enable ZED Camera support" FORCE)
endif()
@@ -247,7 +247,7 @@ endif()
set_source_files_properties(${sources} PROPERTIES LANGUAGE CXX)
add_library(darklib SHARED ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources})
add_library(darklib ${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp ${CMAKE_CURRENT_LIST_DIR}/src/yolo_v2_class.cpp ${sources} ${headers} ${cuda_sources})
set_target_properties(darklib PROPERTIES POSITION_INDEPENDENT_CODE ON)
if(ENABLE_CUDA)
set_target_properties(darklib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
@@ -323,21 +323,19 @@ endif()
set_target_properties(darklib PROPERTIES PUBLIC_HEADER "${exported_headers};${CMAKE_CURRENT_LIST_DIR}/include/yolo_v2_class.hpp")
install(TARGETS darklib uselib darknet EXPORT DarknetTargets
install(TARGETS darklib EXPORT DarknetTargets
RUNTIME DESTINATION "${INSTALL_BIN_DIR}"
LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}"
COMPONENT dev
)
install(TARGETS uselib darknet
DESTINATION "${INSTALL_BIN_DIR}"
)
if(OpenCV_VERSION VERSION_GREATER "3.0" AND NOT SKIP_USELIB_TRACK)
install(TARGETS uselib_track
EXPORT DarknetTargets
RUNTIME DESTINATION "${INSTALL_BIN_DIR}"
LIBRARY DESTINATION "${INSTALL_LIB_DIR}"
ARCHIVE DESTINATION "${INSTALL_LIB_DIR}"
PUBLIC_HEADER DESTINATION "${INSTALL_INCLUDE_DIR}"
COMPONENT dev
DESTINATION "${INSTALL_BIN_DIR}"
)
endif()

build.ps1

@@ -1,7 +1,7 @@
#!/usr/bin/env pwsh
$number_of_build_workers=8
#$shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
$create_shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
$force_using_include_libs=$false
#$my_cuda_compute_model=75 #Compute capability for Tesla T4, RTX 2080
@@ -88,31 +88,37 @@ if ($vcpkg_path) {
# DEBUG
New-Item -Path .\build_win_debug -ItemType directory -Force
Set-Location build_win_debug
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Debug" $shared_lib $additional_build_setup ..
cmake --build . --config Debug --parallel ${number_of_build_workers} --target install
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Debug" $create_shared_lib $additional_build_setup ..
cmake --build . --config Debug --target install
#cmake --build . --config Debug --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+
Remove-Item DarknetConfig.cmake
Remove-Item DarknetConfigVersion.cmake
Set-Location ..
Copy-Item cmake\Modules\*.cmake share\darknet\
# RELEASE
New-Item -Path .\build_win_release -ItemType directory -Force
Set-Location build_win_release
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Release" $shared_lib $additional_build_setup ..
cmake --build . --config Release --parallel ${number_of_build_workers} --target install
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" "-DCMAKE_TOOLCHAIN_FILE=$vcpkg_path\scripts\buildsystems\vcpkg.cmake" "-DVCPKG_TARGET_TRIPLET=$vcpkg_triplet" "-DCMAKE_BUILD_TYPE=Release" $create_shared_lib $additional_build_setup ..
cmake --build . --config Release --target install
#cmake --build . --config Release --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+
Remove-Item DarknetConfig.cmake
Remove-Item DarknetConfigVersion.cmake
Copy-Item *.dll ..
Set-Location ..
Copy-Item cmake\Modules\*.cmake share\darknet\
}
else {
# USE LOCAL PTHREAD LIB AND LOCAL STB HEADER, NO VCPKG, ONLY RELEASE MODE SUPPORTED
# if you want to manually force this case, remove VCPKG_ROOT env variable and remember to use "vcpkg integrate remove" in case you had enabled user-wide vcpkg integration
New-Item -Path .\build_win_release_novcpkg -ItemType directory -Force
Set-Location build_win_release_novcpkg
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" $shared_lib $additional_build_setup ..
cmake --build . --config Release --parallel ${number_of_build_workers} --target install
cmake -G "Visual Studio 15 2017" -T "host=x64" -A "x64" $create_shared_lib $additional_build_setup ..
cmake --build . --config Release --target install
#cmake --build . --config Release --parallel ${number_of_build_workers} --target install #valid only for CMake 3.12+
Remove-Item DarknetConfig.cmake
Remove-Item DarknetConfigVersion.cmake
Copy-Item ..\3rdparty\pthreads\bin\pthreadVC2.dll ..
Set-Location ..
Copy-Item cmake\Modules\*.cmake share\darknet\
}

build.sh

@@ -1,6 +1,19 @@
#!/usr/bin/env bash
number_of_build_workers=8
create_shared_lib="-DBUILD_SHARED_LIBS:BOOL=ON"
#my_cuda_compute_model=75 #Compute capability for Tesla T4, RTX 2080
#my_cuda_compute_model=72 #Compute capability for Jetson Xavier
#my_cuda_compute_model=70 #Compute capability for Tesla V100
#my_cuda_compute_model=62 #Compute capability for Jetson TX2
#my_cuda_compute_model=61 #Compute capability for Tesla P40
#my_cuda_compute_model=60 #Compute capability for Tesla P100
#my_cuda_compute_model=53 #Compute capability for Jetson TX1
#my_cuda_compute_model=52 #Compute capability for Tesla M40/M60
#my_cuda_compute_model=37 #Compute capability for Tesla K80
#my_cuda_compute_model=35 #Compute capability for Tesla K20/K40
#my_cuda_compute_model=30 #Compute capability for Tesla K10, Quadro K4000
if [[ "$OSTYPE" == "darwin"* ]]; then
OpenCV_DIR="/usr/local/Cellar/opencv@3/3.4.5"
@@ -11,11 +24,14 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
fi
fi
if [[ ! -z "$my_cuda_compute_model" ]]; then
additional_build_setup="-DCUDA_COMPUTE_MODEL=${my_cuda_compute_model}"
fi
# RELEASE
mkdir -p build_release
cd build_release
cmake .. -DCMAKE_BUILD_TYPE=Release ${additional_defines}
cmake .. -DCMAKE_BUILD_TYPE=Release ${additional_defines} ${create_shared_lib} ${additional_build_setup}
cmake --build . --target install -- -j${number_of_build_workers}
#cmake --build . --target install --parallel ${number_of_build_workers} #valid only for CMake 3.12+
rm -f DarknetConfig.cmake
@@ -26,7 +42,7 @@ cp cmake/Modules/*.cmake share/darknet
# DEBUG
mkdir -p build_debug
cd build_debug
cmake .. -DCMAKE_BUILD_TYPE=Debug ${additional_defines}
cmake .. -DCMAKE_BUILD_TYPE=Debug ${additional_defines} ${create_shared_lib} ${additional_build_setup}
cmake --build . --target install -- -j${number_of_build_workers}
#cmake --build . --target install --parallel ${number_of_build_workers} #valid only for CMake 3.12+
rm -f DarknetConfig.cmake

cmake/Modules/FindStb.cmake Executable file → Normal file

Binary file not shown.

src/convolutional_kernels.cu

@@ -184,18 +184,18 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
if (l.align_bit_weights_gpu && !state.train && l.c >= 32)
{
//return;
cudaError_t status = cudaSuccess;
int input_size = l.c*l.h*l.w*l.batch;
//cudaError_t status = cudaSuccess;
//int input_size = l.c*l.h*l.w*l.batch;
int m = l.n;
int k = l.size*l.size*l.c;
int n = l.out_w*l.out_h;
float * a = l.weights_gpu;
//float * a = l.weights_gpu;
int ldb_align = l.lda_align;
size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
size_t t_intput_size = new_ldb * n;
size_t t_bit_input_size = t_intput_size / 8;// +1;
//size_t t_intput_size = new_ldb * n;
//size_t t_bit_input_size = t_intput_size / 8;// +1;
if (l.c % 32 == 0)
{
@@ -208,8 +208,8 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
int ldb_align = l.lda_align;
size_t new_ldb = k + (ldb_align - k%ldb_align); // (k / 8 + 1) * 8;
size_t t_intput_size = new_ldb * l.bit_align;// n;
size_t t_bit_input_size = t_intput_size / 8;// +1;
//size_t t_intput_size = new_ldb * l.bit_align;// n;
//size_t t_bit_input_size = t_intput_size / 8;// +1;
const int new_c = l.c / 32;
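Both hunks above keep darknet's leading-dimension round-up, new_ldb = k + (ldb_align - k % ldb_align). Note that this form always adds at least one alignment block: when k is already a multiple of ldb_align it yields k + ldb_align, not k. That is harmless for sizing the transposed-input buffer, but it differs from the exact round-up, sketched below in plain C (helper name is illustrative, not from darknet):

#include <stddef.h>

/* Exact round-up of k to the next multiple of align (align > 0). */
static size_t round_up(size_t k, size_t align) {
    return (k + align - 1) / align * align;
}

/* darknet's variant, k + (align - k % align), returns k + align when
   k % align == 0, i.e. it over-allocates by exactly one block. */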
@@ -408,7 +408,7 @@ void forward_convolutional_layer_gpu(convolutional_layer l, network_state state)
//fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);
#ifdef CUDNN
float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT
//float one = 1; // alpha[0], beta[0] is float for HALF and FLOAT
float alpha = 1, beta = 0;
//#ifdef CUDNN_HALF
@@ -609,7 +609,7 @@ void backward_convolutional_layer_gpu(convolutional_layer l, network_state state
if(l.xnor) state.input = l.binary_input_gpu;
#ifdef CUDNN
float one = 1;
float one = 1.f;
float alpha = 1, beta = 0;
//#ifdef CUDNN_HALF

src/gemm.c

@@ -900,7 +900,7 @@ void gemm_nn_bin_32bit_packed(int M, int N, int K, float ALPHA,
{
__m256i b256 = *((__m256i*)&B[s*ldb + j]);
__m256i xor256 = _mm256_xor_si256(a256, b256); // xnor = xor(a,b)
__m256i all_1 = _mm256_set1_epi8(255);
__m256i all_1 = _mm256_set1_epi8((char)255);
__m256i xnor256 = _mm256_andnot_si256(xor256, all_1); // xnor = not(xor(a,b))
// waiting for - CPUID Flags: AVX512VPOPCNTDQ: __m512i _mm512_popcnt_epi32(__m512i a)
@@ -1162,7 +1162,7 @@ static inline int popcnt256_custom(__m256i n) {
}
static inline void xnor_avx2_popcnt(__m256i a_bit256, __m256i b_bit256, __m256i *count_sum) {
__m256i c_bit256 = _mm256_set1_epi8(255);
__m256i c_bit256 = _mm256_set1_epi8((char)255);
__m256i xor256 = _mm256_xor_si256(a_bit256, b_bit256); // xnor = not(xor(a,b))
c_bit256 = _mm256_andnot_si256(xor256, c_bit256); // can be optimized - we can do other NOT for weights once and do not do this NOT
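The two gemm.c hunks above make the same pair of changes: the all-ones constant gains a (char)255 cast, since _mm256_set1_epi8 takes a signed char and a bare 255 triggers narrowing-conversion warnings on some compilers, and the comments document the XNOR trick: AVX2 has no vector NOT, so xnor(a, b) is computed as andnot(xor(a, b), all_ones). A self-contained sketch of that step in plain C with AVX2 intrinsics, assuming GCC/Clang popcount builtins (function name is illustrative):

#include <immintrin.h>  /* compile with -mavx2 */
#include <stdint.h>

/* popcount(xnor(a, b)) over two 256-bit vectors: counts matching bits.
   andnot(x, all_1) = ~x & all_1 = ~x, which supplies the missing NOT. */
static inline int xnor_popcnt_256(__m256i a, __m256i b) {
    __m256i all_1   = _mm256_set1_epi8((char)255);
    __m256i xor256  = _mm256_xor_si256(a, b);
    __m256i xnor256 = _mm256_andnot_si256(xor256, all_1);
    /* AVX2 has no 256-bit popcount; fall back to four 64-bit popcounts */
    return __builtin_popcountll((uint64_t)_mm256_extract_epi64(xnor256, 0))
         + __builtin_popcountll((uint64_t)_mm256_extract_epi64(xnor256, 1))
         + __builtin_popcountll((uint64_t)_mm256_extract_epi64(xnor256, 2))
         + __builtin_popcountll((uint64_t)_mm256_extract_epi64(xnor256, 3));
}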

src/im2col_kernels.cu

@@ -228,8 +228,8 @@ __global__ void im2col_align_bin_gpu_kernel(const int n, const float* data_im,
const int height_col, const int width_col,
float *data_col, const int bit_align)
{
__shared__ float tmp_s[1];
__shared__ ulonglong4 tmp256_s[1];
//__shared__ float tmp_s[1];
//__shared__ ulonglong4 tmp256_s[1];
//#define SHRED_VALS ((BLOCK / 169) * )
@@ -414,7 +414,7 @@ __global__ void float_to_bit_gpu_kernel(float *src, unsigned char *dst, size_t s
if ((index + i * 1024) < size) src_val = src[index + i*1024];
else src_val = 0;
//unsigned int bit_mask = __ballot_sync(0xffffffff, src_val > 0);
const int num_of_warps = blockDim.x / WARP_SIZE;
//const int num_of_warps = blockDim.x / WARP_SIZE;
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
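float_to_bit_gpu_kernel packs the sign of each float into one bit; the commented __ballot_sync line is the warp-vote variant, while the surviving path derives warp and lane ids by hand. The packing convention itself is simple. A plain-C reference under the same convention, bit i of dst set iff src[i] > 0 (function name is mine, not darknet's):

#include <stddef.h>

/* Pack float signs into a bit array: bit i of dst is 1 iff src[i] > 0.
   Both branches write, so dst does not need to be zeroed beforehand. */
static void float_to_bit_ref(const float *src, unsigned char *dst, size_t size) {
    for (size_t i = 0; i < size; ++i) {
        if (src[i] > 0) dst[i / 8] |= (unsigned char)(1u << (i % 8));
        else            dst[i / 8] &= (unsigned char)~(1u << (i % 8));
    }
}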
@@ -436,7 +436,7 @@ void float_to_bit_gpu(float *src, unsigned char *dst, size_t size)
}
// --------------------------------
/*
__device__ __host__ static inline void remove_bit(unsigned char *const dst, size_t index) {
size_t dst_i = index / 8;
int dst_shift = index % 8;
@@ -449,6 +449,7 @@ __device__ __host__ static inline void set_bit(unsigned char *const dst, size_t
dst[dst_i] |= 1 << dst_shift;
//dst[dst_i] |= 1 << (8 - dst_shift);
}
*/
__device__ __host__ static inline unsigned char get_bit(unsigned char const*const src, size_t index) {
size_t src_i = index / 8;
@@ -643,9 +644,9 @@ __global__ void transpose_bin_gpu_kernel_32(uint32_t *A, uint32_t *B, const int
void transpose_bin_gpu(unsigned char *A, unsigned char *B, const int n, const int m,
const int lda, const int ldb, const int block_size)
{
int size = n*m/ (8*8) + 1;
//int size = n*m/ (8*8) + 1;
int size32 = n*m / (32*32) + 1;
const int num_blocks = size / BLOCK + 1;
//const int num_blocks = size / BLOCK + 1;
const int num_blocks32 = size32 / BLOCK_TRANSPOSE32 + 1;
transpose_bin_gpu_kernel_32 << <num_blocks32, BLOCK_TRANSPOSE32, 0, get_cuda_stream() >> >((uint32_t *)A, (uint32_t *)B, n, m, lda, ldb, block_size);
//transpose_bin_gpu_kernel << <num_blocks, BLOCK, 0, get_cuda_stream() >> >(A, B, n, m, lda, ldb, block_size);
@@ -692,10 +693,10 @@ __global__ void transpose_uint32_kernel_2(uint32_t *src, uint32_t *dst, int src_
//new_ldb - aligned (k) by 256
const int src_w_align = src_w + (32 - src_w % 32);
const int src_h_align = src_h + (32 - src_h % 32);
//const int src_h_align = src_h + (32 - src_h % 32);
const int warps_in_width = src_w_align / 32;
const int warps_in_height = src_h_align / 32;
//const int warps_in_height = src_h_align / 32;
@@ -789,7 +790,7 @@ void repack_input_gpu(float *input, float *re_packed_input, int w, int h, int c)
// 256 channels -> 8 channels (with 32 floats)
__global__ void repack_input_kernel_2(float *input, float *re_packed_input, int w, int h, int c)
{
__shared__ uint32_t tmp[33 * 32]; // 33x32 is misaligned 32 x 32 to avoid bank conflicts
//__shared__ uint32_t tmp[33 * 32]; // 33x32 is misaligned 32 x 32 to avoid bank conflicts
int index = blockIdx.x*blockDim.x + threadIdx.x;
@@ -943,15 +944,15 @@ void fill_int8_gpu(unsigned char *src, unsigned char val, size_t size) {
//typedef unsigned int uint32_t;
//typedef unsigned char uint8_t;
//typedef char int8_t;
/*
__device__ __host__ static inline uint64_t broadcast_bit_1_to_64(uint8_t src) {
return (src > 0) ? 0xFFFFFFFFFFFFFFFF : 0;
}
*/
__device__ __host__ static inline uint8_t xnor_bit1(uint8_t a, uint8_t b) {
return ~(a^b) & 0b1;
}
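xnor_bit1 is the scalar reference for the whole binary-GEMM scheme: with +1/-1 values stored as single bits, a dot product reduces to counting matching bit positions, matches = popcount(xnor(a, b)), then mapping the count back to signed arithmetic as dot = 2 * matches - n. A minimal plain-C illustration, assuming n_bits is a multiple of 64 (helper name is mine):

#include <stdint.h>

/* Dot product of two bit-packed +1/-1 vectors of length n_bits:
   matches - mismatches = 2 * matches - n_bits. */
static int binary_dot(const uint64_t *a, const uint64_t *b, int n_bits) {
    int matches = 0;
    for (int i = 0; i < n_bits / 64; ++i)
        matches += __builtin_popcountll(~(a[i] ^ b[i]));
    return 2 * matches - n_bits;
}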
/*
__device__ __host__ static inline uint32_t xnor_int32(uint32_t a, uint32_t b) {
return ~(a^b);
}
@@ -977,13 +978,13 @@ __device__ __host__ static inline ulonglong4 xnor_int256(ulonglong4 a, ulonglong
res.z = ~(a.z^b.z);
return res;
}
*/
//-------
/*
__device__ __host__ static inline uint8_t xor_bit1(uint8_t a, uint8_t b) {
return (a^b) & 0b1;
}
*/
__device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) {
return (a^b);
}
@@ -991,7 +992,7 @@ __device__ __host__ static inline uint32_t xor_int32(uint32_t a, uint32_t b) {
__device__ __host__ static inline uint64_t xor_int64(uint64_t a, uint64_t b) {
return (a^b);
}
/*
__device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) {
uint4 res;
res.w = (a.w^b.w);
@@ -1000,7 +1001,7 @@ __device__ __host__ static inline uint4 xor_int128(uint4 a, uint4 b) {
res.z = (a.z^b.z);
return res;
}
*/
__device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4 b) {
ulonglong4 res;
res.w = (a.w^b.w);
@@ -1010,12 +1011,11 @@ __device__ __host__ static inline ulonglong4 xor_int256(ulonglong4 a, ulonglong4
return res;
}
/*
__device__ static inline int popcnt_256(ulonglong4 a) {
return __popcll(a.w) + __popcll(a.x) + __popcll(a.y) + __popcll(a.z);
}
/*
__global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int K,
unsigned char *A, int lda,
unsigned char *B, int ldb,
@@ -1269,7 +1269,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
*/
int i, j, k, h;
int i, j, k;//, h;
// 47% = 29 + 10 + 8
j = global_warp_id % (N_aligned / WMMA_Nx2);
j = j * WMMA_Nx2;
@@ -1277,7 +1277,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
i = global_warp_id / (N_aligned / WMMA_Nx2);
i = i * WMMA_M;
int count = 0;
//int count = 0;
k = 0;
if (i < M) //if (i < M) // l.n - filters [16 - 55 - 1024]
@@ -1323,7 +1323,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
// Custom XOR-GEMM
int k_d = lane_id % 4;
int i_d = lane_id / 4;
int j_d = lane_id / 4;
//int j_d = lane_id / 4;
int32_t accum_c_val[8*2]; // wmma::fill_fragment(c_frag, 0);
for (int local_j = 0; local_j < 8*2; ++local_j) {
@@ -1333,9 +1333,9 @@ __global__ void gemm_nn_custom_bin_mean_transposed_tensor_kernel(int M, int N, i
// 8 x 8 x 4 (uint32_t, 4 * 32 = 128 bit)
for (; k < K; k += 128) // l.size*l.size*l.c - one filter size [27 - 144 - 9216]
{
int64_t A_cur_index = (i*lda + k) / 8;
//int64_t A_cur_index = (i*lda + k) / 8;
//int64_t A_cur_index = (local_i*lda + k) / 8;
int64_t B_cur_index = (j*ldb + k) / 8;
//int64_t B_cur_index = (j*ldb + k) / 8;
// lda, ldb - are in bits
// 8*4 = 32
@@ -1593,7 +1593,7 @@ __global__ void gemm_nn_custom_bin_mean_transposed_gpu_kernel(int M, int N, int
}
__syncthreads();
int i, j, k, h;
int i, j, k; //, h;
// 47% = 29 + 10 + 8
j = index % N;
{ // out_h*out_w - one channel output size [169 - 173056]
@@ -2115,7 +2115,7 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
for (chan = 0; chan < in_c; ++chan)
{
//int const weights_pre_index = fil*in_c*size*size + chan*size*size;
int const weights_pre_index = fil*new_lda + chan*size*size;
//int const weights_pre_index = fil*new_lda + chan*size*size;
int const input_pre_index = chan*in_w*in_h;
__shared__ uint32_t input_shared[416*416/32 + 1]; // 21.2 KB bytes (for input size 832x832)
@@ -2139,8 +2139,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
}
__syncthreads();
*/
int src_index = -1;
uint32_t input_byte;
//int src_index = -1;
//uint32_t input_byte;
if (fil < n) // (1-6 for one BLOCK)
{
@@ -2154,8 +2154,8 @@ __global__ void convolve_bin_gpu_kernel(float *input, float *weights, float *out
int input_x = x + f_x - pad;
if (input_y < 0 || input_x < 0 || input_y >= in_h || input_x >= in_w) continue;
int input_index = input_pre_index + input_y*in_w + input_x;
int weights_index = weights_pre_index + f_y*size + f_x;
//int input_index = input_pre_index + input_y*in_w + input_x;
//int weights_index = weights_pre_index + f_y*size + f_x;
//int weights_index = fil*in_c*size*size + chan*size*size + f_y*size + f_x;
//int weights_index = fil*new_lda + chan*size*size + f_y*size + f_x;
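The commented-out index math above is plain row-major addressing into weights laid out as [filters][channels][size][size]; the bit-packed path replaces the channel stride in_c*size*size with the aligned new_lda. A sketch of the unpacked form (helper name is mine, not darknet's):

/* Row-major index into conv weights shaped [n][in_c][size][size]:
   fil*in_c*size*size + chan*size*size + f_y*size + f_x, factored. */
static inline int weight_index(int fil, int chan, int f_y, int f_x,
                               int in_c, int size) {
    return ((fil * in_c + chan) * size + f_y) * size + f_x;
}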