extern "C" { #include "blas.h" #include "cuda.h" } __global__ void axpy_kernel(int N, float ALPHA, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if(i < N) Y[OFFY+i*INCY] += ALPHA*X[OFFX+i*INCX]; } __global__ void scal_kernel(int N, float ALPHA, float *X, int INCX) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if(i < N) X[i*INCX] *= ALPHA; } __global__ void mask_kernel(int n, float *x, float *mask) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if(i < n && mask[i] == 0) x[i] = 0; } __global__ void copy_kernel(int N, float *X, int OFFX, int INCX, float *Y, int OFFY, int INCY) { int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x; if(i < N) Y[i*INCY + OFFY] = X[i*INCX + OFFX]; } extern "C" void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY) { axpy_ongpu_offset(N, ALPHA, X, 0, INCX, Y, 0, INCY); } extern "C" void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) { axpy_kernel<<>>(N, ALPHA, X, OFFX, INCX, Y, OFFY, INCY); check_error(cudaPeekAtLastError()); } extern "C" void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY) { copy_ongpu_offset(N, X, 0, INCX, Y, 0, INCY); } extern "C" void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY) { copy_kernel<<>>(N, X, OFFX, INCX, Y, OFFY, INCY); check_error(cudaPeekAtLastError()); } extern "C" void mask_ongpu(int N, float * X, float * mask) { mask_kernel<<>>(N, X, mask); check_error(cudaPeekAtLastError()); } extern "C" void scal_ongpu(int N, float ALPHA, float * X, int INCX) { scal_kernel<<>>(N, ALPHA, X, INCX); check_error(cudaPeekAtLastError()); }