mirror of
https://github.com/pjreddie/darknet.git
synced 2023-08-10 21:13:14 +03:00
I am so done with opencl, switching to cuda
This commit is contained in:
parent
6e1d5b45de
commit
4ac78c8926
@ -210,7 +210,7 @@ void train_imagenet(char *cfgfile)
|
|||||||
//network net = parse_network_cfg("/home/pjreddie/imagenet_backup/alexnet_1270.cfg");
|
//network net = parse_network_cfg("/home/pjreddie/imagenet_backup/alexnet_1270.cfg");
|
||||||
srand(time(0));
|
srand(time(0));
|
||||||
network net = parse_network_cfg(cfgfile);
|
network net = parse_network_cfg(cfgfile);
|
||||||
set_learning_network(&net, net.learning_rate*10., net.momentum, net.decay);
|
set_learning_network(&net, net.learning_rate*100., net.momentum, net.decay);
|
||||||
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net.learning_rate, net.momentum, net.decay);
|
||||||
int imgs = 1024;
|
int imgs = 1024;
|
||||||
int i = 6600;
|
int i = 6600;
|
||||||
|
@ -164,8 +164,7 @@ cl_kernel get_gemm_nn_kernel()
|
|||||||
|
|
||||||
#define TILE 64
|
#define TILE 64
|
||||||
#define TILE_K 16
|
#define TILE_K 16
|
||||||
#define WPT 8
|
#define THREADS 64
|
||||||
#define THREADS (TILE*TILE)/(WPT*WPT)
|
|
||||||
|
|
||||||
cl_kernel get_gemm_nn_fast_kernel()
|
cl_kernel get_gemm_nn_fast_kernel()
|
||||||
{
|
{
|
||||||
@ -175,7 +174,6 @@ cl_kernel get_gemm_nn_fast_kernel()
|
|||||||
gemm_kernel = get_kernel("src/gemm_fast.cl", "gemm_nn_fast", "-D TILE=" STR(TILE)
|
gemm_kernel = get_kernel("src/gemm_fast.cl", "gemm_nn_fast", "-D TILE=" STR(TILE)
|
||||||
" -cl-nv-verbose "
|
" -cl-nv-verbose "
|
||||||
" -D TILE_K=" STR(TILE_K)
|
" -D TILE_K=" STR(TILE_K)
|
||||||
" -D WPT=" STR(WPT)
|
|
||||||
" -D THREADS=" STR(THREADS));
|
" -D THREADS=" STR(THREADS));
|
||||||
init = 1;
|
init = 1;
|
||||||
}
|
}
|
||||||
@ -464,7 +462,6 @@ void test_gpu_blas()
|
|||||||
|
|
||||||
test_gpu_accuracy(0,0,128,128,128);
|
test_gpu_accuracy(0,0,128,128,128);
|
||||||
|
|
||||||
/*
|
|
||||||
time_ongpu(0,0,64,2916,363);
|
time_ongpu(0,0,64,2916,363);
|
||||||
time_ongpu_fast(0,0,64,2916,363);
|
time_ongpu_fast(0,0,64,2916,363);
|
||||||
time_ongpu(0,0,64,2916,363);
|
time_ongpu(0,0,64,2916,363);
|
||||||
@ -483,7 +480,6 @@ void test_gpu_blas()
|
|||||||
time_ongpu_fast(0,0,128,4096,12544);
|
time_ongpu_fast(0,0,128,4096,12544);
|
||||||
time_ongpu(0,0,128,4096,4096);
|
time_ongpu(0,0,128,4096,4096);
|
||||||
time_ongpu_fast(0,0,128,4096,4096);
|
time_ongpu_fast(0,0,128,4096,4096);
|
||||||
*/
|
|
||||||
// time_ongpu(1,0,2304,196,256);
|
// time_ongpu(1,0,2304,196,256);
|
||||||
// time_ongpu_fast(1,0,2304,196,256);
|
// time_ongpu_fast(1,0,2304,196,256);
|
||||||
// time_ongpu(0,1,256,2304,196);
|
// time_ongpu(0,1,256,2304,196);
|
||||||
|
@ -16,16 +16,15 @@ __kernel void gemm_nn_fast(int TA, int TB, int M, int N, int K, float ALPHA,
|
|||||||
int ctile = get_group_id(0);
|
int ctile = get_group_id(0);
|
||||||
int rtile = get_group_id(1);
|
int rtile = get_group_id(1);
|
||||||
|
|
||||||
float Breg;
|
float Areg[TILE];
|
||||||
float Areg[WPT];
|
float acc[TILE][TILE/THREADS];
|
||||||
float acc[WPT][WPT];
|
|
||||||
|
|
||||||
A += rtile*TILE*lda;
|
A += rtile*TILE*lda;
|
||||||
B += ctile*TILE;
|
B += ctile*TILE;
|
||||||
C += rtile*TILE*ldc + ctile*TILE;
|
C += rtile*TILE*ldc + ctile*TILE;
|
||||||
|
|
||||||
for(i = 0; i < WPT; ++i){
|
for(i = 0; i < TILE; ++i){
|
||||||
for(j = 0; j < WPT; ++j){
|
for(j = 0; j < TILE/THREADS; ++j){
|
||||||
acc[i][j] = 0;
|
acc[i][j] = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -51,28 +50,26 @@ __kernel void gemm_nn_fast(int TA, int TB, int M, int N, int K, float ALPHA,
|
|||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
for(k = 0; k < TILE_K; ++k){
|
for(k = 0; k < TILE_K; ++k){
|
||||||
for(y = 0; y < WPT; ++y){
|
#pragma unroll
|
||||||
int row = (offset + (y*WPT)*THREADS)/TILE;
|
for(y = 0; y < TILE; ++y){
|
||||||
//Areg[y] = Asub[y*WPT][k];
|
Areg[y] = Asub[y][k];
|
||||||
}
|
}
|
||||||
for(y = 0; y < WPT; ++y){
|
for(x = 0; x < TILE; x += THREADS){
|
||||||
for(x = 0; x < WPT; ++x){
|
float Breg = Bsub[k][x+offset];
|
||||||
int index = offset + (y*WPT + x)*THREADS;
|
#pragma unroll
|
||||||
int row = index / TILE;
|
for(y = 0; y < TILE; ++y){
|
||||||
int col = index % TILE;
|
acc[y][x/THREADS] += Breg * Areg[y];
|
||||||
acc[y][x] += Asub[row][k]*Bsub[k][col];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
|
|
||||||
for(y = 0; y < WPT; ++y){
|
for(i = 0; i < TILE; ++i){
|
||||||
for(x = 0; x < WPT; ++x){
|
for(j = 0; j < TILE/THREADS; ++j){
|
||||||
int index = offset + (y*WPT + x)*THREADS;
|
int col = j*THREADS + offset;
|
||||||
int row = index / TILE;
|
int row = i;
|
||||||
int col = index % TILE;
|
C[row*ldc+col] = ALPHA*acc[i][j] + BETA*C[row*ldc+col];
|
||||||
C[row*ldc+col] = ALPHA*acc[y][x] + BETA*C[row*ldc+col];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user