darknet/src/crop_layer_kernels.cu

extern "C" {
#include "crop_layer.h"
#include "utils.h"
#include "cuda.h"
#include "image.h"
}

/*
 * Fetch one sample from a planar image buffer.
 *
 * image : base pointer; the sample for (x, y, c) is read at x + w*(y + c*h)
 * w, h  : plane width and height, used for both the bounds test and indexing
 * x, y  : pixel coordinates; anything outside [0,w) x [0,h) yields 0
 * c     : channel (plane) index; it is NOT range-checked here
 */
__device__ float get_pixel_kernel(float *image, int w, int h, int x, int y, int c)
{
    const bool inside = (x >= 0) && (x < w) && (y >= 0) && (y < h);
    if(!inside) return 0;

    // Row index counted across all planes stacked vertically.
    const int plane_row = y + c*h;
    return image[x + w*plane_row];
}

/*
 * Convert an RGB triple to HSV.
 *
 * rgb     : (r, g, b) in x, y, z
 * returns : (h, s, v) in x, y, z where
 *             v = max(r, g, b)
 *             s = (max - min) / max, or 0 when the hue is undefined
 *             h = hue expressed in sextants, in [0, 6); -1 when undefined
 *
 * The hue is undefined when max == 0 or when all three components are
 * equal (max - min == 0). Both cases report h = -1 and s = 0, so the hue
 * formula never evaluates 0/0.
 */
__device__ float3 rgb_to_hsv_kernel(float3 rgb)
{
    float r = rgb.x;
    float g = rgb.y;
    float b = rgb.z;

    float h, s, v;
    float max = (r > g) ? ( (r > b) ? r : b) : ( (g > b) ? g : b);
    float min = (r < g) ? ( (r < b) ? r : b) : ( (g < b) ? g : b);
    float delta = max - min;
    v = max;
    if(max == 0 || delta == 0){
        // Black or gray: no dominant channel, so there is no hue to report.
        s = 0;
        h = -1;
    }else{
        s = delta/max;
        if(r == max){
            h = (g - b) / delta;
        } else if (g == max) {
            h = 2 + (b - r) / delta;
        } else {
            h = 4 + (r - g) / delta;
        }
        if (h < 0) h += 6;
        // A tiny negative hue can round to exactly 6 after the wrap above;
        // fold it back so the result stays inside [0, 6).
        if (h >= 6) h -= 6;
    }
    return make_float3(h, s, v);
}

/*
 * Convert an HSV triple back to RGB.
 *
 * hsv     : (h, s, v) in x, y, z; h is measured in sextants (one full turn
 *           of the colour wheel is 6) and is ignored when s == 0
 * returns : (r, g, b) in x, y, z, each clamped to [0, 1]
 *
 * The sextant index is wrapped modulo 6, so h == 6 is treated as h == 0
 * (pure red) instead of falling into the last sextant with a zero fraction.
 */
__device__ float3 hsv_to_rgb_kernel(float3 hsv)
{
    float h = hsv.x;
    float s = hsv.y;
    float v = hsv.z;

    float r, g, b;
    float f, p, q, t;

    if (s == 0) {
        // No saturation: every channel equals the value, hue is irrelevant.
        r = g = b = v;
    } else {
        float sector = floorf(h);
        f = h - sector;                 // position inside the sextant, [0,1)
        int index = ((int) sector) % 6; // wrap the colour wheel
        if (index < 0) index += 6;      // C++ '%' keeps the sign of the dividend

        p = v*(1-s);
        q = v*(1-s*f);
        t = v*(1-s*(1-f));
        switch(index){
            case 0:  r = v; g = t; b = p; break;
            case 1:  r = q; g = v; b = p; break;
            case 2:  r = p; g = v; b = t; break;
            case 3:  r = p; g = q; b = v; break;
            case 4:  r = t; g = p; b = v; break;
            default: r = v; g = p; b = q; break;
        }
    }
    r = (r < 0) ? 0 : ((r > 1) ? 1 : r);
    g = (g < 0) ? 0 : ((g > 1) ? 1 : g);
    b = (b < 0) ? 0 : ((b > 1) ? 1 : b);
    return make_float3(r, g, b);
}

/*
 * Sample channel c of a planar image at the fractional position (x, y)
 * using bilinear interpolation of the four surrounding pixels.
 *
 * Neighbours are fetched through get_pixel_kernel, so any neighbour that
 * lies outside the w x h plane contributes 0 to the result.
 */
__device__ float bilinear_interpolate_kernel(float *image, int w, int h, float x, float y, int c)
{
    const int left   = (int) floorf(x);
    const int top    = (int) floorf(y);
    const int right  = left + 1;
    const int bottom = top + 1;

    // Fractional distance from the top-left neighbour.
    const float fx = x - left;
    const float fy = y - top;

    const float top_left     = (1-fy) * (1-fx) * get_pixel_kernel(image, w, h, left,  top,    c);
    const float bottom_left  = fy     * (1-fx) * get_pixel_kernel(image, w, h, left,  bottom, c);
    const float top_right    = (1-fy) *   fx   * get_pixel_kernel(image, w, h, right, top,    c);
    const float bottom_right = fy     *   fx   * get_pixel_kernel(image, w, h, right, bottom, c);

    return top_left + bottom_left + top_right + bottom_right;
}

/*
 * In-place colour adjustment of a batch of 3-channel planar images.
 *
 * Launch layout: one thread per (batch item, y, x) pixel, i.e. at least
 * batch*w*h threads. The flat thread id is built from a 2-D grid of 1-D
 * blocks: (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x.
 * Threads past batch*w*h exit immediately.
 *
 * image      : batch images, each stored as 3 consecutive w*h planes;
 *              overwritten with the adjusted values
 * rand       : 8 values per batch item; slots 0..3 of item `id` drive the
 *              saturation/exposure jitter, and rand[0..2] are used as the
 *              per-channel shift for every item
 *              NOTE(review): rand[0..2] are also item 0's jitter slots, so
 *              the shift is shared across the batch - confirm intended.
 * train      : non-zero enables the HSV jitter and the shift; zero applies
 *              only scale/translate
 * saturation, exposure : upper bounds of the jitter factors; each factor is
 *              drawn as r*(bound-1)+1 and inverted when a second random
 *              value exceeds 0.5
 * translate, scale : output = value*scale + translate (+ shift term)
 * shift      : magnitude of the per-channel shift (rand - 0.5)*shift
 */
__global__ void levels_image_kernel(float *image, float *rand, int batch, int w, int h, int train, float saturation, float exposure, float translate, float scale, float shift)
{
    int size = batch * w * h;
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= size) return;

    // Peel the flat index apart: x fastest, then y, remainder is the batch item.
    int x = id % w;
    id /= w;
    int y = id % h;
    id /= h;

    float rshift = rand[0];
    float gshift = rand[1];
    float bshift = rand[2];

    // Do the address arithmetic in size_t so large batches cannot overflow int.
    size_t plane = (size_t)w * h;
    size_t pixel = (size_t)w * y + x;
    image += (size_t)id * plane * 3;

    float *rp = image + pixel;
    float *gp = rp + plane;
    float *bp = gp + plane;

    float3 rgb = make_float3(*rp, *gp, *bp);
    if(train){
        // The jitter factors are only needed while training.
        float r0 = rand[8*id + 0];
        float r1 = rand[8*id + 1];
        float r2 = rand[8*id + 2];
        float r3 = rand[8*id + 3];

        saturation = r0*(saturation - 1) + 1;
        saturation = (r1 > .5f) ? 1.f/saturation : saturation;
        exposure = r2*(exposure - 1) + 1;
        exposure = (r3 > .5f) ? 1.f/exposure : exposure;

        float3 hsv = rgb_to_hsv_kernel(rgb);
        hsv.y *= saturation;
        hsv.z *= exposure;
        rgb = hsv_to_rgb_kernel(hsv);
    } else {
        shift = 0;
    }
    // Float literals keep this arithmetic in single precision.
    *rp = rgb.x*scale + translate + (rshift - .5f)*shift;
    *gp = rgb.y*scale + translate + (gshift - .5f)*shift;
    *bp = rgb.z*scale + translate + (bshift - .5f)*shift;
}

/*
 * Produce the cropped (optionally flipped and rotated) output of the crop
 * layer, one output element per thread.
 *
 * Launch layout: at least `size` threads; the flat thread id is
 * (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x and is
 * decomposed as (b, k, i, j) = (batch item, channel, row, column) of the
 * crop_height x crop_width output. Threads with id >= size exit.
 *
 * input   : batch of planar c x h x w images
 * rand    : 8 values per batch item; slots 4..7 of item b drive the crop
 *           offset in x, the crop offset in y, the flip decision, and the
 *           rotation angle
 * size    : number of output elements
 * train   : non-zero uses the random crop/flip/rotation; zero takes the
 *           centred crop with no flip and no rotation
 * flip    : non-zero allows horizontal flipping (applied when r6 > 0.5)
 * angle   : maximum rotation in radians; the applied angle is drawn from
 *           [-angle, angle]
 * output  : receives the bilinearly interpolated samples, indexed by the
 *           flat thread id
 */
__global__ void forward_crop_layer_kernel(float *input, float *rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, float *output)
{
    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
    if(id >= size) return;

    // Rotation centre.
    float cx = w/2.f;
    float cy = h/2.f;

    int count = id;
    int j = id % crop_width;
    id /= crop_width;
    int i = id % crop_height;
    id /= crop_height;
    int k = id % c;
    id /= c;
    int b = id;

    float dw, dh;
    if(train){
        float r4 = rand[8*b + 4];
        float r5 = rand[8*b + 5];
        float r6 = rand[8*b + 6];
        float r7 = rand[8*b + 7];

        dw = (w - crop_width)*r4;
        dh = (h - crop_height)*r5;
        flip = (flip && (r6 > .5f));
        angle = 2*angle*r7 - angle;
    } else {
        dw = (w - crop_width)/2.f;
        dh = (h - crop_height)/2.f;
        flip = 0;
        angle = 0;
    }

    // Widen before multiplying so the per-image offset cannot overflow int.
    input += (size_t)w*h*c*b;

    float x = (flip) ? w - dw - j - 1 : j + dw;
    float y = i + dh;

    // Rotate (x, y) about the centre by `angle`.
    float cos_a = cosf(angle);
    float sin_a = sinf(angle);
    float rx = cos_a*(x-cx) - sin_a*(y-cy) + cx;
    float ry = sin_a*(x-cx) + cos_a*(y-cy) + cy;

    output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k);
}

/*
 * Host entry point for the crop layer's GPU forward pass.
 *
 * Steps:
 *   1. Refill layer.rand_gpu via cuda_random with 8 values per batch item
 *      (the kernels index it as rand[8*item + 0..7]).
 *   2. Launch levels_image_kernel over batch*w*h pixels; it rewrites
 *      state.input in place.
 *   3. Launch forward_crop_layer_kernel over every output element, writing
 *      layer.output_gpu.
 *
 * Values are mapped with scale 2 / translate -1 unless layer.noadjust is
 * set, in which case scale 1 / translate 0 is used.
 *
 * NOTE(review): levels_image_kernel indexes three planes per image and is
 * launched regardless of layer.c - confirm callers only use c == 3.
 */
extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state)
{
    cuda_random(layer.rand_gpu, layer.batch*8);

    // layer.angle is converted from degrees to radians for the kernel.
    const float radians = layer.angle*3.14159/180.;

    const float scale     = layer.noadjust ? 1 : 2;
    const float translate = layer.noadjust ? 0 : -1;

    // Pass 1: one thread per input pixel.
    const int pixel_count = layer.batch * layer.w * layer.h;
    levels_image_kernel<<<cuda_gridsize(pixel_count), BLOCK>>>(state.input, layer.rand_gpu, layer.batch, layer.w, layer.h, state.train, layer.saturation, layer.exposure, translate, scale, layer.shift);
    check_error(cudaPeekAtLastError());

    // Pass 2: one thread per output element.
    const int output_count = layer.batch*layer.c*layer.crop_width*layer.crop_height;
    forward_crop_layer_kernel<<<cuda_gridsize(output_count), BLOCK>>>(state.input, layer.rand_gpu, output_count, layer.c, layer.h, layer.w, layer.crop_height, layer.crop_width, state.train, layer.flip, radians, layer.output_gpu);
    check_error(cudaPeekAtLastError());

/*
       Disabled debug visualisation of the first three crops:

       cuda_pull_array(layer.output_gpu, layer.output, output_count);
       image im = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 0*(output_count/layer.batch));
       image im2 = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 1*(output_count/layer.batch));
       image im3 = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 2*(output_count/layer.batch));

       translate_image(im, -translate);
       scale_image(im, 1/scale);
       translate_image(im2, -translate);
       scale_image(im2, 1/scale);
       translate_image(im3, -translate);
       scale_image(im3, 1/scale);

       show_image(im, "cropped");
       show_image(im2, "cropped2");
       show_image(im3, "cropped3");
       cvWaitKey(0);
       */
}
CUDA so fast 2015-01-23 03:38:24 +03:00			`extern "C" {`
			`#include "crop_layer.h"`
not much changed... 2015-04-11 11:24:07 +03:00			`#include "utils.h"`
CUDA so fast 2015-01-23 03:38:24 +03:00			`#include "cuda.h"`
not much changed... 2015-04-11 11:24:07 +03:00			`#include "image.h"`
CUDA so fast 2015-01-23 03:38:24 +03:00			`}`

not much changed... 2015-04-11 11:24:07 +03:00			`__device__ float get_pixel_kernel(float *image, int w, int h, int x, int y, int c)`
			`{`
			`if(x < 0 \|\| x >= w \|\| y < 0 \|\| y >= h) return 0;`
			`return image[x + w(y + ch)];`
			`}`

level adjustment for images 2015-04-15 10:32:32 +03:00			`__device__ float3 rgb_to_hsv_kernel(float3 rgb)`
			`{`
			`float r = rgb.x;`
			`float g = rgb.y;`
			`float b = rgb.z;`

			`float h, s, v;`
			`float max = (r > g) ? ( (r > b) ? r : b) : ( (g > b) ? g : b);`
			`float min = (r < g) ? ( (r < b) ? r : b) : ( (g < b) ? g : b);`
			`float delta = max - min;`
			`v = max;`
			`if(max == 0){`
			`s = 0;`
			`h = -1;`
			`}else{`
			`s = delta/max;`
			`if(r == max){`
			`h = (g - b) / delta;`
			`} else if (g == max) {`
			`h = 2 + (b - r) / delta;`
			`} else {`
			`h = 4 + (r - g) / delta;`
			`}`
			`if (h < 0) h += 6;`
			`}`
			`return make_float3(h, s, v);`
			`}`

			`__device__ float3 hsv_to_rgb_kernel(float3 hsv)`
			`{`
			`float h = hsv.x;`
			`float s = hsv.y;`
			`float v = hsv.z;`

			`float r, g, b;`
			`float f, p, q, t;`

			`if (s == 0) {`
			`r = g = b = v;`
			`} else {`
			`int index = (int) floorf(h);`
			`f = h - index;`
			`p = v*(1-s);`
			`q = v(1-sf);`
			`t = v(1-s(1-f));`
			`if(index == 0){`
			`r = v; g = t; b = p;`
			`} else if(index == 1){`
			`r = q; g = v; b = p;`
			`} else if(index == 2){`
			`r = p; g = v; b = t;`
			`} else if(index == 3){`
			`r = p; g = q; b = v;`
			`} else if(index == 4){`
			`r = t; g = p; b = v;`
			`} else {`
			`r = v; g = p; b = q;`
			`}`
			`}`
			`r = (r < 0) ? 0 : ((r > 1) ? 1 : r);`
			`g = (g < 0) ? 0 : ((g > 1) ? 1 : g);`
			`b = (b < 0) ? 0 : ((b > 1) ? 1 : b);`
			`return make_float3(r, g, b);`
			`}`

Switch to fast resize 2015-07-21 21:46:24 +03:00			`__device__ float bilinear_interpolate_kernel(float *image, int w, int h, float x, float y, int c)`
not much changed... 2015-04-11 11:24:07 +03:00			`{`
			`int ix = (int) floorf(x);`
			`int iy = (int) floorf(y);`

			`float dx = x - ix;`
			`float dy = y - iy;`

			`float val = (1-dy) * (1-dx) * get_pixel_kernel(image, w, h, ix, iy, c) +`
level adjustment for images 2015-04-15 10:32:32 +03:00			`dy * (1-dx) * get_pixel_kernel(image, w, h, ix, iy+1, c) +`
			`(1-dy) * dx * get_pixel_kernel(image, w, h, ix+1, iy, c) +`
			`dy * dx * get_pixel_kernel(image, w, h, ix+1, iy+1, c);`
not much changed... 2015-04-11 11:24:07 +03:00			`return val;`
			`}`

CVPR Experiments 2015-11-04 06:23:17 +03:00			`__global__ void levels_image_kernel(float image, float rand, int batch, int w, int h, int train, float saturation, float exposure, float translate, float scale, float shift)`
level adjustment for images 2015-04-15 10:32:32 +03:00			`{`
			`int size = batch * w * h;`
			`int id = (blockIdx.x + blockIdx.ygridDim.x) blockDim.x + threadIdx.x;`
			`if(id >= size) return;`
			`int x = id % w;`
			`id /= w;`
			`int y = id % h;`
			`id /= h;`
CVPR Experiments 2015-11-04 06:23:17 +03:00			`float rshift = rand[0];`
			`float gshift = rand[1];`
			`float bshift = rand[2];`
per image randomness in crop layer 2015-04-17 22:32:54 +03:00			`float r0 = rand[8*id + 0];`
			`float r1 = rand[8*id + 1];`
			`float r2 = rand[8*id + 2];`
			`float r3 = rand[8*id + 3];`

			`saturation = r0*(saturation - 1) + 1;`
			`saturation = (r1 > .5) ? 1./saturation : saturation;`
			`exposure = r2*(exposure - 1) + 1;`
			`exposure = (r3 > .5) ? 1./exposure : exposure;`

level adjustment for images 2015-04-15 10:32:32 +03:00			`size_t offset = id * h * w * 3;`
			`image += offset;`
NIPS 2015-06-10 10:11:41 +03:00			`float r = image[x + w(y + h0)];`
level adjustment for images 2015-04-15 10:32:32 +03:00			`float g = image[x + w(y + h1)];`
NIPS 2015-06-10 10:11:41 +03:00			`float b = image[x + w(y + h2)];`
level adjustment for images 2015-04-15 10:32:32 +03:00			`float3 rgb = make_float3(r,g,b);`
per image randomness in crop layer 2015-04-17 22:32:54 +03:00			`if(train){`
			`float3 hsv = rgb_to_hsv_kernel(rgb);`
			`hsv.y *= saturation;`
			`hsv.z *= exposure;`
			`rgb = hsv_to_rgb_kernel(hsv);`
CVPR Experiments 2015-11-04 06:23:17 +03:00			`} else {`
			`shift = 0;`
per image randomness in crop layer 2015-04-17 22:32:54 +03:00			`}`
CVPR Experiments 2015-11-04 06:23:17 +03:00			`image[x + w(y + h0)] = rgb.xscale + translate + (rshift - .5)shift;`
			`image[x + w(y + h1)] = rgb.yscale + translate + (gshift - .5)shift;`
			`image[x + w(y + h2)] = rgb.zscale + translate + (bshift - .5)shift;`
level adjustment for images 2015-04-15 10:32:32 +03:00			`}`

per image randomness in crop layer 2015-04-17 22:32:54 +03:00			`__global__ void forward_crop_layer_kernel(float input, float rand, int size, int c, int h, int w, int crop_height, int crop_width, int train, int flip, float angle, float *output)`
CUDA so fast 2015-01-23 03:38:24 +03:00			`{`
			`int id = (blockIdx.x + blockIdx.ygridDim.x) blockDim.x + threadIdx.x;`
			`if(id >= size) return;`

not much changed... 2015-04-11 11:24:07 +03:00			`float cx = w/2.;`
			`float cy = h/2.;`

CUDA so fast 2015-01-23 03:38:24 +03:00			`int count = id;`
			`int j = id % crop_width;`
			`id /= crop_width;`
			`int i = id % crop_height;`
			`id /= crop_height;`
			`int k = id % c;`
			`id /= c;`
			`int b = id;`
not much changed... 2015-04-11 11:24:07 +03:00
per image randomness in crop layer 2015-04-17 22:32:54 +03:00			`float r4 = rand[8*b + 4];`
			`float r5 = rand[8*b + 5];`
			`float r6 = rand[8*b + 6];`
			`float r7 = rand[8*b + 7];`

			`float dw = (w - crop_width)*r4;`
			`float dh = (h - crop_height)*r5;`
			`flip = (flip && (r6 > .5));`
			`angle = 2angler7 - angle;`
			`if(!train){`
			`dw = (w - crop_width)/2.;`
			`dh = (h - crop_height)/2.;`
			`flip = 0;`
			`angle = 0;`
			`}`

not much changed... 2015-04-11 11:24:07 +03:00			`input += whc*b;`

per image randomness in crop layer 2015-04-17 22:32:54 +03:00			`float x = (flip) ? w - dw - j - 1 : j + dw;`
			`float y = i + dh;`
not much changed... 2015-04-11 11:24:07 +03:00
			`float rx = cos(angle)(x-cx) - sin(angle)(y-cy) + cx;`
			`float ry = sin(angle)(x-cx) + cos(angle)(y-cy) + cy;`

Switch to fast resize 2015-07-21 21:46:24 +03:00			`output[count] = bilinear_interpolate_kernel(input, w, h, rx, ry, k);`
CUDA so fast 2015-01-23 03:38:24 +03:00			`}`

refactoring and added DARK ZONE 2015-03-12 08:20:15 +03:00			`extern "C" void forward_crop_layer_gpu(crop_layer layer, network_state state)`
CUDA so fast 2015-01-23 03:38:24 +03:00			`{`
per image randomness in crop layer 2015-04-17 22:32:54 +03:00			`cuda_random(layer.rand_gpu, layer.batch*8);`
level adjustment for images 2015-04-15 10:32:32 +03:00
per image randomness in crop layer 2015-04-17 22:32:54 +03:00			`float radians = layer.angle*3.14159/180.;`
level adjustment for images 2015-04-15 10:32:32 +03:00
			`float scale = 2;`
			`float translate = -1;`
nochange option in crop 2015-07-14 09:25:08 +03:00			`if(layer.noadjust){`
			`scale = 1;`
			`translate = 0;`
			`}`
level adjustment for images 2015-04-15 10:32:32 +03:00
			`int size = layer.batch * layer.w * layer.h;`

CVPR Experiments 2015-11-04 06:23:17 +03:00			`levels_image_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, layer.batch, layer.w, layer.h, state.train, layer.saturation, layer.exposure, translate, scale, layer.shift);`
level adjustment for images 2015-04-15 10:32:32 +03:00			`check_error(cudaPeekAtLastError());`
per image randomness in crop layer 2015-04-17 22:32:54 +03:00
level adjustment for images 2015-04-15 10:32:32 +03:00			`size = layer.batchlayer.clayer.crop_width*layer.crop_height;`
CUDA so fast 2015-01-23 03:38:24 +03:00
per image randomness in crop layer 2015-04-17 22:32:54 +03:00			`forward_crop_layer_kernel<<<cuda_gridsize(size), BLOCK>>>(state.input, layer.rand_gpu, size, layer.c, layer.h, layer.w, layer.crop_height, layer.crop_width, state.train, layer.flip, radians, layer.output_gpu);`
CUDA so fast 2015-01-23 03:38:24 +03:00			`check_error(cudaPeekAtLastError());`
probably stuff changed 2015-04-14 00:09:55 +03:00
			`/*`
level adjustment for images 2015-04-15 10:32:32 +03:00			`cuda_pull_array(layer.output_gpu, layer.output, size);`
			`image im = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 0*(size/layer.batch));`
			`image im2 = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 1*(size/layer.batch));`
			`image im3 = float_to_image(layer.crop_width, layer.crop_height, layer.c, layer.output + 2*(size/layer.batch));`
per image randomness in crop layer 2015-04-17 22:32:54 +03:00
			`translate_image(im, -translate);`
			`scale_image(im, 1/scale);`
			`translate_image(im2, -translate);`
			`scale_image(im2, 1/scale);`
			`translate_image(im3, -translate);`
			`scale_image(im3, 1/scale);`

level adjustment for images 2015-04-15 10:32:32 +03:00			`show_image(im, "cropped");`
			`show_image(im2, "cropped2");`
			`show_image(im3, "cropped3");`
			`cvWaitKey(0);`
			`*/`
CUDA so fast 2015-01-23 03:38:24 +03:00			`}`