Mirror of https://github.com/pjreddie/darknet.git
Synced 2023-08-10 21:13:14 +03:00
Merge pull request #1724 from eon129/master
Classifier now works on CPU and GPU
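In short, the change wires a softmax cross-entropy loss into the softmax layer on both the CPU and the GPU path: new softmax_x_ent_cpu / softmax_x_ent_gpu routines fill a per-element loss and delta, and the forward pass sums the loss into l.cost[0] whenever ground truth is present. Below is a minimal standalone sketch of that computation, assuming one-hot truth vectors; the *_demo names are illustrative and are not part of the commit.

/* Illustration only (not code from the diff): what the new softmax_x_ent_cpu /
 * softmax_x_ent_gpu functions compute per element, and how the layer turns the
 * per-element loss into l.cost[0]. The *_demo names are hypothetical. */
#include <math.h>

void softmax_x_ent_demo(int n, const float *pred, const float *truth,
                        float *delta, float *error)
{
    int i;
    for (i = 0; i < n; ++i) {
        float t = truth[i];
        float p = pred[i];
        error[i] = t ? -logf(p) : 0;  /* cross-entropy term; nonzero only for the true class */
        delta[i] = t - p;             /* error signal stored for backpropagation */
    }
}

float layer_cost_demo(const float *error, int n)
{
    int i;
    float cost = 0;                   /* mirrors l.cost[0] = sum_array(l.loss, ...) */
    for (i = 0; i < n; ++i) cost += error[i];
    return cost;
}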
src/blas.c  (22 changed lines)

@@ -244,6 +244,28 @@ void l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
     }
 }
 
+void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        float t = truth[i];
+        float p = pred[i];
+        error[i] = (t) ? -log(p) : 0;
+        delta[i] = t-p;
+    }
+}
+
+void logistic_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int i;
+    for(i = 0; i < n; ++i){
+        float t = truth[i];
+        float p = pred[i];
+        error[i] = -t*log(p) - (1-t)*log(1-p);
+        delta[i] = t-p;
+    }
+}
+
 void l2_cpu(int n, float *pred, float *truth, float *delta, float *error)
 {
     int i;
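For intuition: when pred is the softmax of the logits and truth is one-hot, the delta written by softmax_x_ent_cpu above, delta[i] = t - p, is the negative gradient of the cross-entropy loss with respect to the logits, which is why the layer can hand it straight to backpropagation. A small self-contained check (an illustration, not code from the repository) comparing the analytic value against central finite differences:

#include <math.h>
#include <stdio.h>

static void softmax3(const float *z, float *p)        /* plain softmax over 3 logits */
{
    float m = z[0], sum = 0; int i;
    for (i = 1; i < 3; ++i) if (z[i] > m) m = z[i];
    for (i = 0; i < 3; ++i) { p[i] = expf(z[i] - m); sum += p[i]; }
    for (i = 0; i < 3; ++i) p[i] /= sum;
}

static float xent3(const float *p, const float *t)     /* cross-entropy vs. one-hot truth */
{
    float loss = 0; int i;
    for (i = 0; i < 3; ++i) if (t[i]) loss += -logf(p[i]);
    return loss;
}

int main(void)
{
    float z[3] = {0.5f, -1.0f, 2.0f};   /* logits */
    float t[3] = {0.0f,  0.0f, 1.0f};   /* one-hot truth */
    float p[3], eps = 1e-3f;
    int i, j;

    softmax3(z, p);
    for (i = 0; i < 3; ++i) {
        float zp[3], zm[3], pp[3], pm[3], num;
        for (j = 0; j < 3; ++j) { zp[j] = z[j]; zm[j] = z[j]; }
        zp[i] += eps; zm[i] -= eps;
        softmax3(zp, pp); softmax3(zm, pm);
        num = (xent3(pp, t) - xent3(pm, t)) / (2*eps);  /* numeric dL/dz_i */
        printf("i=%d  numeric dL/dz=% .4f  p-t=% .4f  delta(t-p)=% .4f\n",
               i, num, p[i] - t[i], t[i] - p[i]);
    }
    return 0;
}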
src/blas.h

@@ -37,9 +37,12 @@ void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);
 
 void softmax(float *input, int n, float temp, float *output, int stride);
 void upsample_cpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);
+void softmax_cpu(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
+void softmax_x_ent_cpu(int n, float *pred, float *truth, float *delta, float *error);
 
 #ifdef GPU
 #include "cuda.h"
+#include "tree.h"
 
 void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
 void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
@@ -47,6 +50,7 @@ void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
 void copy_ongpu_offset(int N, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
 void scal_ongpu(int N, float ALPHA, float * X, int INCX);
 void supp_ongpu(int N, float ALPHA, float * X, int INCX);
+void mask_gpu_new_api(int N, float * X, float mask_num, float * mask, float val);
 void mask_ongpu(int N, float * X, float mask_num, float * mask);
 void const_ongpu(int N, float ALPHA, float *X, int INCX);
 void pow_ongpu(int N, float ALPHA, float *X, int INCX, float *Y, int INCY);
@@ -71,6 +75,7 @@ void scale_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void add_bias_gpu(float *output, float *biases, int batch, int n, int size);
 void backward_bias_gpu(float *bias_updates, float *delta, int batch, int n, int size);
 
+void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error);
 void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error);
 void l2_gpu(int n, float *pred, float *truth, float *delta, float *error);
 void weighted_delta_gpu(float *a, float *b, float *s, float *da, float *db, float *ds, int num, float *dc);
@@ -79,6 +84,7 @@ void mult_add_into_gpu(int num, float *a, float *b, float *c);
 
 void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
 
+void softmax_gpu_new_api(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output);
 void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output);
 void adam_gpu(int n, float *x, float *m, float *v, float B1, float B2, float rate, float eps, int t);
 void adam_update_gpu(float *w, float *d, float *m, float *v, float B1, float B2, float eps, float decay, float rate, int n, int batch, int t);
@@ -87,5 +93,7 @@ void flatten_ongpu(float *x, int spatial, int layers, int batch, int forward, float *out);
 
 void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out);
 
+void softmax_tree_gpu(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier);
+
 #endif
 #endif
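The new softmax_cpu / softmax_gpu_new_api declarations take a (batch, batch_offset, groups, group_offset, stride) description of the data layout instead of a single flat vector. The CPU implementation itself is not part of this diff, but judging from the GPU kernel added in src/blas_kernels.cu below, the indexing works as in the following sketch (an assumption, with hypothetical *_demo names): every (batch, group) pair gets an independent softmax over n elements spaced stride apart.

#include <math.h>

static void softmax_one_demo(const float *in, int n, float temp, float *out, int stride)
{
    int i;
    float largest = -INFINITY, sum = 0;
    for (i = 0; i < n; ++i)
        if (in[i*stride] > largest) largest = in[i*stride];   /* max-shift for numerical stability */
    for (i = 0; i < n; ++i) {
        float e = expf(in[i*stride]/temp - largest/temp);
        sum += e;
        out[i*stride] = e;
    }
    for (i = 0; i < n; ++i) out[i*stride] /= sum;
}

void softmax_grouped_demo(const float *input, int n, int batch, int batch_offset,
                          int groups, int group_offset, int stride,
                          float temp, float *output)
{
    int b, g;
    for (b = 0; b < batch; ++b)
        for (g = 0; g < groups; ++g)
            softmax_one_demo(input  + b*batch_offset + g*group_offset, n, temp,
                             output + b*batch_offset + g*group_offset, stride);
}

Read this way, the call added later in forward_softmax_layer, softmax_cpu(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output), means: for each batch row of l.inputs values, run l.groups independent softmaxes over contiguous blocks of l.inputs/l.groups elements.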
src/blas_kernels.cu

@@ -7,6 +7,7 @@ extern "C" {
 #include "blas.h"
 #include "cuda.h"
 #include "utils.h"
+#include "tree.h"
 }
 
 __global__ void scale_bias_kernel(float *output, float *biases, int n, int size)
@@ -419,6 +420,12 @@ __global__ void fill_kernel(int N, float ALPHA, float *X, int INCX)
     if(i < N) X[i*INCX] = ALPHA;
 }
 
+__global__ void mask_kernel_new_api(int n, float *x, float mask_num, float *mask, float val)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (i < n && mask[i] == mask_num) x[i] = val;
+}
+
 __global__ void mask_kernel(int n, float *x, float mask_num, float *mask)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -592,6 +599,12 @@ extern "C" void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out)
     check_error(cudaPeekAtLastError());
 }
 
+extern "C" void mask_gpu_new_api(int N, float * X, float mask_num, float * mask, float val)
+{
+    mask_kernel_new_api <<<cuda_gridsize(N), BLOCK >>>(N, X, mask_num, mask, val);
+    check_error(cudaPeekAtLastError());
+}
+
 extern "C" void mask_ongpu(int N, float * X, float mask_num, float * mask)
 {
     mask_kernel<<<cuda_gridsize(N), BLOCK, 0, get_cuda_stream() >>>(N, X, mask_num, mask);
@@ -687,6 +700,23 @@ extern "C" void smooth_l1_gpu(int n, float *pred, float *truth, float *delta, float *error)
     check_error(cudaPeekAtLastError());
 }
 
+__global__ void softmax_x_ent_kernel(int n, float *pred, float *truth, float *delta, float *error)
+{
+    int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (i < n) {
+        float t = truth[i];
+        float p = pred[i];
+        error[i] = (t) ? -log(p) : 0;
+        delta[i] = t - p;
+    }
+}
+
+extern "C" void softmax_x_ent_gpu(int n, float *pred, float *truth, float *delta, float *error)
+{
+    softmax_x_ent_kernel << <cuda_gridsize(n), BLOCK >> >(n, pred, truth, delta, error);
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void l2_kernel(int n, float *pred, float *truth, float *delta, float *error)
 {
     int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
@@ -784,6 +814,40 @@ extern "C" void softmax_gpu(float *input, int n, int offset, int groups, float temp, float *output)
     check_error(cudaPeekAtLastError());
 }
 
+__device__ void softmax_device_new_api(float *input, int n, float temp, int stride, float *output)
+{
+    int i;
+    float sum = 0;
+    float largest = -INFINITY;
+    for (i = 0; i < n; ++i) {
+        int val = input[i*stride];
+        largest = (val>largest) ? val : largest;
+    }
+    for (i = 0; i < n; ++i) {
+        float e = expf(input[i*stride] / temp - largest / temp);
+        sum += e;
+        output[i*stride] = e;
+    }
+    for (i = 0; i < n; ++i) {
+        output[i*stride] /= sum;
+    }
+}
+
+__global__ void softmax_kernel_new_api(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
+{
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (id >= batch*groups) return;
+    int b = id / groups;
+    int g = id % groups;
+    softmax_device_new_api(input + b*batch_offset + g*group_offset, n, temp, stride, output + b*batch_offset + g*group_offset);
+}
+
+extern "C" void softmax_gpu_new_api(float *input, int n, int batch, int batch_offset, int groups, int group_offset, int stride, float temp, float *output)
+{
+    softmax_kernel_new_api << <cuda_gridsize(batch*groups), BLOCK >> >(input, n, batch, batch_offset, groups, group_offset, stride, temp, output);
+    check_error(cudaPeekAtLastError());
+}
+
 __global__ void upsample_kernel(size_t N, float *x, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
 {
@@ -815,3 +879,35 @@ extern "C" void upsample_gpu(float *in, int w, int h, int c, int batch, int stride, int forward, float scale, float *out)
     upsample_kernel << <cuda_gridsize(size), BLOCK >> >(size, in, w, h, c, batch, stride, forward, scale, out);
     check_error(cudaPeekAtLastError());
 }
+
+__global__ void softmax_tree_kernel(float *input, int spatial, int batch, int stride, float temp, float *output, int groups, int *group_size, int *group_offset)
+{
+    int id = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if (id >= spatial*batch*groups) return;
+    int s = id % spatial;
+    id = id / spatial;
+    int g = id % groups;
+    int b = id / groups;
+    int goff = group_offset[g] * spatial;
+    int boff = b*stride;
+    softmax_device_new_api(input + goff + boff + s, group_size[g], temp, spatial, output + goff + boff + s);
+}
+
+extern "C" void softmax_tree_gpu(float *input, int spatial, int batch, int stride, float temp, float *output, tree hier)
+{
+    int *tree_groups_size = cuda_make_int_array_new_api(hier.group_size, hier.groups);
+    int *tree_groups_offset = cuda_make_int_array_new_api(hier.group_offset, hier.groups);
+    /*
+    static int *tree_groups_size = 0;
+    static int *tree_groups_offset = 0;
+    if(!tree_groups_size){
+        tree_groups_size = cuda_make_int_array(hier.group_size, hier.groups);
+        tree_groups_offset = cuda_make_int_array(hier.group_offset, hier.groups);
+    }
+    */
+    int num = spatial*batch*hier.groups;
+    softmax_tree_kernel <<<cuda_gridsize(num), BLOCK >>>(input, spatial, batch, stride, temp, output, hier.groups, tree_groups_size, tree_groups_offset);
+    check_error(cudaPeekAtLastError());
+    cuda_free((float *)tree_groups_size);
+    cuda_free((float *)tree_groups_offset);
+}
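softmax_tree_gpu pushes the hierarchy's group_size and group_offset tables to the GPU with the new cuda_make_int_array_new_api helper, launches one thread per (spatial position, group, batch) triple, and frees the tables again; the commented-out block shows the alternative of caching them in static pointers instead of reallocating per call. On the CPU the equivalent behaviour is the group loop added to forward_softmax_layer further down: each hierarchy group is normalised independently. A rough CPU-side picture, illustrative only and reusing the hypothetical softmax_one_demo helper from the earlier sketch:

/* assumed helper from the earlier sketch */
static void softmax_one_demo(const float *in, int n, float temp, float *out, int stride);

/* Hypothetical CPU analogue of softmax_tree_kernel for a single batch row:
 * the class scores are split into hierarchy groups and each group gets its
 * own softmax, as the group loop in forward_softmax_layer does. */
void softmax_tree_demo(const float *input, int groups, const int *group_size,
                       float temp, float *output)
{
    int g, count = 0;
    for (g = 0; g < groups; ++g) {
        softmax_one_demo(input + count, group_size[g], temp, output + count, 1);
        count += group_size[g];
    }
}

One detail worth noting: softmax_device_new_api in the diff stores each candidate maximum in an int, so the stabilizing max-shift is computed from truncated values; the sketches here keep the running maximum as a float.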
src/cuda.c  (14 changed lines)

@@ -162,6 +162,20 @@ int *cuda_make_int_array(size_t n)
     return x_gpu;
 }
 
+int *cuda_make_int_array_new_api(int *x, size_t n)
+{
+    int *x_gpu;
+    size_t size = sizeof(int)*n;
+    cudaError_t status = cudaMalloc((void **)&x_gpu, size);
+    check_error(status);
+    if (x) {
+        status = cudaMemcpy(x_gpu, x, size, cudaMemcpyHostToDevice);
+        check_error(status);
+    }
+    if (!x_gpu) error("Cuda malloc failed\n");
+    return x_gpu;
+}
+
 void cuda_free(float *x_gpu)
 {
     //cudaStreamSynchronize(get_cuda_stream());
src/cuda.h

@@ -40,6 +40,7 @@ extern "C" {
 cublasHandle_t blas_handle();
 float *cuda_make_array(float *x, size_t n);
 int *cuda_make_int_array(size_t n);
+int *cuda_make_int_array_new_api(int *x, size_t n);
 void cuda_push_array(float *x_gpu, float *x, size_t n);
 YOLODLL_API void cuda_pull_array(float *x_gpu, float *x, size_t n);
 YOLODLL_API void cuda_set_device(int n);
src/layer.h

@@ -83,6 +83,7 @@ struct layer{
     int side;
     int stride;
     int reverse;
+    int spatial;
     int pad;
     int sqrt;
     int flip;
@@ -100,6 +101,7 @@ struct layer{
     float shift;
     float ratio;
     int focal_loss;
+    int noloss;
     int softmax;
     int classes;
     int coords;
@@ -198,6 +200,7 @@ struct layer{
     int * input_sizes;
     float * delta;
     float * output;
+    float * loss;
     float * squared;
     float * norms;
 
@@ -289,6 +292,7 @@ struct layer{
     float * scale_updates_gpu;
 
     float * output_gpu;
+    float * loss_gpu;
     float * delta_gpu;
     float * rand_gpu;
     float * squared_gpu;
src/parser.c

@@ -238,6 +238,11 @@ softmax_layer parse_softmax(list *options, size_params params)
     layer.temperature = option_find_float_quiet(options, "temperature", 1);
     char *tree_file = option_find_str(options, "tree", 0);
     if (tree_file) layer.softmax_tree = read_tree(tree_file);
+    layer.w = params.w;
+    layer.h = params.h;
+    layer.c = params.c;
+    layer.spatial = option_find_float_quiet(options, "spatial", 0);
+    layer.noloss = option_find_int_quiet(options, "noloss", 0);
     return layer;
 }
 
src/softmax_layer.c

@@ -1,36 +1,16 @@
 #include "softmax_layer.h"
 #include "blas.h"
 #include "cuda.h"
+#include "utils.h"
+#include "blas.h"
+
 #include <float.h>
 #include <math.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
 
-softmax_layer make_softmax_layer(int batch, int inputs, int groups)
-{
-    assert(inputs%groups == 0);
-    fprintf(stderr, "softmax %4d\n", inputs);
-    softmax_layer l = {0};
-    l.type = SOFTMAX;
-    l.batch = batch;
-    l.groups = groups;
-    l.inputs = inputs;
-    l.outputs = inputs;
-    l.output = calloc(inputs*batch, sizeof(float));
-    l.delta = calloc(inputs*batch, sizeof(float));
-
-    l.forward = forward_softmax_layer;
-    l.backward = backward_softmax_layer;
-#ifdef GPU
-    l.forward_gpu = forward_softmax_layer_gpu;
-    l.backward_gpu = backward_softmax_layer_gpu;
-
-    l.output_gpu = cuda_make_array(l.output, inputs*batch);
-    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
-#endif
-    return l;
-}
+#define SECRET_NUM -1234
 
 void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output)
 {
@@ -46,26 +26,57 @@ void softmax_tree(float *input, int batch, int inputs, float temp, tree *hierarchy, float *output)
     }
 }
 
-void forward_softmax_layer(const softmax_layer l, network_state state)
-{
-    int b;
-    int inputs = l.inputs / l.groups;
-    int batch = l.batch * l.groups;
-    if(l.softmax_tree){
-        softmax_tree(state.input, batch, inputs, l.temperature, l.softmax_tree, l.output);
-    } else {
-        for(b = 0; b < batch; ++b){
-            softmax(state.input+b*inputs, inputs, l.temperature, l.output+b*inputs, 1);
-        }
-    }
-}
-
-void backward_softmax_layer(const softmax_layer l, network_state state)
-{
-    int i;
-    for(i = 0; i < l.inputs*l.batch; ++i){
-        state.delta[i] += l.delta[i];
-    }
-}
+softmax_layer make_softmax_layer(int batch, int inputs, int groups)
+{
+    assert(inputs%groups == 0);
+    fprintf(stderr, "softmax %4d\n", inputs);
+    softmax_layer l = {0};
+    l.type = SOFTMAX;
+    l.batch = batch;
+    l.groups = groups;
+    l.inputs = inputs;
+    l.outputs = inputs;
+    l.loss = calloc(inputs*batch, sizeof(float));
+    l.output = calloc(inputs*batch, sizeof(float));
+    l.delta = calloc(inputs*batch, sizeof(float));
+    l.cost = calloc(1, sizeof(float));
+
+    l.forward = forward_softmax_layer;
+    l.backward = backward_softmax_layer;
+#ifdef GPU
+    l.forward_gpu = forward_softmax_layer_gpu;
+    l.backward_gpu = backward_softmax_layer_gpu;
+
+    l.output_gpu = cuda_make_array(l.output, inputs*batch);
+    l.loss_gpu = cuda_make_array(l.loss, inputs*batch);
+    l.delta_gpu = cuda_make_array(l.delta, inputs*batch);
+#endif
+    return l;
+}
+
+void forward_softmax_layer(const softmax_layer l, network_state net)
+{
+    if(l.softmax_tree){
+        int i;
+        int count = 0;
+        for (i = 0; i < l.softmax_tree->groups; ++i) {
+            int group_size = l.softmax_tree->group_size[i];
+            softmax_cpu(net.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output + count);
+            count += group_size;
+        }
+    } else {
+        softmax_cpu(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output);
+    }
+
+    if(net.truth && !l.noloss){
+        softmax_x_ent_cpu(l.batch*l.inputs, l.output, net.truth, l.delta, l.loss);
+        l.cost[0] = sum_array(l.loss, l.batch*l.inputs);
+    }
+}
+
+void backward_softmax_layer(const softmax_layer l, network_state net)
+{
+    axpy_cpu(l.inputs*l.batch, 1, l.delta, 1, net.delta, 1);
+}
 
 #ifdef GPU
@@ -75,26 +86,40 @@ void pull_softmax_layer_output(const softmax_layer layer)
     cuda_pull_array(layer.output_gpu, layer.output, layer.inputs*layer.batch);
 }
 
-void forward_softmax_layer_gpu(const softmax_layer l, network_state state)
+void forward_softmax_layer_gpu(const softmax_layer l, network_state net)
 {
-    int inputs = l.inputs / l.groups;
-    int batch = l.batch * l.groups;
     if(l.softmax_tree){
+        softmax_tree_gpu(net.input, 1, l.batch, l.inputs, l.temperature, l.output_gpu, *l.softmax_tree);
+        /*
         int i;
         int count = 0;
         for (i = 0; i < l.softmax_tree->groups; ++i) {
             int group_size = l.softmax_tree->group_size[i];
-            softmax_gpu(state.input+count, group_size, inputs, batch, l.temperature, l.output_gpu + count);
+            softmax_gpu(net.input_gpu + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count);
             count += group_size;
         }
+        */
     } else {
-        softmax_gpu(state.input, inputs, inputs, batch, l.temperature, l.output_gpu);
+        if(l.spatial){
+            softmax_gpu_new_api(net.input, l.c, l.batch*l.c, l.inputs/l.c, l.w*l.h, 1, l.w*l.h, 1, l.output_gpu);
+        }else{
+            softmax_gpu_new_api(net.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu);
+        }
+    }
+    if(net.truth && !l.noloss){
+        softmax_x_ent_gpu(l.batch*l.inputs, l.output_gpu, net.truth, l.delta_gpu, l.loss_gpu);
+        if(l.softmax_tree){
+            mask_gpu_new_api(l.batch*l.inputs, l.delta_gpu, SECRET_NUM, net.truth, 0);
+            mask_gpu_new_api(l.batch*l.inputs, l.loss_gpu, SECRET_NUM, net.truth, 0);
+        }
+        cuda_pull_array(l.loss_gpu, l.loss, l.batch*l.inputs);
+        l.cost[0] = sum_array(l.loss, l.batch*l.inputs);
     }
 }
 
-void backward_softmax_layer_gpu(const softmax_layer layer, network_state state)
+void backward_softmax_layer_gpu(const softmax_layer layer, network_state net)
 {
-    axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, state.delta, 1);
+    axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, net.delta, 1);
 }
 
 #endif