From d8adaf8ea6a31a380f6bf1fe65e88b661d3bb51e Mon Sep 17 00:00:00 2001
From: Joseph Redmon
Date: Fri, 21 Oct 2016 13:16:43 -0700
Subject: [PATCH] tree stuff

---
 Makefile                     |  4 +-
 src/blas.c                   | 19 ++++++++
 src/blas.h                   |  6 +++
 src/blas_kernels.cu          | 30 +++++++++++++
 src/classifier.c             | 44 ++++++++++++-----
 src/data.c                   | 73 +++++++++++++++++++++--------
 src/data.h                   |  4 +-
 src/detection_layer.c        |  2 +-
 src/layer.h                  |  3 ++
 src/network.c                |  1 -
 src/network.h                |  2 +
 src/parser.c                 |  3 ++
 src/region_layer.c           |  3 +-
 src/softmax_layer.c          | 85 +++++++++++++++++++++++++---------
 src/softmax_layer_kernels.cu | 70 ----------------------------
 src/tree.c                   | 49 +++++++++++++++++++
 src/tree.h                   | 16 +++++++
 17 files changed, 287 insertions(+), 127 deletions(-)
 delete mode 100644 src/softmax_layer_kernels.cu
 create mode 100644 src/tree.c
 create mode 100644 src/tree.h

diff --git a/Makefile b/Makefile
index 0a48e550..ca358bfd 100644
--- a/Makefile
+++ b/Makefile
@@ -41,10 +41,10 @@ CFLAGS+= -DCUDNN
 LDFLAGS+= -lcudnn
 endif
 
-OBJ=gemm.o utils.o cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o super.o voxel.o
+OBJ=gemm.o utils.o cuda.o convolutional_layer.o list.o image.o activations.o im2col.o col2im.o blas.o crop_layer.o dropout_layer.o maxpool_layer.o softmax_layer.o data.o matrix.o network.o connected_layer.o cost_layer.o parser.o option_list.o darknet.o detection_layer.o captcha.o route_layer.o writing.o box.o nightmare.o normalization_layer.o avgpool_layer.o coco.o dice.o yolo.o detector.o layer.o compare.o classifier.o local_layer.o swag.o shortcut_layer.o activation_layer.o rnn_layer.o gru_layer.o rnn.o rnn_vid.o crnn_layer.o demo.o tag.o cifar.o go.o batchnorm_layer.o art.o region_layer.o reorg_layer.o super.o voxel.o tree.o
 ifeq ($(GPU), 1) 
 LDFLAGS+= -lstdc++ 
-OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o softmax_layer_kernels.o network_kernels.o avgpool_layer_kernels.o
+OBJ+=convolutional_kernels.o activation_kernels.o im2col_kernels.o col2im_kernels.o blas_kernels.o crop_layer_kernels.o dropout_layer_kernels.o maxpool_layer_kernels.o network_kernels.o avgpool_layer_kernels.o
 endif
 
 OBJS = $(addprefix $(OBJDIR), $(OBJ))
diff --git a/src/blas.c b/src/blas.c
index 9d42562b..d6ab88bd 100644
--- a/src/blas.c
+++ b/src/blas.c
@@ -1,6 +1,7 @@
 #include "blas.h"
 #include "math.h"
 #include <assert.h>
+#include <float.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -179,3 +180,21 @@ float dot_cpu(int N, float *X, int INCX, float *Y, int INCY)
     return dot;
 }
 
+void softmax(float *input, int n, float temp, float *output)
+{
+    int i;
+    float sum = 0;
+    float largest = -FLT_MAX;
+    for(i = 0; i < n; ++i){
+        if(input[i] > largest) largest = input[i];
+    }
+    for(i = 0; i < n; ++i){
+        sum += exp(input[i]/temp-largest/temp);
+    }
+    if(sum) sum = largest/temp+log(sum);
+    else sum = largest-100;
+    for(i = 0; i < n; ++i){
+        output[i] = exp(input[i]/temp-sum);
+    }
+}
+
diff --git a/src/blas.h b/src/blas.h
index 95374c95..6b6b8f5b 100644
--- a/src/blas.h
+++ b/src/blas.h
@@ -34,7 +34,11 @@ void smooth_l1_cpu(int n, float *pred, float *truth, float *delta, float *error)
 void l2_cpu(int n, float *pred, float *truth, float *delta, float *error);
 void weighted_sum_cpu(float *a, float *b, float *s, int num, float *c);
 
+void softmax(float *input, int n, float temp, float *output);
+
 #ifdef GPU
+#include "cuda.h"
+
 void axpy_ongpu(int N, float ALPHA, float * X, int INCX, float * Y, int INCY);
 void axpy_ongpu_offset(int N, float ALPHA, float * X, int OFFX, int INCX, float * Y, int OFFY, int INCY);
 void copy_ongpu(int N, float * X, int INCX, float * Y, int INCY);
@@ -73,5 +77,7 @@ void mult_add_into_gpu(int num, float *a, float *b, float *c);
 
 void reorg_ongpu(float *x, int w, int h, int c, int batch, int stride, int forward, float *out);
 
+void softmax_gpu(float *input, int n, int groups, float temp, float *output, cudaStream_t stream);
+
 #endif
 #endif
diff --git a/src/blas_kernels.cu b/src/blas_kernels.cu
index 0391e2e5..59ec0057 100644
--- a/src/blas_kernels.cu
+++ b/src/blas_kernels.cu
@@ -691,3 +691,33 @@ extern "C" void mult_add_into_gpu(int num, float *a, float *b, float *c)
     mult_add_into_kernel<<<cuda_gridsize(num), BLOCK>>>(num, a, b, c);
     check_error(cudaPeekAtLastError());
 }
+
+
+__global__ void softmax_kernel(int n, int batch, float *input, float temp, float *output)
+{
+    int b = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
+    if(b >= batch) return;
+
+    int i;
+    float sum = 0;
+    float largest = -INFINITY;
+    for(i = 0; i < n; ++i){
+        float val = input[i+b*n];
+        largest = (val>largest) ? val : largest;
+    }
+    for(i = 0; i < n; ++i){
+        sum += exp(input[i+b*n]/temp-largest/temp);
+    }
+    sum = (sum != 0) ? largest/temp+log(sum) : largest-100;
+    for(i = 0; i < n; ++i){
+        output[i+b*n] = exp(input[i+b*n]/temp-sum);
+    }
+}
+
+extern "C" void softmax_gpu(float *input, int n, int groups, float temp, float *output, cudaStream_t stream)
+{
+    int inputs = n;
+    int batch = groups;
+    softmax_kernel<<<cuda_gridsize(batch), BLOCK, 0, stream>>>(inputs, batch, input, temp, output);
+    check_error(cudaPeekAtLastError());
+}
diff --git a/src/classifier.c b/src/classifier.c
index 208b7ed4..e588af56 100644
--- a/src/classifier.c
+++ b/src/classifier.c
@@ -41,6 +41,20 @@ list *read_data_cfg(char *filename)
     return options;
 }
 
+void hierarchy_predictions(float *predictions, int n, tree *hier)
+{
+    int j;
+    for(j = 0; j < n; ++j){
+        int parent = hier->parent[j];
+        if(parent >= 0){
+            predictions[j] *= predictions[parent];
+        }
+    }
+    for(j = 0; j < n; ++j){
+        if(!hier->leaf[j]) predictions[j] = 0;
+    }
+}
+
 float *get_regression_values(char **labels, int n)
 {
     float *v = calloc(n, sizeof(float));
@@ -99,7 +113,8 @@ void train_classifier_multi(char *datacfg, char *cfgfile, char *weightfile, int 
     load_args args = {0};
     args.w = net.w;
     args.h = net.h;
-    args.threads = 16;
+    args.threads = 32;
+    args.hierarchy = net.hierarchy;
 
     args.min = net.min_crop;
     args.max = net.max_crop;
@@ -206,6 +221,7 @@ void train_classifier(char *datacfg, char *cfgfile, char *weightfile, int clear)
     args.saturation = net.saturation;
     args.hue = net.hue;
     args.size = net.w;
+    args.hierarchy = net.hierarchy;
 
     args.paths = paths;
     args.classes = classes;
@@ -394,6 +410,7 @@ void validate_classifier_10(char *datacfg, char *filename, char *weightfile)
         float *pred = calloc(classes, sizeof(float));
         for(j = 0; j < 10; ++j){
             float *p = network_predict(net, images[j].data);
+            if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy);
             axpy_cpu(classes, 1, p, 1, pred, 1);
             free_image(images[j]);
         }
@@ -454,6 +471,7 @@ void validate_classifier_full(char *datacfg, char *filename, char *weightfile)
         //show_image(crop, "cropped");
         //cvWaitKey(0);
         float *pred = network_predict(net, resized.data);
+        if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy);
 
         free_image(im);
         free_image(resized);
@@ -513,6 +531,7 @@ void validate_classifier_single(char *datacfg, char *filename, char *weightfile)
         //show_image(crop, "cropped");
         //cvWaitKey(0);
         float *pred = network_predict(net, crop.data);
+        if(net.hierarchy) hierarchy_predictions(pred, net.outputs, net.hierarchy);
 
         if(resized.data != im.data) free_image(resized);
         free_image(im);
@@ -573,6 +592,7 @@ void validate_classifier_multi(char *datacfg, char *filename, char *weightfile)
             image r = resize_min(im, scales[j]);
             resize_network(&net, r.w, r.h);
             float *p = network_predict(net, r.data);
+            if(net.hierarchy) hierarchy_predictions(p, net.outputs, net.hierarchy);
             axpy_cpu(classes, 1, p, 1, pred, 1);
             flip_image(r);
             p = network_predict(net, r.data);
@@ -672,7 +692,6 @@ void try_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filena
         }
     }
 }
 
-
 void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *filename)
 {
     network net = parse_network_cfg(cfgfile);
@@ -713,11 +732,13 @@ void predict_classifier(char *datacfg, char *cfgfile, char *weightfile, char *fi
         float *X = r.data;
         time=clock();
         float *predictions = network_predict(net, X);
-        top_predictions(net, top, indexes);
+        if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy);
+        top_k(predictions, net.outputs, top, indexes);
         printf("%s: Predicted in %f seconds.\n", input, sec(clock()-time));
         for(i = 0; i < top; ++i){
             int index = indexes[i];
-            printf("%s: %f\n", names[index], predictions[index]);
+            if(net.hierarchy) printf("%d, %s: %f, parent: %s \n", index, names[index], predictions[index], (net.hierarchy->parent[index] >= 0) ? names[net.hierarchy->parent[index]] : "Root");
+            else printf("%s: %f\n", names[index], predictions[index]);
         }
         if(r.data != im.data) free_image(r);
         free_image(im);
@@ -899,15 +920,15 @@ void threat_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_i
         float curr_threat = 0;
         if(1){
             curr_threat = predictions[0] * 0 +
-                predictions[1] * .6 +
-                predictions[2];
+                          predictions[1] * .6 +
+                          predictions[2];
         } else {
             curr_threat = predictions[218] +
-                predictions[539] +
-                predictions[540] +
-                predictions[368] +
-                predictions[369] +
-                predictions[370];
+                          predictions[539] +
+                          predictions[540] +
+                          predictions[368] +
+                          predictions[369] +
+                          predictions[370];
         }
         threat = roll * curr_threat + (1-roll) * threat;
@@ -1092,6 +1113,7 @@ void demo_classifier(char *datacfg, char *cfgfile, char *weightfile, int cam_ind
         show_image(in, "Classifier");
 
         float *predictions = network_predict(net, in_s.data);
+        if(net.hierarchy) hierarchy_predictions(predictions, net.outputs, net.hierarchy);
         top_predictions(net, top, indexes);
 
         printf("\033[2J");
diff --git a/src/data.c b/src/data.c
index 20d57481..a2390a9b 100644
--- a/src/data.c
+++ b/src/data.c
@@ -388,12 +388,47 @@ void fill_truth(char *path, char **labels, int k, float *truth)
     if(count != 1) printf("Too many or too few labels: %d, %s\n", count, path);
 }
 
-matrix load_labels_paths(char **paths, int n, char **labels, int k)
+void fill_hierarchy(float *truth, int k, tree *hierarchy)
+{
+    int j;
+    for(j = 0; j < k; ++j){
+        if(truth[j]){
+            int parent = hierarchy->parent[j];
+            while(parent >= 0){
+                truth[parent] = 1;
+                parent = hierarchy->parent[parent];
+            }
+        }
+    }
+    int i;
+    int count = 0;
+    for(j = 0; j < hierarchy->groups; ++j){
+        //printf("%d\n", count);
+        int mask = 1;
+        for(i = 0; i < hierarchy->group_size[j]; ++i){
+            if(truth[count + i]){
+                mask = 0;
+                break;
+            }
+        }
+        if (mask) {
+            for(i = 0; i < hierarchy->group_size[j]; ++i){
+                truth[count + i] = SECRET_NUM;
+            }
+        }
+        count += hierarchy->group_size[j];
+    }
+}
+
+matrix load_labels_paths(char **paths, int n, char **labels, int k, tree *hierarchy)
 {
     matrix y = make_matrix(n, k);
     int i;
     for(i = 0; i < n && labels; ++i){
         fill_truth(paths[i], labels, k, y.vals[i]);
+        if(hierarchy){
+            fill_hierarchy(y.vals[i], k, hierarchy);
+        }
     }
     return y;
 }
@@ -540,7 +575,7 @@ data load_data_compare(int n, char **paths, int m, int classes, int w, int h)
             while(fscanf(fp2, "%d %f", &id, &iou) == 2){
                 if (d.y.vals[i][2*id + 1] < iou) d.y.vals[i][2*id + 1] = iou;
             }
-            
+
             for (j = 0; j < classes; ++j){
                 if (d.y.vals[i][2*j] > .5 &&  d.y.vals[i][2*j+1] < .5){
                     d.y.vals[i][2*j] = 1;
@@ -567,7 +602,7 @@ data load_data_swag(char **paths, int n, int classes, float jitter)
 {
     int index = rand()%n;
     char *random_path = paths[index];
-    
+
     image orig = load_image_color(random_path, 0, 0);
     int h = orig.h;
     int w = orig.w;
@@ -680,7 +715,7 @@ void *load_thread(void *ptr)
     if (a.type == OLD_CLASSIFICATION_DATA){
         *a.d = load_data_old(a.paths, a.n, a.m, a.labels, a.classes, a.w, a.h);
     } else if (a.type == CLASSIFICATION_DATA){
-        *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
+        *a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
     } else if (a.type == SUPER_DATA){
         *a.d = load_data_super(a.paths, a.n, a.m, a.w, a.h, a.scale);
     } else if (a.type == WRITING_DATA){
@@ -771,24 +806,24 @@ data load_data_old(char **paths, int n, int m, char **labels, int k, int w, int 
     data d = {0};
     d.shallow = 0;
     d.X = load_image_paths(paths, n, w, h);
-    d.y = load_labels_paths(paths, n, labels, k);
+    d.y = load_labels_paths(paths, n, labels, k, 0);
     if(m) free(paths);
     return d;
 }
 
 /*
-data load_data_study(char **paths, int n, int m, char **labels, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
-{
-    data d = {0};
-    d.indexes = calloc(n, sizeof(int));
-    if(m) paths = get_random_paths_indexes(paths, n, m, d.indexes);
-    d.shallow = 0;
-    d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure);
-    d.y = load_labels_paths(paths, n, labels, k);
-    if(m) free(paths);
-    return d;
-}
-*/
+   data load_data_study(char **paths, int n, int m, char **labels, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
+   {
+       data d = {0};
+       d.indexes = calloc(n, sizeof(int));
+       if(m) paths = get_random_paths_indexes(paths, n, m, d.indexes);
+       d.shallow = 0;
+       d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure);
+       d.y = load_labels_paths(paths, n, labels, k);
+       if(m) free(paths);
+       return d;
+   }
+ */
 
 data load_data_super(char **paths, int n, int m, int w, int h, int scale)
 {
@@ -820,13 +855,13 @@ data load_data_super(char **paths, int n, int m, int w, int h, int scale)
     return d;
 }
 
-data load_data_augment(char **paths, int n, int m, char **labels, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
+data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure)
 {
     if(m) paths = get_random_paths(paths, n, m);
     data d = {0};
     d.shallow = 0;
     d.X = load_image_augment_paths(paths, n, min, max, size, angle, aspect, hue, saturation, exposure);
-    d.y = load_labels_paths(paths, n, labels, k);
+    d.y = load_labels_paths(paths, n, labels, k, hierarchy);
     if(m) free(paths);
     return d;
 }
diff --git a/src/data.h b/src/data.h
index c24201dd..3f6ef610 100644
--- a/src/data.h
+++ b/src/data.h
@@ -5,6 +5,7 @@
 #include "matrix.h"
 #include "list.h"
 #include "image.h"
+#include "tree.h"
 
 static inline float distance_from_edge(int x, int max)
 {
@@ -58,6 +59,7 @@ typedef struct load_args{
     image *im;
     image *resized;
     data_type type;
+    tree *hierarchy;
 } load_args;
 
 typedef struct{
@@ -80,7 +82,7 @@ data load_data_detection(int n, char **paths, int m, int w, int h, int boxes, in
 data load_data_tag(char **paths, int n, int m, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
 matrix load_image_augment_paths(char **paths, int n, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
 data load_data_super(char **paths, int n, int m, int w, int h, int scale);
-data load_data_augment(char **paths, int n, int m, char **labels, int k, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
+data load_data_augment(char **paths, int n, int m, char **labels, int k, tree *hierarchy, int min, int max, int size, float angle, float aspect, float hue, float saturation, float exposure);
 data load_go(char *filename);
 
 box_label *read_boxes(char *filename, int *n);
diff --git a/src/detection_layer.c b/src/detection_layer.c
index 6ee7f648..cd98b4b4 100644
--- a/src/detection_layer.c
+++ b/src/detection_layer.c
@@ -58,7 +58,7 @@ void forward_detection_layer(const detection_layer l, network_state state)
             int index = b*l.inputs;
             for (i = 0; i < locations; ++i) {
                 int offset = i*l.classes;
-                softmax_array(l.output + index + offset, l.classes, 1,
+                softmax(l.output + index + offset, l.classes, 1,
                         l.output + index + offset);
             }
         }
diff --git a/src/layer.h b/src/layer.h
index ea6862b8..341e58af 100644
--- a/src/layer.h
+++ b/src/layer.h
@@ -3,6 +3,7 @@
 
 #include "activations.h"
 #include "stddef.h"
+#include "tree.h"
 
 struct network_state;
 
@@ -93,6 +94,8 @@ struct layer{
     int reorg;
     int log;
 
+    tree *softmax_tree;
+
     float alpha;
     float beta;
     float kappa;
diff --git a/src/network.c b/src/network.c
index 01b79622..8d46c55b 100644
--- a/src/network.c
+++ b/src/network.c
@@ -565,7 +565,6 @@ float *network_accuracies(network net, data d, int n)
     return acc;
 }
 
-
 float network_accuracy_multi(network net, data d, int n)
 {
     matrix guess = network_predict_data_multi(net, d, n);
diff --git a/src/network.h b/src/network.h
index 4f9ba750..67f93f79 100644
--- a/src/network.h
+++ b/src/network.h
@@ -5,6 +5,7 @@
 #include "image.h"
 #include "layer.h"
 #include "data.h"
+#include "tree.h"
 
 typedef enum {
     CONSTANT, STEP, EXP, POLY, STEPS, SIG, RANDOM
@@ -47,6 +48,7 @@ typedef struct network{
     float hue;
 
     int gpu_index;
+    tree *hierarchy;
 
 #ifdef GPU
     float **input_gpu;
diff --git a/src/parser.c b/src/parser.c
index a27d2459..e04c6c29 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -221,6 +221,8 @@ softmax_layer parse_softmax(list *options, size_params params)
     int groups = option_find_int_quiet(options, "groups",1);
     softmax_layer layer = make_softmax_layer(params.batch, params.inputs, groups);
     layer.temperature = option_find_float_quiet(options, "temperature", 1);
+    char *tree_file = option_find_str(options, "tree", 0);
+    if (tree_file) layer.softmax_tree = read_tree(tree_file);
     return layer;
 }
 
@@ -598,6 +600,7 @@ network parse_network_cfg(char *filename)
             l = parse_detection(options, params);
         }else if(lt == SOFTMAX){
             l = parse_softmax(options, params);
+            net.hierarchy = l.softmax_tree;
         }else if(lt == NORMALIZATION){
             l = parse_normalization(options, params);
         }else if(lt == BATCHNORM){
diff --git a/src/region_layer.c b/src/region_layer.c
index bc3acaae..5f8b3cc9 100644
--- a/src/region_layer.c
+++ b/src/region_layer.c
@@ -1,6 +1,5 @@
 #include "region_layer.h"
 #include "activations.h"
-#include "softmax_layer.h"
 #include "blas.h"
 #include "box.h"
 #include "cuda.h"
@@ -99,7 +98,7 @@ void forward_region_layer(const region_layer l, network_state state)
             int index = size*i + b*l.outputs;
             l.output[index + 4] = logistic_activate(l.output[index + 4]);
             if(l.softmax){
-                softmax_array(l.output + index + 5, l.classes, 1, l.output + index + 5);
+                softmax(l.output + index + 5, l.classes, 1, l.output + index + 5);
             }
         }
     }
diff --git a/src/softmax_layer.c b/src/softmax_layer.c
index 20bc07f3..2a34caea 100644
--- a/src/softmax_layer.c
+++ b/src/softmax_layer.c
@@ -32,31 +32,25 @@ softmax_layer make_softmax_layer(int batch, int inputs, int groups)
     return l;
 }
 
-void softmax_array(float *input, int n, float temp, float *output)
-{
-    int i;
-    float sum = 0;
-    float largest = -FLT_MAX;
-    for(i = 0; i < n; ++i){
-        if(input[i] > largest) largest = input[i];
-    }
-    for(i = 0; i < n; ++i){
-        sum += exp(input[i]/temp-largest/temp);
-    }
-    if(sum) sum = largest/temp+log(sum);
-    else sum = largest-100;
-    for(i = 0; i < n; ++i){
-        output[i] = exp(input[i]/temp-sum);
-    }
-}
-
 void forward_softmax_layer(const softmax_layer l, network_state state)
 {
     int b;
     int inputs = l.inputs / l.groups;
     int batch = l.batch * l.groups;
-    for(b = 0; b < batch; ++b){
-        softmax_array(state.input+b*inputs, inputs, l.temperature, l.output+b*inputs);
+    if(l.softmax_tree){
+        for(b = 0; b < batch; ++b){
+            int i;
+            int count = 0;
+            for(i = 0; i < l.softmax_tree->groups; ++i){
+                int group_size = l.softmax_tree->group_size[i];
+                softmax(state.input+b*inputs + count, group_size, l.temperature, l.output+b*inputs + count);
+                count += group_size;
+            }
+        }
+    } else {
+        for(b = 0; b < batch; ++b){
+            softmax(state.input+b*inputs, inputs, l.temperature, l.output+b*inputs);
+        }
     }
 }
 
@@ -68,3 +62,54 @@ void backward_softmax_layer(const softmax_layer l, network_state state)
     }
 }
 
+#ifdef GPU
+
+void pull_softmax_layer_output(const softmax_layer layer)
+{
+    cuda_pull_array(layer.output_gpu, layer.output, layer.inputs*layer.batch);
+}
+
+void forward_softmax_layer_gpu(const softmax_layer l, network_state state)
+{
+    int inputs = l.inputs / l.groups;
+    int batch = l.batch * l.groups;
+    int b;
+    if(l.softmax_tree){
+        if(0){
+            float *buff = calloc(inputs * batch, sizeof(float));
+            cuda_pull_array(state.input, buff, batch * inputs);
+            state.input = buff;
+            forward_softmax_layer(l, state);
+            cuda_push_array(l.output_gpu, l.output, batch*inputs);
+            free(buff);
+        } else {
+            int i;
+            const int nstreams = 32;
+            cudaStream_t streams[nstreams];
+            for (i = 0; i < nstreams; ++i) {
+                cudaStreamCreate(&streams[i]);
+            }
+            for (b = 0; b < batch; ++b) {
+                int i;
+                int count = 0;
+                for (i = 0; i < l.softmax_tree->groups; ++i) {
+                    int group_size = l.softmax_tree->group_size[i];
+                    softmax_gpu(state.input+b*inputs + count, group_size, 1, l.temperature, l.output_gpu+b*inputs + count, streams[(b*l.softmax_tree->groups + i) % nstreams]);
+                    count += group_size;
+                }
+            }
+            for(i = 0; i < nstreams; ++i){
+                cudaStreamDestroy(streams[i]);
+            }
+        }
+    } else {
+        softmax_gpu(state.input, inputs, batch, l.temperature, l.output_gpu, 0);
+    }
+}
+
+void backward_softmax_layer_gpu(const softmax_layer layer, network_state state)
+{
+    axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, state.delta, 1);
+}
+
+#endif
diff --git a/src/softmax_layer_kernels.cu b/src/softmax_layer_kernels.cu
deleted file mode 100644
index 8feaf89b..00000000
--- a/src/softmax_layer_kernels.cu
+++ /dev/null
@@ -1,70 +0,0 @@
-#include "cuda_runtime.h"
-#include "curand.h"
-#include "cublas_v2.h"
-
-extern "C" {
-#include "softmax_layer.h"
-#include "cuda.h"
-#include "blas.h"
-}
-
-__global__ void forward_softmax_layer_kernel(int n, int batch, float *input, float temp, float *output)
-{
-    int b = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
-    if(b >= batch) return;
-
-    int i;
-    float sum = 0;
-    float largest = -INFINITY;
-    for(i = 0; i < n; ++i){
-        int val = input[i+b*n];
-        largest = (val>largest) ? val : largest;
-    }
-    for(i = 0; i < n; ++i){
-        sum += exp(input[i+b*n]/temp-largest/temp);
-    }
-    sum = (sum != 0) ? largest/temp+log(sum) : largest-100;
-    for(i = 0; i < n; ++i){
-        output[i+b*n] = exp(input[i+b*n]/temp-sum);
-    }
-}
-
-extern "C" void pull_softmax_layer_output(const softmax_layer layer)
-{
-    cuda_pull_array(layer.output_gpu, layer.output, layer.inputs*layer.batch);
-}
-
-extern "C" void forward_softmax_layer_gpu(const softmax_layer layer, network_state state)
-{
-    int inputs = layer.inputs / layer.groups;
-    int batch = layer.batch * layer.groups;
-    forward_softmax_layer_kernel<<<cuda_gridsize(batch), BLOCK>>>(inputs, batch, state.input, layer.temperature, layer.output_gpu);
-    check_error(cudaPeekAtLastError());
-}
-
-extern "C" void backward_softmax_layer_gpu(const softmax_layer layer, network_state state)
-{
-    axpy_ongpu(layer.batch*layer.inputs, 1, layer.delta_gpu, 1, state.delta, 1);
-}
-
-/* This is if you want softmax w/o log-loss classification. You probably don't.
-   int i,j,b;
-   for(b = 0; b < layer.batch; ++b){
-       for(i = 0; i < layer.inputs; ++i){
-           for(j = 0; j < layer.inputs; ++j){
-               int d = (i==j);
-               layer.jacobian[b*layer.inputs*layer.inputs + i*layer.inputs + j] =
-                   layer.output[b*layer.inputs + i] * (d - layer.output[b*layer.inputs + j]);
-           }
-       }
-   }
-   for(b = 0; b < layer.batch; ++b){
-       int M = layer.inputs;
-       int N = 1;
-       int K = layer.inputs;
-       float *A = layer.jacobian + b*layer.inputs*layer.inputs;
-       float *B = layer.delta + b*layer.inputs;
-       float *C = delta + b*layer.inputs;
-       gemm(0,0,M,N,K,1,A,K,B,N,0,C,N);
-   }
- */
diff --git a/src/tree.c b/src/tree.c
new file mode 100644
index 00000000..5a758f78
--- /dev/null
+++ b/src/tree.c
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "tree.h"
+#include "utils.h"
+
+tree *read_tree(char *filename)
+{
+    tree t = {0};
+    FILE *fp = fopen(filename, "r");
+
+    char *line;
+    int last_parent = -1;
+    int group_size = 0;
+    int groups = 0;
+    int n = 0;
+    while((line=fgetl(fp)) != 0){
+        char *id = calloc(256, sizeof(char));
+        int parent = -1;
+        sscanf(line, "%s %d", id, &parent);
+        t.parent = realloc(t.parent, (n+1)*sizeof(int));
+        t.parent[n] = parent;
+        t.name = realloc(t.name, (n+1)*sizeof(char *));
+        t.name[n] = id;
+        if(parent != last_parent){
+            ++groups;
+            t.group_size = realloc(t.group_size, groups * sizeof(int));
+            t.group_size[groups - 1] = group_size;
+            group_size = 0;
+            last_parent = parent;
+        }
+        ++n;
+        ++group_size;
+    }
+    ++groups;
+    t.group_size = realloc(t.group_size, groups * sizeof(int));
+    t.group_size[groups - 1] = group_size;
+    t.n = n;
+    t.groups = groups;
+    t.leaf = calloc(n, sizeof(int));
+    int i;
+    for(i = 0; i < n; ++i) t.leaf[i] = 1;
+    for(i = 0; i < n; ++i) if(t.parent[i] >= 0) t.leaf[t.parent[i]] = 0;
+
+    fclose(fp);
+    tree *tree_ptr = calloc(1, sizeof(tree));
+    *tree_ptr = t;
+    //error(0);
+    return tree_ptr;
+}
diff --git a/src/tree.h b/src/tree.h
new file mode 100644
index 00000000..c713866d
--- /dev/null
+++ b/src/tree.h
@@ -0,0 +1,16 @@
+#ifndef TREE_H
+#define TREE_H
+
+typedef struct{
+    int *leaf;
+    int n;
+    int *parent;
+    char **name;
+
+    int groups;
+    int *group_size;
+} tree;
+
+tree *read_tree(char *filename);
+
+#endif
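
Editor's sketch (not part of the commit): the standalone C program below hand-builds the arrays that read_tree() would produce for an invented five-node tree, runs the per-group softmax that forward_softmax_layer() performs when l.softmax_tree is set, and then folds the group-wise conditionals into leaf probabilities the same way hierarchy_predictions() in classifier.c does. The tree, its node names, and the logits are all made up for illustration. Build with: cc tree_demo.c -o tree_demo -lm

/* The tree file read_tree() would parse for this example (one
 * "name parent-index" pair per line; siblings are contiguous):
 *   animal -1
 *   plant -1
 *   dog 0
 *   cat 0
 *   fern 1
 * Groups: {animal, plant}, {dog, cat}, {fern}.
 */
#include <math.h>
#include <stdio.h>

/* Numerically equivalent to the new softmax() in blas.c, written in the
 * factored-out-max form instead of the log-sum form. */
static void softmax(float *input, int n, float temp, float *output)
{
    int i;
    float largest = input[0];
    for(i = 1; i < n; ++i) if(input[i] > largest) largest = input[i];
    float sum = 0;
    for(i = 0; i < n; ++i) sum += expf((input[i] - largest)/temp);
    for(i = 0; i < n; ++i) output[i] = expf((input[i] - largest)/temp)/sum;
}

int main(void)
{
    /* What read_tree() would build from the file above. */
    const char *name[5] = {"animal", "plant", "dog", "cat", "fern"};
    int parent[5]     = {-1, -1, 0, 0, 1};
    int leaf[5]       = { 0,  0, 1, 1, 1};
    int group_size[3] = { 2,  2, 1};

    float logits[5] = {1.0f, 0.2f, 2.0f, 0.5f, 0.0f};  /* invented values */
    float p[5];

    /* forward_softmax_layer() with a softmax_tree: one softmax per
     * sibling group, so p[] holds conditionals P(node | parent). */
    int g, count = 0;
    for(g = 0; g < 3; ++g){
        softmax(logits + count, group_size[g], 1, p + count);
        count += group_size[g];
    }

    /* hierarchy_predictions(): parents precede children in the tree file,
     * so one in-order pass turns conditionals into joint probabilities;
     * inner nodes are then zeroed so only leaves carry final scores. */
    int j;
    for(j = 0; j < 5; ++j) if(parent[j] >= 0) p[j] *= p[parent[j]];
    for(j = 0; j < 5; ++j) if(!leaf[j]) p[j] = 0;

    for(j = 0; j < 5; ++j) printf("%-6s %f\n", name[j], p[j]);
    return 0;
}

Because every parent index is smaller than its children's indices, the single in-order pass is sufficient, and the leaf probabilities always sum to 1 (dog + cat take animal's mass, fern takes plant's).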