Skip to content

Commit

Permalink
upgrading Accuracy layer: (1) efficient CPU implementation O(L) for t…
Browse files Browse the repository at this point in the history
…op_k, no need for fancy priority_queue etc. (2) GPU implementation
  • Loading branch information
Shai authored and shaibagon committed Oct 10, 2017
1 parent fac7434 commit 62e0c85
Show file tree
Hide file tree
Showing 4 changed files with 364 additions and 180 deletions.
4 changes: 4 additions & 0 deletions include/caffe/layers/accuracy_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class AccuracyLayer : public Layer<Dtype> {
*/
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
virtual void Forward_gpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);


/// @brief Not implemented -- AccuracyLayer cannot be used as a loss.
Expand All @@ -77,6 +79,8 @@ class AccuracyLayer : public Layer<Dtype> {
if (propagate_down[i]) { NOT_IMPLEMENTED; }
}
}
virtual void Backward_gpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

int label_axis_, outer_num_, inner_num_;

Expand Down
33 changes: 16 additions & 17 deletions src/caffe/layers/accuracy_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,6 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const Dtype* bottom_label = bottom[1]->cpu_data();
const int dim = bottom[0]->count() / outer_num_;
const int num_labels = bottom[0]->shape(label_axis_);
vector<Dtype> maxval(top_k_+1);
vector<int> max_id(top_k_+1);
if (top.size() > 1) {
caffe_set(nums_buffer_.count(), Dtype(0), nums_buffer_.mutable_cpu_data());
caffe_set(top[1]->count(), Dtype(0), top[1]->mutable_cpu_data());
Expand All @@ -66,25 +64,22 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
if (has_ignore_label_ && label_value == ignore_label_) {
continue;
}
if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value];
DCHECK_GE(label_value, 0);
DCHECK_LT(label_value, num_labels);
if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value];
const Dtype prob_of_true_class = bottom_data[i * dim
+ label_value * inner_num_
+ j];
int num_better_predictions = -1; // true_class also counts as "better"
// Top-k accuracy
std::vector<std::pair<Dtype, int> > bottom_data_vector;
for (int k = 0; k < num_labels; ++k) {
bottom_data_vector.push_back(std::make_pair(
bottom_data[i * dim + k * inner_num_ + j], k));
for (int k = 0; k < num_labels && num_better_predictions < top_k_; ++k) {
num_better_predictions +=
(bottom_data[i * dim + k * inner_num_ + j] >= prob_of_true_class);
}
std::partial_sort(
bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
bottom_data_vector.end(), std::greater<std::pair<Dtype, int> >());
// check if true label is in top k predictions
for (int k = 0; k < top_k_; k++) {
if (bottom_data_vector[k].second == label_value) {
++accuracy;
if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value];
break;
}
// check if there are less than top_k_ predictions
if (num_better_predictions < top_k_) {
++accuracy;
if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value];
}
++count;
}
Expand All @@ -102,6 +97,10 @@ void AccuracyLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
// Accuracy layer should not be used as a loss function.
}

#ifdef CPU_ONLY
STUB_GPU(AccuracyLayer);
#endif

INSTANTIATE_CLASS(AccuracyLayer);
REGISTER_LAYER_CLASS(Accuracy);

Expand Down
147 changes: 147 additions & 0 deletions src/caffe/layers/accuracy_layer.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#include <vector>

#include "caffe/layers/accuracy_layer.hpp"
#include "caffe/util/math_functions.hpp"


namespace caffe {

// Kernel used when only the overall accuracy is requested.
//
// For every index supplied by CUDA_KERNEL_LOOP (bounded by nthreads) the index
// is decomposed as n = index / spatial_dim and s = index % spatial_dim, the
// label at (n, s) is read, and the kernel counts how many of the num_labels
// scores at that position are >= the score of the labelled class.
//
// Outputs (one element per index):
//   acc[index]    - 1 if fewer than top_k other scores tie or beat the
//                   labelled class score, otherwise 0; 0 for ignored labels.
//   counts[index] - 1 for a counted position, 0 for an ignored one.
//
// Parameters:
//   bottom_data - scores, addressed as n * dim + k * spatial_dim + s.
//   label       - labels, addressed as n * spatial_dim + s (cast to int).
//   num         - not read by this kernel; kept for the existing signature.
//   has_ignore_label_ / ignore_label_ - when set, positions whose label equals
//                   ignore_label_ are skipped.
//
// NOTE(review): labels are not range-checked here; a non-ignored label outside
// [0, num_labels) would still index bottom_data out of range.
template <typename Dtype>
__global__ void AccuracyForwardGPU(const int nthreads,
          const Dtype* bottom_data, const Dtype* label, Dtype* acc,
          const int num, const int dim, const int spatial_dim,
          const int num_labels, const int top_k,
          const bool has_ignore_label_, const int ignore_label_,
          Dtype* counts) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int n = index / spatial_dim;
    const int s = index % spatial_dim;
    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
    if (has_ignore_label_ && label_value == ignore_label_) {
      // Ignored position: contributes neither a hit nor a valid sample.
      acc[index] = 0;
      counts[index] = 0;
    } else {
      // The true-class score is fetched only for non-ignored labels. An
      // ignore label is typically outside [0, num_labels), so using it as an
      // index before this check would read past the end of bottom_data.
      const Dtype prob_of_true_class = bottom_data[n * dim
                                                   + label_value * spatial_dim
                                                   + s];
      // Starts at -1 because the true class compares >= to itself and would
      // otherwise be counted as a "better" prediction.
      int num_better_predictions = -1;
      // Stop early once top_k competitors have been found.
      for (int k = 0; k < num_labels && num_better_predictions < top_k; k++) {
        num_better_predictions +=
          (bottom_data[n * dim + k * spatial_dim + s] >= prob_of_true_class);
      }
      acc[index] = (num_better_predictions < top_k);
      counts[index] = 1;
    }
  }
}

// Kernel used when per-class accuracy is requested in addition to the
// overall accuracy.
//
// For every index supplied by CUDA_KERNEL_LOOP (bounded by nthreads) the index
// is decomposed as n = index / spatial_dim and s = index % spatial_dim, the
// label at (n, s) is read, and the kernel counts how many of the num_labels
// scores at that position are >= the score of the labelled class.
//
// Results are stored per label in slot label_value * nthreads + index:
//   acc[slot]    += 1 if fewer than top_k other scores tie or beat the
//                   labelled class score, otherwise += 0.
//   counts[slot]  = 1 to mark the position as a valid sample of that label.
// Because acc is accumulated with += and untouched slots are never written,
// both buffers are expected to be zero-filled by the caller before launch.
//
// Parameters:
//   bottom_data - scores, addressed as n * dim + k * spatial_dim + s.
//   label       - labels, addressed as n * spatial_dim + s (cast to int).
//   num         - not read by this kernel; kept for the existing signature.
//   has_ignore_label_ / ignore_label_ - when set, positions whose label equals
//                   ignore_label_ leave acc and counts untouched.
//
// NOTE(review): labels are not range-checked here; a non-ignored label outside
// [0, num_labels) would index bottom_data, acc and counts out of range.
template <typename Dtype>
__global__ void AccuracyForwardWithPerClassGPU(const int nthreads,
          const Dtype* bottom_data, const Dtype* label,
          Dtype* acc, Dtype* counts,
          const int num, const int dim, const int spatial_dim,
          const int num_labels, const int top_k,
          const bool has_ignore_label_, const int ignore_label_) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    const int n = index / spatial_dim;
    const int s = index % spatial_dim;
    const int label_value = static_cast<int>(label[n * spatial_dim + s]);
    if (has_ignore_label_ && label_value == ignore_label_) {
      // nothing to be done.
    } else {
      // The true-class score is fetched only for non-ignored labels. An
      // ignore label is typically outside [0, num_labels), so using it as an
      // index before this check would read past the end of bottom_data.
      const Dtype prob_of_true_class = bottom_data[n * dim
                                                   + label_value * spatial_dim
                                                   + s];
      // Starts at -1 because the true class compares >= to itself and would
      // otherwise be counted as a "better" prediction.
      int num_better_predictions = -1;
      // Stop early once top_k competitors have been found.
      for (int k = 0; k < num_labels && num_better_predictions < top_k; k++) {
        num_better_predictions +=
          (bottom_data[n * dim + k * spatial_dim + s] >= prob_of_true_class);
      }
      acc[label_value*nthreads + index] += (num_better_predictions < top_k);
      counts[label_value*nthreads + index] = 1;
    }
  }
}

// GPU forward pass of the accuracy layer.
//
// bottom[0] supplies the scores and bottom[1] the labels. top[0] receives the
// overall accuracy (hits / valid samples, or 0 when there are no valid
// samples). When a second top blob is present, top[1] receives one accuracy
// value per label, computed the same way from that label's samples.
//
// Scratch memory: the diff of bottom[0] (and, in the single-top case, the
// diff of bottom[1]) is overwritten with intermediate results; in the
// per-class case nums_buffer_ is reshaped like bottom[0] and used for counts.
// Any values previously held in those buffers are lost.
template <typename Dtype>
void AccuracyLayer<Dtype>::Forward_gpu(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->gpu_data();
  const Dtype* bottom_label = bottom[1]->gpu_data();
  const int dim = bottom[0]->count() / outer_num_;
  const int num_labels = bottom[0]->shape(label_axis_);
  // One kernel work item per (outer, inner) position.
  const int nthreads = outer_num_ * inner_num_;
  // Since this memory is not used for anything,
  // we use it here to avoid having to allocate new GPU
  // memory to accumulate intermediate results in the kernel.
  Dtype* acc_data = bottom[0]->mutable_gpu_diff();
  if (top.size() == 1) {
    // simple case - report only global accuracy.

    // Similarly, this memory is never used elsewhere, and thus we can use it
    // to avoid having to allocate additional GPU memory.
    Dtype* counts = bottom[1]->mutable_gpu_diff();
    // NOLINT_NEXT_LINE(whitespace/operators)
    AccuracyForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
        CAFFE_CUDA_NUM_THREADS>>>(nthreads, bottom_data, bottom_label,
        acc_data, outer_num_, dim, inner_num_, num_labels, top_k_,
        has_ignore_label_, ignore_label_, counts);
    // Kernel launches do not report errors themselves; check explicitly so a
    // bad launch is caught here instead of at some later, unrelated call.
    CUDA_POST_KERNEL_CHECK;
    // The kernel writes only 0/1 values, so the absolute sum is a plain count.
    Dtype acc;
    caffe_gpu_asum(nthreads, acc_data, &acc);
    Dtype valid_count;
    caffe_gpu_asum(nthreads, counts, &valid_count);
    if (valid_count > 0) {
      top[0]->mutable_cpu_data()[0] = acc / valid_count;
    } else {
      // No valid (non-ignored) samples: avoid dividing by zero.
      top[0]->mutable_cpu_data()[0] = 0;
    }
  } else {
    // need to report per-class accuracy as well

    // allocate space for more detailed "counts"
    nums_buffer_.ReshapeLike(*bottom[0]);
    Dtype* counts = nums_buffer_.mutable_gpu_data();

    // The per-class kernel accumulates into acc_data and only touches the
    // slots of labels it actually sees, so both buffers must start at zero.
    caffe_gpu_set(bottom[0]->count(), Dtype(0), acc_data);
    caffe_gpu_set(nums_buffer_.count(), Dtype(0), counts);

    // NOLINT_NEXT_LINE(whitespace/operators)
    AccuracyForwardWithPerClassGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
        CAFFE_CUDA_NUM_THREADS>>>(nthreads, bottom_data, bottom_label,
        acc_data, counts, outer_num_, dim, inner_num_, num_labels, top_k_,
        has_ignore_label_, ignore_label_);
    // Kernel launches do not report errors themselves; check explicitly so a
    // bad launch is caught here instead of at some later, unrelated call.
    CUDA_POST_KERNEL_CHECK;

    // get the overall accuracy
    Dtype acc;
    caffe_gpu_asum(bottom[0]->count(), acc_data, &acc);
    Dtype valid_count;
    caffe_gpu_asum(nums_buffer_.count(), counts, &valid_count);
    if (valid_count > 0) {
      top[0]->mutable_cpu_data()[0] = acc / valid_count;
    } else {
      // No valid (non-ignored) samples: avoid dividing by zero.
      top[0]->mutable_cpu_data()[0] = 0;
    }

    // get per-class accuracy
    // Label l owns the contiguous slice [l * nthreads, (l + 1) * nthreads) of
    // both scratch buffers, matching the kernel's indexing.
    Dtype* per_class_acc = top[1]->mutable_cpu_data();
    for (int l = 0; l < num_labels; l++) {
      caffe_gpu_asum(nthreads, acc_data + l*nthreads, per_class_acc+l);
      caffe_gpu_asum(nthreads, counts + l*nthreads, &valid_count);
      if (valid_count > 0) {
        per_class_acc[l] /= valid_count;
      } else {
        // Label l never occurred among the valid samples.
        per_class_acc[l] = 0;
      }
    }
  }
}


// GPU backward pass. No gradient is computed by this layer; the only action
// taken is to raise NOT_IMPLEMENTED when a gradient is requested for the
// second bottom blob (index 1, which Forward_gpu passes to the kernels as the
// labels). The flag for the first bottom blob is not inspected here.
template <typename Dtype>
void AccuracyLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  const bool label_gradient_requested = propagate_down[1];
  if (label_gradient_requested) {
    NOT_IMPLEMENTED;
  }
}

INSTANTIATE_LAYER_GPU_FUNCS(AccuracyLayer);
} // namespace caffe
Loading

0 comments on commit 62e0c85

Please sign in to comment.