BVLC · Noiredd · Oct 16, 2017
diff --git a/include/caffe/layers/accuracy_layer.hpp b/include/caffe/layers/accuracy_layer.hpp
@@ -92,6 +92,8 @@ class AccuracyLayer : public Layer<Dtype> {
   int ignore_label_;
   /// Keeps counts of the number of samples per class.
   Blob<Dtype> nums_buffer_;
+  /// Scratch blob for Forward_gpu (data: acc_data; diff: global-case counts)
+  Blob<Dtype> gpu_buffer_;
 };
 
 }  // namespace caffe

diff --git a/src/caffe/layers/accuracy_layer.cpp b/src/caffe/layers/accuracy_layer.cpp
@@ -34,6 +34,7 @@ void AccuracyLayer<Dtype>::Reshape(
       << "label count (number of labels) must be N*H*W, "
       << "with integer values in {0, 1, ..., C-1}.";
   vector<int> top_shape(0);  // Accuracy is a scalar; 0 axes.
+  gpu_buffer_.ReshapeLike(*bottom[0]);
   top[0]->Reshape(top_shape);
   if (top.size() > 1) {
     // Per-class accuracy is a vector; 1 axes.

diff --git a/src/caffe/layers/accuracy_layer.cu b/src/caffe/layers/accuracy_layer.cu
@@ -71,16 +71,11 @@ void AccuracyLayer<Dtype>::Forward_gpu(
   const int dim = bottom[0]->count() / outer_num_;
   const int num_labels = bottom[0]->shape(label_axis_);
   const int nthreads = outer_num_ * inner_num_;
-  // Since this memory is not used for anything,
-  // we use it here to avoid having to allocate new GPU
-  // memory to accumulate intermediate results in the kernel.
-  Dtype* acc_data = bottom[0]->mutable_gpu_diff();
+  Dtype* acc_data = gpu_buffer_.mutable_gpu_data();
   if (top.size() == 1) {
     // simple case - report only global accuracy.
 
-    // Similarly, this memory is never used elsewhere, and thus we can use it
-    // to avoid having to allocate additional GPU memory.
-    Dtype* counts = bottom[1]->mutable_gpu_diff();
+    Dtype* counts = gpu_buffer_.mutable_gpu_diff();
     // NOLINT_NEXT_LINE(whitespace/operators)
     AccuracyForwardGPU<Dtype><<<CAFFE_GET_BLOCKS(nthreads),
         CAFFE_CUDA_NUM_THREADS>>>(nthreads, bottom_data, bottom_label,
@@ -113,7 +108,7 @@ void AccuracyLayer<Dtype>::Forward_gpu(
 
     // get the overall accuracy
     Dtype acc;
-    caffe_gpu_asum(bottom[0]->count(), acc_data, &acc);
+    caffe_gpu_asum(gpu_buffer_.count(), acc_data, &acc);
     Dtype valid_count;
     caffe_gpu_asum(nums_buffer_.count(), counts, &valid_count);
     if (valid_count > 0) {