From d1d632b78ace986a50bb69f230d83df204d0bdf0 Mon Sep 17 00:00:00 2001
From: Da Zheng
Date: Mon, 27 Nov 2017 23:54:18 +0000
Subject: [PATCH] Make SoftmaxActivation stateless.

---
 src/operator/nn/softmax_activation-inl.h | 124 +++++++++--------------
 src/operator/nn/softmax_activation.cu    |  14 +--
 2 files changed, 51 insertions(+), 87 deletions(-)

diff --git a/src/operator/nn/softmax_activation-inl.h b/src/operator/nn/softmax_activation-inl.h
index 5b91b6f79e98..b1d542e4068c 100644
--- a/src/operator/nn/softmax_activation-inl.h
+++ b/src/operator/nn/softmax_activation-inl.h
@@ -61,98 +61,72 @@ struct SoftmaxActivationParam : public dmlc::Parameter<SoftmaxActivationParam> {
   }
 };
 
-/**
- * \brief This is the implementation of softmax_activation operator.
- * \tparam xpu The device that the op will be executed on.
- */
-template<typename xpu>
-class SoftmaxActivationOp {
- public:
-  void Init(SoftmaxActivationParam p) {
-    this->param_ = p;
-  }
-
-  void Forward(const OpContext &ctx, const TBlob &in_data,
-               const OpReqType &req, const TBlob &out_data) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.mode == softmax_activation::kInstance) {
-      Tensor<xpu, 2> data = in_data.FlatTo2D<xpu, real_t>(s);
-      Tensor<xpu, 2> out = out_data.FlatTo2D<xpu, real_t>(s);
-      Softmax(out, data);
-    } else {
-      CHECK_GE(in_data.ndim(), 3)
-          << "Input need to have a least 3 dimensions when mode=channel";
-      int n = in_data.size(0);
-      int k = in_data.size(1);
-      Shape<3> s3 = Shape3(n, k, static_cast<int>(in_data.Size()/n/k));
-      Tensor<xpu, 3, real_t> data = in_data.get_with_shape<xpu, 3, real_t>(s3, s);
-      Tensor<xpu, 3, real_t> out = out_data.get_with_shape<xpu, 3, real_t>(s3, s);
-      Softmax(out, data);
-    }
-  }
-
-  void Backward(const OpContext &ctx, const TBlob &out_grad,
-                const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    // Use 3d tensor for both mode -> {instance, channel}. Get shapes
-    int total_size = in_grad.Size();
-    int batch_size = in_grad.shape_[0];
-    int channel_num = in_grad.shape_[1];
-    int rest_size = total_size / (batch_size * channel_num);
-    const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size);
-    // Get tensors
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 3> m_out_grad =
-        out_grad.get_with_shape<xpu, 3, real_t>(data_shape, s);
-    Tensor<xpu, 3> m_out_data =
-        out_data.get_with_shape<xpu, 3, real_t>(data_shape, s);
-    Tensor<xpu, 3> m_in_grad =
-        in_grad.get_with_shape<xpu, 3, real_t>(data_shape, s);
-    // get requested temp space
-    Tensor<xpu, 2> workspace = ctx.requested[softmax_activation::kTempSpace].get_space<xpu>(
-        Shape2(batch_size, rest_size), s);
-    workspace = reduce_with_axis<red::sum, false>(m_out_grad * m_out_data, 1);
-    Assign(m_in_grad, req,
-           m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num)));
-  }
-
- private:
-  SoftmaxActivationParam param_;
-};  // class SoftmaxActivationOp
-
-
 template<typename xpu>
 void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs,
                               const OpContext& ctx,
                               const std::vector<TBlob>& inputs,
-                              const std::vector<OpReqType>& req,
+                              const std::vector<OpReqType>& reqs,
                               const std::vector<TBlob>& outputs) {
+  using namespace mshadow;
+  using namespace mshadow::expr;
   const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
-
-  static thread_local SoftmaxActivationOp<xpu> op;
-  op.Init(param);
-  op.Forward(ctx, inputs[0], req[0], outputs[0]);
+  const TBlob &in_data = inputs[softmax_activation::kData];
+  const OpReqType &req = reqs[softmax_activation::kOut];
+  const TBlob &out_data = outputs[softmax_activation::kOut];
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  if (param.mode == softmax_activation::kInstance) {
+    Tensor<xpu, 2> data = in_data.FlatTo2D<xpu, real_t>(s);
+    Tensor<xpu, 2> out = out_data.FlatTo2D<xpu, real_t>(s);
+    Softmax(out, data);
+  } else {
+    CHECK_GE(in_data.ndim(), 3)
+        << "Input need to have a least 3 dimensions when mode=channel";
+    int n = in_data.size(0);
+    int k = in_data.size(1);
+    Shape<3> s3 = Shape3(n, k, static_cast<int>(in_data.Size()/n/k));
+    Tensor<xpu, 3, real_t> data = in_data.get_with_shape<xpu, 3, real_t>(s3, s);
+    Tensor<xpu, 3, real_t> out = out_data.get_with_shape<xpu, 3, real_t>(s3, s);
+    Softmax(out, data);
+  }
 }
 
 template<typename xpu>
 void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs,
                                   const OpContext& ctx,
                                   const std::vector<TBlob>& inputs,
-                                  const std::vector<OpReqType>& req,
+                                  const std::vector<OpReqType>& reqs,
                                   const std::vector<TBlob>& outputs) {
-  const SoftmaxActivationParam& param = nnvm::get<SoftmaxActivationParam>(attrs.parsed);
+  using namespace mshadow;
+  using namespace mshadow::expr;
   CHECK_EQ(inputs.size(), 2U);
   CHECK_EQ(outputs.size(), 1);
-  CHECK_EQ(req.size(), 1);
-
-  static thread_local SoftmaxActivationOp<xpu> op;
-  op.Init(param);
-  op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]);
+  CHECK_EQ(reqs.size(), 1);
+  const TBlob &out_grad = inputs[0];
+  const TBlob &out_data = inputs[1];
+  const OpReqType &req = reqs[0];
+  const TBlob &in_grad = outputs[0];
+  // Use 3d tensor for both mode -> {instance, channel}. Get shapes
+  int total_size = in_grad.Size();
+  int batch_size = in_grad.shape_[0];
+  int channel_num = in_grad.shape_[1];
+  int rest_size = total_size / (batch_size * channel_num);
+  const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size);
+  // Get tensors
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  Tensor<xpu, 3> m_out_grad =
+      out_grad.get_with_shape<xpu, 3, real_t>(data_shape, s);
+  Tensor<xpu, 3> m_out_data =
+      out_data.get_with_shape<xpu, 3, real_t>(data_shape, s);
+  Tensor<xpu, 3> m_in_grad =
+      in_grad.get_with_shape<xpu, 3, real_t>(data_shape, s);
+  // get requested temp space
+  Tensor<xpu, 2> workspace = ctx.requested[softmax_activation::kTempSpace].get_space<xpu>(
+      Shape2(batch_size, rest_size), s);
+  workspace = reduce_with_axis<red::sum, false>(m_out_grad * m_out_data, 1);
+  Assign(m_in_grad, req,
+         m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num)));
 }
 
 }  // namespace op
diff --git a/src/operator/nn/softmax_activation.cu b/src/operator/nn/softmax_activation.cu
index 8e6e787f8072..1cfe64f7e916 100644
--- a/src/operator/nn/softmax_activation.cu
+++ b/src/operator/nn/softmax_activation.cu
@@ -32,6 +32,7 @@ namespace mxnet {
 namespace op {
 
+#if MXNET_USE_CUDNN == 1
 template<>
 void SoftmaxActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
                                    const OpContext& ctx,
@@ -42,15 +43,9 @@ void SoftmaxActivationCompute<gpu>(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(inputs.size(), 1U);
   CHECK_EQ(outputs.size(), 1U);
 
-#if MXNET_USE_CUDNN == 1
   static thread_local CuDNNSoftmaxActivationOp op;
   op.Init(param);
   op.Forward(ctx, inputs[0], req[0], outputs[0]);
-#else
-  static thread_local SoftmaxActivationOp<gpu> op;
-  op.Init(param);
-  op.Forward(ctx, inputs[0], req[0], outputs[0]);
-#endif
 }
 
 template<>
@@ -64,16 +59,11 @@ void SoftmaxActivationGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(outputs.size(), 1);
   CHECK_EQ(req.size(), 1);
 
-#if MXNET_USE_CUDNN == 1
   static thread_local CuDNNSoftmaxActivationOp op;
   op.Init(param);
   op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]);
-#else
-  static thread_local SoftmaxActivationOp<gpu> op;
-  op.Init(param);
-  op.Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]);
-#endif
 }
+#endif
 
 NNVM_REGISTER_OP(SoftmaxActivation)
 .set_attr<FCompute>("FCompute<gpu>", SoftmaxActivationCompute<gpu>);
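--
Note (not part of the diff above): with the generic SoftmaxActivationOp<xpu> class removed, the CPU build relies on the stateless SoftmaxActivationCompute<cpu> / SoftmaxActivationGradCompute<cpu> templates being registered as FCompute functions in src/operator/nn/softmax_activation.cc, which this patch does not show. The following is only a minimal sketch of what that registration could look like, assuming the usual NNVM attribute keys and assuming the backward operator is named _backward_SoftmaxActivation; the real file also registers shape/type inference and other attributes.

// Hypothetical CPU-side registration sketch (softmax_activation.cc).
NNVM_REGISTER_OP(SoftmaxActivation)
.set_attr<FCompute>("FCompute<cpu>", SoftmaxActivationCompute<cpu>);

// The backward pass needs temp space for the reduce_with_axis workspace,
// so a kTempSpace resource request is attached alongside the compute function.
NNVM_REGISTER_OP(_backward_SoftmaxActivation)
.set_attr<FResourceRequest>("FResourceRequest", [](const nnvm::NodeAttrs& attrs) {
  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.set_attr<FCompute>("FCompute<cpu>", SoftmaxActivationGradCompute<cpu>);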