PaddlePaddle · zhiqiu · Mar 2, 2021 · Mar 1, 2021 · Mar 1, 2021 · Mar 1, 2021
diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
@@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_ASCEND_CL
 #include <memory>
 #include <string>
 
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
+using Tensor = framework::Tensor;
 
-template <typename DeviceContext, typename T>
+template <typename T>
 class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -39,12 +40,127 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename T>
+class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    // NOTE(zhiqiu): It seems Ascend Sub follow the broadcast sematics with
+    // default axis=-1?
+    // So, the sub_grad should do reduce if needed.
+    // For example, the shape of each variable in elementwise_sub:
+    // x, dx: [2, 3, 5]
+    // y, dy: [1, 5]
+    // out, dout: [2, 3, 5]
+    // Then, out = x - y  =>  dx = dout, dy = -dout
+    // And, the shape of dy can be computed by two stages reduce,
+    // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
+    // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
+
+    if (dx) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      // For dx
+      // stage 1
+      auto reduce_ndim = dout->dims().size() - dx->dims().size();
+      std::vector<int> axes;
+      for (auto i = 0; i < reduce_ndim; ++i) {
+        axes.push_back(i);
+      }
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      Tensor reduced_dout(dx->type());
+      if (axes.size() != 0) {
+        std::vector<int64_t> reduced_dout_dims;
+        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+          reduced_dout_dims.push_back(dout->dims()[i]);
+        }
+        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+        reduced_dout.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                  {{"axes", axes}, {"keep_dims", false}});
+        runner.Run(stream);
+        tmp_dout = &reduced_dout;
+      }
+
+      // stage 2
+      axes.clear();
+      for (auto i = 0; i < dx->dims().size(); ++i) {
+        if (dx->dims()[i] == 1) {
+          axes.push_back(i);
+        }
+      }
+      if (axes.size() != 0) {
+        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
+                                  {{"axes", axes}, {"keep_dims", true}});
+        runner.Run(stream);
+      } else {
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .Wait();
+        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
+      }
+    }
+
+    if (dy) {
+      // For dy
+      // stage 1
+      auto reduce_ndim = dout->dims().size() - dy->dims().size();
+      std::vector<int> axes;
+      for (auto i = 0; i < reduce_ndim; ++i) {
+        axes.push_back(i);
+      }
+      Tensor* tmp_dout = const_cast<Tensor*>(dout);
+      Tensor reduced_dout(dout->type());
+      if (axes.size() != 0) {
+        std::vector<int64_t> reduced_dout_dims;
+        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+          reduced_dout_dims.push_back(dout->dims()[i]);
+        }
+        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+        reduced_dout.mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                  {{"axes", axes}, {"keep_dims", false}});
+        runner.Run(stream);
+        tmp_dout = &reduced_dout;
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .Wait();
+      }
+
+      // stage 2
+      axes.clear();
+      for (auto i = 0; i < dy->dims().size(); ++i) {
+        if (dy->dims()[i] == 1) {
+          axes.push_back(i);
+        }
+      }
+      if (axes.size() != 0) {
+        dy->mutable_data<T>(ctx.GetPlace());
+        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy},
+                                  {{"axes", axes}, {"keep_dims", true}});
+        runner.Run(stream);
+      } else {
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .Wait();
+        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy);
+      }
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(elementwise_add, ops::ElementwiseAddNPUKernel<float>,
+                       ops::ElementwiseAddNPUKernel<plat::float16>);
 
-REGISTER_OP_NPU_KERNEL(
-    elementwise_add,
-    ops::ElementwiseAddNPUKernel<paddle::platform::NPUDeviceContext, float>);
-#endif
+REGISTER_OP_NPU_KERNEL(elementwise_add_grad,
+                       ops::ElementwiseAddGradNPUKernel<float>,
+                       ops::ElementwiseAddGradNPUKernel<plat::float16>);
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
@@ -74,6 +74,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
                                     {{"Out", {"Out"}}}, attrs);
 
   op->Run(*scope, place);
+  ctx.Wait();
 
   std::vector<T> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
@@ -125,57 +126,64 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
 
   // run
   f::AttributeMap attrs;
-  auto op = f::OpRegistry::CreateOp(op_type,
-    {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
-    {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);
+  auto op = f::OpRegistry::CreateOp(
+      op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
+      {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);
 
   auto place = ctx.GetPlace();
-    op->Run(*scope, place);
-
-    std::vector<T> dx_vec;
-    TensorToVector(*tensor_dx, ctx, &dx_vec);
-
-    std::vector<T> dy_vec;
-    TensorToVector(*tensor_dy, ctx, &dy_vec);
-
-    ctx.Wait();
-    float expected_x, expected_y;
-    if (op_type == "elementwise_add_grad") {
-      expected_x = 1.0;
-      expected_y = 6.0;
-    } else if (op_type == "elementwise_sub_grad") {
-      expected_x = 1.0;
-      expected_y = -6.0;
-    }
-
-    for (uint32_t i = 0; i < dx_vec.size(); i++) {
-      EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
-    }
-    for (uint32_t i = 0; i < dy_vec.size(); i++) {
-      EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
-    }
+  op->Run(*scope, place);
+  ctx.Wait();
+
+  std::vector<T> dx_vec;
+  TensorToVector(*tensor_dx, ctx, &dx_vec);
+
+  std::vector<T> dy_vec;
+  TensorToVector(*tensor_dy, ctx, &dy_vec);
+
+  ctx.Wait();
+  float expected_x, expected_y;
+  if (op_type == "elementwise_add_grad") {
+    expected_x = 1.0;
+    expected_y = 6.0;
+  } else if (op_type == "elementwise_sub_grad") {
+    expected_x = 1.0;
+    expected_y = -6.0;
+  }
+
+  for (uint32_t i = 0; i < dx_vec.size(); i++) {
+    EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
+  }
+  for (uint32_t i = 0; i < dy_vec.size(); i++) {
+    EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
+  }
 }
 
 TEST(elementwise_add, NPU_fp32) {
-    f::Scope scope;
-    p::NPUDeviceContext ctx(p::NPUPlace(0));
-    Compare<float>(&scope, ctx, "elementwise_add");
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  Compare<float>(&scope, ctx, "elementwise_add");
 }
 
 TEST(elementwise_sub, NPU_fp32) {
-    f::Scope scope;
-    p::NPUDeviceContext ctx(p::NPUPlace(0));
-    Compare<float>(&scope, ctx, "elementwise_sub");
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  Compare<float>(&scope, ctx, "elementwise_sub");
 }
 
 TEST(elementwise_sub, NPU_fp16) {
-    f::Scope scope;
-    p::NPUDeviceContext ctx(p::NPUPlace(0));
-    Compare<p::float16>(&scope, ctx, "elementwise_sub");
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  Compare<p::float16>(&scope, ctx, "elementwise_sub");
 }
 
 TEST(elementwise_sub_grad, NPU) {
-    f::Scope scope;
-    p::NPUDeviceContext ctx(p::NPUPlace(0));
-    CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
+}
+
+TEST(elementwise_add_grad, NPU) {
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  CompareGrad<float>(&scope, ctx, "elementwise_add_grad");
 }