Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NPU] add npu kernel for elementwise_add_grad #31347

Merged
merged 6 commits into from
Mar 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 122 additions & 6 deletions paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>

#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {
using Tensor = framework::Tensor;

template <typename DeviceContext, typename T>
template <typename T>
class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
Expand All @@ -39,12 +40,127 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
}
};

// NPU kernel for the gradient of elementwise_add.
//
// For out = x + y the upstream gradient passes through unchanged to both
// inputs (dx = dout, dy = dout). When an input has fewer dimensions than
// dout, or has dimensions of size 1, dout is summed over those axes so that
// the produced gradient has the same shape as the input.
template <typename T>
class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
 public:
  // Reads Out@GRAD and writes X@GRAD and/or Y@GRAD. An output that is null
  // is skipped.
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    // NOTE(zhiqiu): It seems Ascend Add follows the broadcast semantics with
    // default axis=-1?
    // So, the add_grad should do reduce if needed.
    // For example, the shape of each variable in elementwise_add:
    // x, dx: [2, 3, 5]
    // y, dy: [1, 5]
    // out, dout: [2, 3, 5]
    // Then, out = x + y  =>  dx = dout, dy = dout
    // And, the shape of dy can be computed by two stages reduce,
    // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
    // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.
    //
    // NOTE(review): the op's `axis` attribute is never read in this kernel,
    // so only trailing-aligned broadcasting appears to be handled - confirm.
    // NOTE(review): the dx and dy branches below are near-duplicates; they
    // differ only in when mutable_data is called on the output and in the
    // extra Wait() after stage 1 of the dy branch.

    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
      // For dx
      // stage 1: sum away the leading axes that dout has and dx lacks.
      const int reduce_ndim = dout->dims().size() - dx->dims().size();
      std::vector<int> axes;
      for (int i = 0; i < reduce_ndim; ++i) {
        axes.push_back(i);
      }
      // Source for stage 2: dout itself, or the stage-1 result if one was
      // produced. Only ever read, so a pointer-to-const is sufficient.
      const Tensor* tmp_dout = dout;
      Tensor reduced_dout(dout->type());
      if (!axes.empty()) {
        // Stage-1 output keeps the trailing dims of dout.
        std::vector<int64_t> reduced_dout_dims;
        for (int i = reduce_ndim; i < dout->dims().size(); ++i) {
          reduced_dout_dims.push_back(dout->dims()[i]);
        }
        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
        reduced_dout.mutable_data<T>(ctx.GetPlace());
        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                  {{"axes", axes}, {"keep_dims", false}});
        runner.Run(stream);
        tmp_dout = &reduced_dout;
      }

      // stage 2: sum over every axis where dx has extent 1, keeping the rank.
      axes.clear();
      for (int i = 0; i < dx->dims().size(); ++i) {
        if (dx->dims()[i] == 1) {
          axes.push_back(i);
        }
      }
      if (!axes.empty()) {
        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
                                  {{"axes", axes}, {"keep_dims", true}});
        runner.Run(stream);
      } else {
        // Nothing left to reduce: wait on the device context, then copy
        // tmp_dout into dx synchronously.
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .Wait();
        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
      }
    }

    if (dy) {
      // For dy
      // stage 1: sum away the leading axes that dout has and dy lacks.
      const int reduce_ndim = dout->dims().size() - dy->dims().size();
      std::vector<int> axes;
      for (int i = 0; i < reduce_ndim; ++i) {
        axes.push_back(i);
      }
      // Source for stage 2: dout itself, or the stage-1 result if one was
      // produced. Only ever read, so a pointer-to-const is sufficient.
      const Tensor* tmp_dout = dout;
      Tensor reduced_dout(dout->type());
      if (!axes.empty()) {
        // Stage-1 output keeps the trailing dims of dout.
        std::vector<int64_t> reduced_dout_dims;
        for (int i = reduce_ndim; i < dout->dims().size(); ++i) {
          reduced_dout_dims.push_back(dout->dims()[i]);
        }
        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
        reduced_dout.mutable_data<T>(ctx.GetPlace());
        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                  {{"axes", axes}, {"keep_dims", false}});
        runner.Run(stream);
        tmp_dout = &reduced_dout;
        // NOTE(review): the dx branch does not wait at this point; confirm
        // whether this synchronization is actually required.
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .Wait();
      }

      // stage 2: sum over every axis where dy has extent 1, keeping the rank.
      axes.clear();
      for (int i = 0; i < dy->dims().size(); ++i) {
        if (dy->dims()[i] == 1) {
          axes.push_back(i);
        }
      }
      if (!axes.empty()) {
        dy->mutable_data<T>(ctx.GetPlace());
        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy},
                                  {{"axes", axes}, {"keep_dims", true}});
        runner.Run(stream);
      } else {
        // Nothing left to reduce: wait on the device context, then copy
        // tmp_dout into dy synchronously.
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .Wait();
        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy);
      }
    }
  }
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

// Register the NPU kernels for elementwise_add and its gradient, each for
// float and float16. Both registrations stay inside the PADDLE_WITH_ASCEND_CL
// guard because the kernel classes they name are defined inside it.
REGISTER_OP_NPU_KERNEL(elementwise_add, ops::ElementwiseAddNPUKernel<float>,
                       ops::ElementwiseAddNPUKernel<plat::float16>);

REGISTER_OP_NPU_KERNEL(elementwise_add_grad,
                       ops::ElementwiseAddGradNPUKernel<float>,
                       ops::ElementwiseAddGradNPUKernel<plat::float16>);
#endif  // PADDLE_WITH_ASCEND_CL
86 changes: 47 additions & 39 deletions paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
{{"Out", {"Out"}}}, attrs);

op->Run(*scope, place);
ctx.Wait();

std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
Expand Down Expand Up @@ -125,57 +126,64 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,

// run
f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp(op_type,
{{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
{{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);
auto op = f::OpRegistry::CreateOp(
op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
{{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);

auto place = ctx.GetPlace();
op->Run(*scope, place);

std::vector<T> dx_vec;
TensorToVector(*tensor_dx, ctx, &dx_vec);

std::vector<T> dy_vec;
TensorToVector(*tensor_dy, ctx, &dy_vec);

ctx.Wait();
float expected_x, expected_y;
if (op_type == "elementwise_add_grad") {
expected_x = 1.0;
expected_y = 6.0;
} else if (op_type == "elementwise_sub_grad") {
expected_x = 1.0;
expected_y = -6.0;
}

for (uint32_t i = 0; i < dx_vec.size(); i++) {
EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
}
for (uint32_t i = 0; i < dy_vec.size(); i++) {
EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
}
op->Run(*scope, place);
ctx.Wait();

std::vector<T> dx_vec;
TensorToVector(*tensor_dx, ctx, &dx_vec);

std::vector<T> dy_vec;
TensorToVector(*tensor_dy, ctx, &dy_vec);

ctx.Wait();
float expected_x, expected_y;
if (op_type == "elementwise_add_grad") {
expected_x = 1.0;
expected_y = 6.0;
} else if (op_type == "elementwise_sub_grad") {
expected_x = 1.0;
expected_y = -6.0;
}

for (uint32_t i = 0; i < dx_vec.size(); i++) {
EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
}
for (uint32_t i = 0; i < dy_vec.size(); i++) {
EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
}
}

// Runs the elementwise_add forward op on NPU device 0 with float data and
// checks the result through Compare.
TEST(elementwise_add, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "elementwise_add");
}

// Runs the elementwise_sub forward op on NPU device 0 with float data and
// checks the result through Compare.
TEST(elementwise_sub, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "elementwise_sub");
}

// Runs the elementwise_sub forward op on NPU device 0 with float16 data and
// checks the result through Compare.
TEST(elementwise_sub, NPU_fp16) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<p::float16>(&scope, ctx, "elementwise_sub");
}

// Runs the elementwise_sub_grad op on NPU device 0 with float data and
// checks dx / dy through CompareGrad.
TEST(elementwise_sub_grad, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
}

// Runs the elementwise_add_grad op on NPU device 0 with float data and
// checks dx / dy through CompareGrad.
TEST(elementwise_add_grad, NPU) {
  f::Scope grad_scope;
  p::NPUDeviceContext npu_ctx(p::NPUPlace(0));
  CompareGrad<float>(&grad_scope, npu_ctx, "elementwise_add_grad");
}
Loading