cherry-pick kunlun PR: 29458, 29539 (#29583)

* support mobilenet for kunlun (#29458) * add xpu ops for training transformer in kunlun (#29539) * 1.fix matmul bug 2. add one hot * add xpu error msg Co-authored-by: procr <[email protected]> Co-authored-by: taixiurong <[email protected]>
PaddlePaddle · Dec 15, 2020 · 03ddf69 · 03ddf69
1 parent d82d59e
commit 03ddf69
Show file tree

Hide file tree

Showing 13 changed files with 908 additions and 73 deletions.
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
@@ -4,7 +4,7 @@ endif()
 
 INCLUDE(ExternalProject)
 SET(XPU_PROJECT                 "extern_xpu")
-SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_12_04.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_URL    "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_12_07_cdfbf0c.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_SOURCE_DIR              "${THIRD_PARTY_PATH}/xpu")
 SET(XPU_DOWNLOAD_DIR            "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
 SET(XPU_INSTALL_DIR             "${THIRD_PARTY_PATH}/install/xpu")

diff --git a/paddle/fluid/operators/activation_op_xpu.cc b/paddle/fluid/operators/activation_op_xpu.cc
@@ -61,13 +61,38 @@ void xpu_activation_forward(const framework::ExecutionContext &ctx,
   const T *x_data = x->data<T>();
   T *y_data = y->mutable_data<T>(ctx.GetPlace());
   int r = 0;
-  if (xpu::Activation_t::ACT_POW == type.type) {
-    type.pow_factor = ctx.Attr<float>("factor");
-  }
   auto xpu_context = ctx.device_context<DeviceContext>().x_context();
-  r = xpu::activation_forward(xpu_context, type, x->numel(),
-                              reinterpret_cast<const float *>(x_data),
-                              reinterpret_cast<float *>(y_data));
+
+  switch (type.type) {
+    case xpu::Activation_t::HARD_SWISH: {
+      float threshold = ctx.Attr<float>("threshold");
+      float scale = ctx.Attr<float>("scale");
+      float offset = ctx.Attr<float>("offset");
+      PADDLE_ENFORCE_EQ(threshold, 6.0f,
+                        platform::errors::External(
+                            "Not support threshold [%f] in XPU", threshold));
+      PADDLE_ENFORCE_EQ(
+          scale, 6.0f,
+          platform::errors::External("Not support scale [%f] in XPU", scale));
+      PADDLE_ENFORCE_EQ(
+          offset, 3.0f,
+          platform::errors::External("Not support offset [%f] in XPU", offset));
+
+      r = xpu::hard_swish(xpu_context, reinterpret_cast<const float *>(x_data),
+                          reinterpret_cast<float *>(y_data), x->numel());
+      break;
+    }
+    case xpu::Activation_t::ACT_POW: {
+      type.pow_factor = ctx.Attr<float>("factor");
+    }
+    default: {
+      r = xpu::activation_forward(xpu_context, type, x->numel(),
+                                  reinterpret_cast<const float *>(x_data),
+                                  reinterpret_cast<float *>(y_data));
+      break;
+    }
+  }
+
   PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                     platform::errors::External(
                         "XPU API return wrong value[%d], please check whether "
@@ -90,12 +115,40 @@ void xpu_activation_backward(const framework::ExecutionContext &ctx,
   if (y != nullptr) y_data = y->data<T>();
   if (dOut != nullptr) y_grad = dOut->data<T>();
   T *x_grad = dX->mutable_data<T>(ctx.GetPlace());
+  int r = 0;
   auto xpu_context = ctx.device_context<DeviceContext>().x_context();
-  int r = xpu::activation_backward(xpu_context, type, dX->numel(),
+
+  switch (type.type) {
+    case xpu::Activation_t::HARD_SWISH: {
+      float threshold = ctx.Attr<float>("threshold");
+      float scale = ctx.Attr<float>("scale");
+      float offset = ctx.Attr<float>("offset");
+      PADDLE_ENFORCE_EQ(threshold, 6.0f,
+                        platform::errors::External(
+                            "Not support threshold [%f] in XPU", threshold));
+      PADDLE_ENFORCE_EQ(
+          scale, 6.0f,
+          platform::errors::External("Not support scale [%f] in XPU", scale));
+      PADDLE_ENFORCE_EQ(
+          offset, 3.0f,
+          platform::errors::External("Not support offset [%f] in XPU", offset));
+      r = xpu::hard_swish_grad(xpu_context,
+                               reinterpret_cast<const float *>(x_data),
+                               reinterpret_cast<const float *>(y_data),
+                               reinterpret_cast<const float *>(y_grad),
+                               reinterpret_cast<float *>(x_grad), dX->numel());
+      break;
+    }
+    default: {
+      r = xpu::activation_backward(xpu_context, type, dX->numel(),
                                    reinterpret_cast<const float *>(x_data),
                                    reinterpret_cast<const float *>(y_data),
                                    reinterpret_cast<const float *>(y_grad),
                                    reinterpret_cast<float *>(x_grad));
+      break;
+    }
+  }
+
   PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                     platform::errors::External(
                         "XPU API return wrong value[%d], please check whether "
@@ -132,6 +185,8 @@ using XPULogFunctor = XPUActivationFunc<T, xpu::Activation_t::LOG>;
 template <typename T>
 using XPUSquareFunctor = XPUActivationFunc<T, xpu::Activation_t::SQUARE>;
 template <typename T>
+using XPUHardSwishFunctor = XPUActivationFunc<T, xpu::Activation_t::HARD_SWISH>;
+template <typename T>
 using XPUSuareGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQUARE>;
 template <typename T>
 using XPUReluGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::RELU>;
@@ -147,6 +202,9 @@ using XPUSqrtFunctor = XPUActivationFunc<T, xpu::Activation_t::SQRT>;
 template <typename T>
 using XPUSqrtGradFunctor = XPUActivationGradFunc<T, xpu::Activation_t::SQRT>;
 template <typename T>
+using XPUHardSwishGradFunctor =
+    XPUActivationGradFunc<T, xpu::Activation_t::HARD_SWISH>;
+template <typename T>
 using XPUACTPowFunctor = XPUActivationFunc<T, xpu::Activation_t::ACT_POW>;
 template <typename T>
 using XPUABSFunctor = XPUActivationFunc<T, xpu::Activation_t::ABS>;
@@ -169,6 +227,8 @@ REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, XPUSigmoidFunctor,
 REGISTER_ACTIVATION_XPU_KERNEL(gelu, XPUGeluFunctor, XPUGeluGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(sqrt, XPUSqrtFunctor, XPUSqrtGradFunctor)
 REGISTER_ACTIVATION_XPU_KERNEL(square, XPUSquareFunctor, XPUSuareGradFunctor)
+REGISTER_ACTIVATION_XPU_KERNEL(hard_swish, XPUHardSwishFunctor,
+                               XPUHardSwishGradFunctor)
 REGISTER_OP_XPU_KERNEL(log,
                        ops::XPUActivationKernel<ops::XPULogFunctor<float>>);
 REGISTER_OP_XPU_KERNEL(pow,

diff --git a/paddle/fluid/operators/controlflow/logical_op_xpu.h b/paddle/fluid/operators/controlflow/logical_op_xpu.h
@@ -0,0 +1,170 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#ifdef PADDLE_WITH_XPU
+#include <algorithm>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "xpu/refactor/math.h"
+
+namespace paddle {
+
+namespace operators {
+// Selects which binary logical primitive BinaryLogicalOpXPUKernel dispatches
+// to.
+typedef enum { XPU_OR, XPU_AND } XpuLogicalType;
+
+// Returns a human-readable name for `ty`, used when formatting log and error
+// messages.
+//
+// Declared `inline` because this header is included from several .cc files;
+// a non-inline definition in a header is emitted once per includer and the
+// copies collide at link time.
+inline std::string XpuLogicalType2Str(XpuLogicalType ty) {
+  switch (ty) {
+    case XpuLogicalType::XPU_OR:
+      return std::string("logical or");
+    case XpuLogicalType::XPU_AND:
+      return std::string("logical and");
+    default:
+      return std::string("unknown type");
+  }
+  // Not reachable (every switch path returns); kept so compilers that do not
+  // see through the switch still find a return on all paths.
+  return std::string("unknown");
+}
+
+// Element-wise binary logical kernel for XPU devices. The template argument
+// `xpu_type` selects between xpu::logical_or and xpu::logical_and.
+//
+// Inputs "X" and "Y" whose element count differs from that of "Out" are
+// first expanded into a temporary tensor via xpu::broadcast, then the logical
+// primitive is run over out->numel() elements.
+template <XpuLogicalType xpu_type, typename T>
+class BinaryLogicalOpXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* y = context.Input<framework::Tensor>("Y");
+    auto* out = context.Output<framework::Tensor>("Out");
+    T* out_ptr = out->mutable_data<T>(context.GetPlace());
+    const T* x_ptr = x->data<T>();
+    const T* y_ptr = y->data<T>();
+    auto& dev_ctx =
+        context.template device_context<paddle::platform::XPUDeviceContext>();
+    // Temporaries that hold the expanded operands; they must outlive the
+    // logical kernel launched below.
+    framework::Tensor broadcast_x;
+    framework::Tensor broadcast_y;
+    bool need_broad_cast = false;
+
+    // Returns a pointer to data with out->numel() elements for operand `in`:
+    // `in_ptr` itself when the element counts already match, otherwise the
+    // contents of `buffer` after broadcasting `in` into it.
+    auto expand_to_out = [&](const framework::Tensor& in, const T* in_ptr,
+                             framework::Tensor* buffer) -> const T* {
+      if (in.numel() == out->numel()) {
+        return in_ptr;
+      }
+      T* expanded = buffer->mutable_data<T>(context.GetPlace(), out->numel());
+      auto& out_dim = out->dims();
+      auto& in_dim = in.dims();
+      // NOTE(review): in_dim is indexed for every output axis, so this
+      // assumes the operand has the same rank as the output - confirm.
+      const int rank = out_dim.size();
+      std::vector<int> bcast_xdims;  // source shape handed to xpu::broadcast
+      std::vector<int> bcast_ydims;  // target shape handed to xpu::broadcast
+      for (int i = 0; i < rank; ++i) {
+        if (out_dim[i] == in_dim[i]) {
+          bcast_xdims.push_back(in_dim[i]);
+          bcast_ydims.push_back(in_dim[i]);
+          continue;
+        }
+        // A mismatching axis is split in two: a leading axis that grows from
+        // 1 to out/in, followed by the original extent.
+        bcast_xdims.push_back(1);
+        bcast_xdims.push_back(in_dim[i]);
+        bcast_ydims.push_back(out_dim[i] / in_dim[i]);
+        bcast_ydims.push_back(in_dim[i]);
+      }
+
+      // The data is handed to the device as int8_t; this assumes
+      // sizeof(T) == 1 (the registrations visible here use bool) - confirm
+      // before instantiating with a wider type.
+      int ret = xpu::broadcast<int8_t>(
+          dev_ctx.x_context(), reinterpret_cast<const int8_t*>(in_ptr),
+          reinterpret_cast<int8_t*>(expanded), bcast_xdims, bcast_ydims);
+      PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                        platform::errors::External(
+                            "XPU broadcast kernel return wrong value[%d %s]",
+                            ret, XPUAPIErrorMsg[ret]));
+      need_broad_cast = true;
+      return expanded;
+    };
+
+    x_ptr = expand_to_out(*x, x_ptr, &broadcast_x);
+    y_ptr = expand_to_out(*y, y_ptr, &broadcast_y);
+
+    // logical kernel
+    int ret = XPU_SUCCESS;
+    switch (xpu_type) {
+      case XpuLogicalType::XPU_OR:
+        ret = xpu::logical_or<bool>(dev_ctx.x_context(), x_ptr, y_ptr, out_ptr,
+                                    out->numel());
+        break;
+      case XpuLogicalType::XPU_AND:
+        ret = xpu::logical_and<bool>(dev_ctx.x_context(), x_ptr, y_ptr, out_ptr,
+                                     out->numel());
+        // Without this break every logical_and fell through into `default`
+        // and logged a bogus "not support" error.
+        break;
+      default:
+        LOG(ERROR) << "xpu not support logical xpu type = "
+                   << XpuLogicalType2Str(xpu_type);
+        break;
+    }
+    PADDLE_ENFORCE_EQ(
+        ret, XPU_SUCCESS,
+        platform::errors::External("XPU API return wrong value[%d %s] in "
+                                   "op_name[%s].",
+                                   ret, XPUAPIErrorMsg[ret],
+                                   XpuLogicalType2Str(xpu_type)));
+
+    // Presumably this keeps the broadcast temporaries alive until the device
+    // has finished with them when running on a stream - confirm.
+    if (need_broad_cast && dev_ctx.x_context()->xpu_stream != nullptr) {
+      xpu_wait();
+    }
+  }
+};
+
+// Element-wise logical NOT kernel for XPU devices: reads input "X" and writes
+// the result of xpu::logical_not to output "Out".
+template <typename T>
+class UnaryLogicalOpXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<framework::Tensor>("X");
+    auto* output = context.Output<framework::Tensor>("Out");
+
+    // An empty input leaves the output untouched (no allocation, no launch).
+    const auto element_count = input->numel();
+    if (element_count == 0) {
+      return;
+    }
+
+    T* result = output->mutable_data<T>(context.GetPlace());
+    auto& xpu_dev_ctx =
+        context.template device_context<paddle::platform::XPUDeviceContext>();
+    const T* source = input->data<T>();
+
+    int status = xpu::logical_not<bool>(xpu_dev_ctx.x_context(), source,
+                                        result, element_count);
+    PADDLE_ENFORCE_EQ(
+        status, XPU_SUCCESS,
+        platform::errors::External("XPU API return wrong value[%d %s].",
+                                   status, XPUAPIErrorMsg[status]));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+#endif
diff --git a/paddle/fluid/operators/controlflow/logicaland_op_xpu.cc b/paddle/fluid/operators/controlflow/logicaland_op_xpu.cc
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/operators/controlflow/logical_op_xpu.h"
+namespace ops = paddle::operators;
+// Registers the XPU kernel for the `logical_and` op: the binary logical
+// kernel instantiated with the AND variant and T = bool.
+REGISTER_OP_XPU_KERNEL(
+    logical_and,
+    ops::BinaryLogicalOpXPUKernel<ops::XpuLogicalType::XPU_AND, bool>);
+#endif
diff --git a/paddle/fluid/operators/controlflow/logicalnot_op_xpu.cc b/paddle/fluid/operators/controlflow/logicalnot_op_xpu.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/operators/controlflow/logical_op_xpu.h"
+namespace ops = paddle::operators;
+// Registers the XPU kernel for the `logical_not` op with T = bool. The op
+// name keeps its underscore, matching the sibling `logical_and` and
+// `logical_or` registrations; only the file names drop it.
+REGISTER_OP_XPU_KERNEL(logical_not, ops::UnaryLogicalOpXPUKernel<bool>);
+#endif
diff --git a/paddle/fluid/operators/controlflow/logicalor_op_xpu.cc b/paddle/fluid/operators/controlflow/logicalor_op_xpu.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+#include "paddle/fluid/operators/controlflow/logical_op_xpu.h"
+
+namespace ops = paddle::operators;
+// Registers the XPU kernel for the `logical_or` op: the binary logical
+// kernel instantiated with the OR variant and T = bool.
+REGISTER_OP_XPU_KERNEL(
+    logical_or,
+    ops::BinaryLogicalOpXPUKernel<ops::XpuLogicalType::XPU_OR, bool>);
+#endif