Commit

merge develop into branch
csy0225 committed Feb 22, 2023
2 parents 2c0f308 + 433c2ff commit 76c8ae1
Showing 249 changed files with 6,063 additions and 9,537 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -26,7 +26,8 @@ paddle/phi/api/lib/tensor_operants.cc
paddle/phi/extension.h
paddle/phi/include/*
paddle/phi/infermeta/generated.*

paddle/fluid/prim/api/generated_prim/*.cc
paddle/fluid/prim/api/generated_prim/*.h
*.DS_Store
*.vs
build/
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -8,7 +8,7 @@ set(XPU_API_LIB_NAME "libxpuapi.so")
set(XPU_RT_LIB_NAME "libxpurt.so")

set(XPU_BASE_DATE "20230220")
set(XPU_XCCL_BASE_VERSION "1.0.8")
set(XPU_XCCL_BASE_VERSION "1.0.9")

if(NOT DEFINED XPU_BASE_URL)
set(XPU_BASE_URL_WITHOUT_DATE
2 changes: 1 addition & 1 deletion paddle/CMakeLists.txt
@@ -33,7 +33,7 @@ add_subdirectory(ir)

# Next, (to be discusssed)
# (1) move all source files to same folder,
# (2) naturally, and and configure tests in only one `CMakeLists.txt`,
# (2) naturally, and configure tests in only one `CMakeLists.txt`,
# (3) cc tests support linking pre-built dynamic libraries. For example, use the dynamic
# library in the installed paddle by `pip`.

2 changes: 1 addition & 1 deletion paddle/fluid/eager/CMakeLists.txt
@@ -77,6 +77,6 @@ cc_library(
op_registry
variable_helper
memcpy
scale_op
generated_op
autograd_meta
hook_utils)
2 changes: 1 addition & 1 deletion paddle/fluid/eager/tests/performance_tests/CMakeLists.txt
@@ -7,7 +7,7 @@ if(NOT (NOT WITH_PYTHON AND ON_INFER))
${generated_deps}
eager_scale
scale_node
scale_op
generated_op
matmul_v2_op
dygraph_function
eager_prim_api)
4 changes: 2 additions & 2 deletions paddle/fluid/framework/CMakeLists.txt
@@ -1051,7 +1051,7 @@ if(WITH_PSCORE)
heter_pipeline_trainer_test
SRCS heter_pipeline_trainer_test.cc
DEPS conditional_block_op
scale_op
generated_op
heter_listen_and_serv_op
executor
heter_server
@@ -1068,7 +1068,7 @@ if(WITH_PSCORE)
heter_pipeline_trainer_test
SRCS heter_pipeline_trainer_test.cc
DEPS conditional_block_op
scale_op
generated_op
heter_listen_and_serv_op
executor
heter_server
2 changes: 1 addition & 1 deletion paddle/fluid/framework/data_set.h
@@ -156,7 +156,7 @@ class Dataset {
virtual void DestroyPreLoadReaders() = 0;
// set preload thread num
virtual void SetPreLoadThreadNum(int thread_num) = 0;
// seperate train thread and dataset thread
// separate train thread and dataset thread
virtual void DynamicAdjustChannelNum(int channel_num,
bool discard_remaining_ins = false) = 0;
virtual void DynamicAdjustReadersNum(int thread_num) = 0;
1 change: 1 addition & 0 deletions paddle/fluid/framework/ir/CMakeLists.txt
@@ -227,6 +227,7 @@ if(WITH_XPU)
${XPU_PASS_DEPS})
pass_library(multi_encoder_xpu_slice_fuse_pass inference DIR xpu)
pass_library(generate_sequence_xpu_fuse_pass inference DIR xpu)
pass_library(link_xpu_op_max_pass inference DIR xpu)
endif()

cc_library(
@@ -76,5 +76,5 @@ cc_library(
cc_test(
test_reference_count_pass_last_lived_ops
SRCS test_reference_count_pass_last_lived_ops.cc
DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op
DEPS parallel_executor elementwise_mul_op elementwise_add_op generated_op
eigen_function)
41 changes: 25 additions & 16 deletions paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
@@ -159,9 +159,9 @@ Fused subgraph:
\ | / |
\ | / |
fc_xpu-----------
|
|
act_out
| \
| \
act_out out_max
*/
class FcXPUFusePass : public FusePassBase {
protected:
@@ -185,6 +185,7 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph) const {
for (auto act_type : {
"relu",
"gelu",
"tanh",
"",
}) {
ApplyImpl(graph, mul_type, with_bias, act_type);
@@ -244,6 +245,18 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
QuantWeight<int16_t>(mul_w_tensor, mul_w_max_tensor, !transpose_w);
}

std::string fc_out_name;
if (act_out) {
fc_out_name = act_out->Name();
} else if (add_out) {
fc_out_name = add_out->Name();
} else {
fc_out_name = mul_out->Name();
}
std::string fc_out_max_name = fc_out_name + "_max";
VarDesc fc_out_max_desc(fc_out_max_name);
Node* fc_out_max = graph->CreateVarNode(&fc_out_max_desc);

// Generate fc_xpu op
framework::OpDesc fc_xpu_op_desc(block);
fc_xpu_op_desc.SetType("fc_xpu");
@@ -282,25 +295,21 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
"act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope")));
}
}
if (act_out) {
fc_xpu_op_desc.SetOutput("out", {act_out->Name()});
} else if (add_out) {
fc_xpu_op_desc.SetOutput("out", {add_out->Name()});
} else {
fc_xpu_op_desc.SetOutput("out", {mul_out->Name()});
}
fc_xpu_op_desc.SetOutput("out", {fc_out_name});
fc_xpu_op_desc.SetOutput("out_max", {fc_out_max_name});
auto* fc_xpu = graph->CreateOpNode(&fc_xpu_op_desc);
SAFE_IR_NODE_LINK_TO(mul_x, fc_xpu);
SAFE_IR_NODE_LINK_TO(mul_w, fc_xpu);
SAFE_IR_NODE_LINK_TO(mul_w_max, fc_xpu);
IR_NODE_LINK_TO(mul_x, fc_xpu);
IR_NODE_LINK_TO(mul_w, fc_xpu);
IR_NODE_LINK_TO(mul_w_max, fc_xpu);
SAFE_IR_NODE_LINK_TO(bias, fc_xpu);
if (act_out) {
SAFE_IR_NODE_LINK_TO(fc_xpu, act_out);
IR_NODE_LINK_TO(fc_xpu, act_out);
} else if (add_out) {
SAFE_IR_NODE_LINK_TO(fc_xpu, add_out);
IR_NODE_LINK_TO(fc_xpu, add_out);
} else {
SAFE_IR_NODE_LINK_TO(fc_xpu, mul_out);
IR_NODE_LINK_TO(fc_xpu, mul_out);
}
IR_NODE_LINK_TO(fc_xpu, fc_out_max);

// delete useless node
std::unordered_set<const Node*> delete_nodes;
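Editor's note (not part of the commit): the hunk above gives the fused fc_xpu op a companion max tensor, picking the op's final output name from act_out, add_out, or mul_out and suffixing it with "_max". Below is a minimal standalone sketch of that selection and naming convention, with plain strings standing in for the IR nodes (all names hypothetical).

// Editor's sketch only; mirrors the fallback order used by the pass above.
#include <iostream>
#include <string>

std::string PickFcOutName(const std::string& act_out,
                          const std::string& add_out,
                          const std::string& mul_out) {
  // Prefer the activation output, then the bias-add output, then the raw
  // matmul output -- the same fallback order as fc_xpu_fuse_pass.
  if (!act_out.empty()) return act_out;
  if (!add_out.empty()) return add_out;
  return mul_out;
}

int main() {
  std::string fc_out = PickFcOutName("relu_0.tmp_0", "", "mul_0.tmp_0");
  std::string fc_out_max = fc_out + "_max";  // companion max tensor name
  std::cout << fc_out << " / " << fc_out_max << std::endl;
  return 0;
}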
145 changes: 145 additions & 0 deletions paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc
@@ -0,0 +1,145 @@
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"

namespace phi {
class DenseTensor;
} // namespace phi

namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle

namespace paddle {
namespace framework {
namespace ir {
namespace patterns {

struct FusionXPUOpPattern : public PatternBase {
FusionXPUOpPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& op_type);

// declare operator node's name
PATTERN_DECL_NODE(fusion_op);
// declare variable node's name
PATTERN_DECL_NODE(out);
PATTERN_DECL_NODE(out_max);

private:
std::string op_type_;
};

FusionXPUOpPattern::FusionXPUOpPattern(PDPattern* pattern,
const std::string& name_scope,
const std::string& op_type)
: PatternBase(pattern, name_scope, name_scope), op_type_(op_type) {
auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op(op_type_);
auto* out = pattern->NewNode(out_repr())
->assert_is_op_output(op_type_, "out")
->assert_var_not_persistable();
auto* out_max = pattern->NewNode(out_max_repr())
->assert_is_op_output(op_type_, "out_max")
->assert_var_not_persistable();
fusion_op->LinksTo({out, out_max});
}

} // namespace patterns

class LinkXPUOpMaxPass : public FusePassBase {
protected:
void ApplyImpl(ir::Graph* graph) const override;

private:
void ApplyImpl(ir::Graph* graph, const std::string& op_type) const;

const std::string name_scope_{"multi_encoder_xpu_slice_fuse_pass"};
// ops with x_max/out_max
std::set<std::string> op_types_{"fc_xpu", "conv2d_xpu"};
};

/*
Origin subgraph:
fusion_xpu_op0
/ \
| |
out0 out0_max
|
\
fusion_xpu_op1
Fused subgraph:
fusion_xpu_op0
/ \
| |
out0 out0_max
| |
\ /
fusion_xpu_op1
*/
void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph) const {
Init(name_scope_, graph);
for (auto op_type : op_types_) {
ApplyImpl(graph, op_type);
}
}

void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph,
const std::string& op_type) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::PreconditionNotMet("graph should not be null."));
GraphPatternDetector gpd;
patterns::FusionXPUOpPattern pattern(
gpd.mutable_pattern(), name_scope_, op_type);

int found_subgraph_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* graph) {
VLOG(4) << "handle LinkXPUOpMaxPass fuse";
GET_IR_NODE(fusion_op);
GET_IR_NODE(out);
GET_IR_NODE(out_max);
for (auto next_op : out->outputs) {
auto* next_op_desc = next_op->Op();
if (op_types_.count(next_op_desc->Type()) == 0) continue;
next_op_desc->SetInput("x_max", {out_max->Name()});
IR_NODE_LINK_TO(out_max, next_op);
found_subgraph_count++;
}
};

gpd(graph, handler);
AddStatis(found_subgraph_count);
}

} // namespace ir
} // namespace framework
} // namespace paddle

REGISTER_PASS(link_xpu_op_max_pass, paddle::framework::ir::LinkXPUOpMaxPass);

REGISTER_PASS_CAPABILITY(link_xpu_op_max_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().EQ(
"fc_xpu", 0));
@@ -759,12 +759,22 @@ bool BuildOpFuncList(const platform::Place& place,
op_with_kernel->Type())) {
auto phi_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx);
auto phi_kernel_name = op_with_kernel->PhiKernelSignature()->name;

if (op_with_kernel->PhiKernel()->IsValid()) {
bool in_custom_back_list = false;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
in_custom_back_list =
phi::backends::custom_device::is_in_custom_black_list(
phi_kernel_name);
#endif
if (op_with_kernel->PhiKernel()->IsValid() && !in_custom_back_list) {
run_phi_kernel = true;
} else {
if (!op_with_kernel->SupportsKernelType(expected_kernel_key,
exec_ctx)) {
if ((!op_with_kernel->SupportsKernelType(expected_kernel_key,
exec_ctx)) ||
in_custom_back_list) {
std::string info = in_custom_back_list ? "fluid in black list "
: "fluid missing ";
VLOG(3) << info << phi_kernel_key
<< " kernel: " << phi_kernel_name;
auto phi_cpu_kernel_key =
FallBackToCpu(phi_kernel_key, *op_with_kernel);
op_with_kernel->ResetPhiKernel(
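Editor's note (not part of the commit): the hunk above skips an otherwise valid phi kernel and falls back to CPU when the kernel name appears in the custom-device black list. A self-contained sketch of the check's semantics follows; the real helper is phi::backends::custom_device::is_in_custom_black_list(), and the CUSTOM_DEVICE_BLACK_LIST environment variable name is an assumption used here only for illustration.

// Editor's sketch: re-implementation of the black-list membership check.
#include <cstdlib>
#include <sstream>
#include <string>
#include <unordered_set>

static std::unordered_set<std::string> LoadBlackList() {
  std::unordered_set<std::string> ops;
  if (const char* env = std::getenv("CUSTOM_DEVICE_BLACK_LIST")) {
    std::stringstream ss(env);
    std::string item;
    while (std::getline(ss, item, ',')) {
      if (!item.empty()) ops.insert(item);  // e.g. "matmul_v2,softmax"
    }
  }
  return ops;
}

bool IsInCustomBlackList(const std::string& kernel_name) {
  static const auto kBlackList = LoadBlackList();
  return kBlackList.count(kernel_name) > 0;
}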
2 changes: 1 addition & 1 deletion paddle/fluid/framework/op_desc.h
@@ -219,7 +219,7 @@ class OpDesc {
return ret_val;
}

// it it really needed? or just maintain a ptr from block?
// Is it really needed? Or just maintain a ptr from the block?
proto::OpDesc desc_;
BlockDesc *block_{nullptr}; // not_own
// input arg name => input variable names
16 changes: 13 additions & 3 deletions paddle/fluid/framework/operator.cc
@@ -1888,7 +1888,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
bool is_xpu_kp_support = (use_xpu_kp_kernel_rt || use_xpu_kp_kernel_debug);
#endif

if (phi_kernel_->IsValid()
bool in_custom_back_list = false;
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
in_custom_back_list =
phi::backends::custom_device::is_in_custom_black_list(phi_kernel_name);
#endif
if (phi_kernel_->IsValid() && !in_custom_back_list
#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
&& !is_xpu_unsupport
#endif
@@ -1909,7 +1914,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
kernel_type_->library_type_ = LibraryType::kKP;
}
#endif

if (kernels_iter == all_op_kernels.end() ||
kernels_iter->second.find(*kernel_type_.get()) ==
kernels_iter->second.end()
@@ -1918,9 +1922,15 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
#endif
#if defined(PADDLE_WITH_XPU_KP)
|| (is_xpu_unsupport && !is_xpu_kp_support)
#endif
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
|| in_custom_back_list
#endif
) {
fallback_to_cpu = true;
if (in_custom_back_list) {
VLOG(3) << "fluid in black list: " << phi_kernel_name;
}
auto phi_cpu_kernel_key = FallBackToCpu(phi_kernel_key, *this);
phi_kernel_.reset(
new phi::Kernel(phi::KernelFactory::Instance().SelectKernel(
@@ -3492,7 +3502,7 @@ void OperatorWithKernel::BuildPhiKernelContext(
// we try to add these Attrs to the RuntimeAttrs, but these OpDesc will lose
// the RuntimeAttrs information in the process of converting the Graph to
// the Program, so additional record configuration will be introduced,
// which increases the The cost of development and understanding, so we
// which increases the cost of development and understanding, so we
// still use Attrs to get and the attributes set by these passes from Attrs
// for the time being. In the future, it is necessary to clarify the
// positioning of RuntimeAttrs and expand related functions.
2 changes: 1 addition & 1 deletion paddle/fluid/framework/var_desc.h
@@ -210,7 +210,7 @@ class VarDesc {
proto::VarType::TensorDesc *mutable_tensor_desc();
std::vector<proto::VarType::TensorDesc *> mutable_tensor_descs();

// it it really needed? or just mantain a ptr from block?
// Is it really needed? Or just mantain a ptr from the block?
proto::VarDesc desc_;
AttributeMap attrs_;

(Diffs for the remaining changed files are not shown.)
