Optimize the ernie inference performance on xpu backend. (#50357)
* Optimize the ernie inference performance on xpu

* fix enable runtime cache logic

* When an op's input shape has changed, a new runtime context should be created

* fix

* set flag when input shape has changed
csy0225 authored Feb 21, 2023
1 parent 1e7dc9c commit b39afb1
Showing 5 changed files with 121 additions and 28 deletions.
88 changes: 61 additions & 27 deletions paddle/fluid/framework/operator.cc
@@ -1618,6 +1618,57 @@ void OperatorWithKernel::CheckWhetherPreparePhiData(
}
}

// When do we need to reset the runtime context?
// 1. When the runtime context cache is enabled and the program runs for the
//    first time, runtime_ctx_.get() == nullptr, so a new runtime context must
//    be created.
// 2. When the runtime context cache is enabled and the program is not running
//    for the first time, but the input shape or tensor layout of the operator
//    has changed, the cached runtime context cannot be reused and a new one
//    must be created.
bool OperatorWithKernel::NeedResetRuntimeContext(const Scope& scope) const {
  if (runtime_ctx_.get() == nullptr) return true;
  const auto& name_map = Inputs();
  for (auto& var_name_item : name_map) {
    auto& name_vec = var_name_item.second;
    std::vector<Variable*>& cache_input_vars =
        runtime_ctx_->inputs[var_name_item.first];
    PADDLE_ENFORCE_EQ(
        name_vec.size(),
        cache_input_vars.size(),
        platform::errors::InvalidArgument(
            "The size of input variable names (%d) must be equal to "
            "the size of cache input variable ptrs (%d).",
            name_vec.size(),
            cache_input_vars.size()));
    for (size_t i = 0; i < name_vec.size(); i++) {
      auto var_name = name_vec[i];
      auto* cache_input_var = cache_input_vars[i];
      if (!VarIsTensor(*cache_input_var)) continue;
      auto* cache_input_tensor =
          GetMutableLoDTensorOrSelectedRowsValueFromVar(cache_input_var);
      auto cache_input_tensor_dims = cache_input_tensor->dims();
      auto* current_input_var = scope.FindVar(var_name);
      PADDLE_ENFORCE_NOT_NULL(
          current_input_var,
          platform::errors::NotFound(
              "The variable %s is not found in the original scope when "
              "enable_cache_runtime_context_ is enabled.",
              var_name));
      auto* current_input_tensor =
          GetMutableLoDTensorOrSelectedRowsValueFromVar(current_input_var);
      auto current_input_tensor_dims = current_input_tensor->dims();
      if (cache_input_tensor_dims != current_input_tensor_dims ||
          NeedTransformLayout(current_input_tensor->layout(),
                              cache_input_tensor->layout())) {
        need_prepare_data_ = true;
        return true;
      }
    }
  }
  return false;
}

void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
// To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -1627,25 +1678,20 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
if (!all_kernels_must_compute_runtime_shape_ &&
HasAttr(kAllKernelsMustComputeRuntimeShape))
all_kernels_must_compute_runtime_shape_ = true;
const Scope* cur_scope = &scope;
CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
if (!enable_cache_runtime_context_) {
RuntimeContext ctx(Inputs(), Outputs(), scope);
RunImpl(scope, place, &ctx);
pre_scope_ = cur_scope;
} else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
!need_prepare_phi_data_) {
if (!all_kernels_must_compute_runtime_shape_ && impl_->NeedInferShape()) {
this->Info().infer_shape_(impl_->getRuntimeInferShapeContext());
}
(*phi_kernel_)(impl_->getKernelContext());
} else {
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
if (NeedResetRuntimeContext(scope)) {
std::lock_guard<std::mutex> lock(cache_update_mutex_);
if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
pre_scope_ = cur_scope;
}
runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
}
RunImpl(scope, place, runtime_ctx_.get());
}
@@ -2030,8 +2076,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// To solve issue #15032 (after a discussion with @Luotao): for CPU inference,
// do not cache the transfer scope; in this case, delete the transfer scope
// after the run to avoid a memory leak.
if (transfer_scope && !run_by_executor_ && !enable_cache_transfer_scope_) {
scope.DeleteScope(transfer_scope);
if (cache_transfer_scope_ && !run_by_executor_ &&
!enable_cache_transfer_scope_) {
scope.DeleteScope(cache_transfer_scope_);
}
}

@@ -2566,33 +2613,25 @@ Scope* OperatorWithKernel::PrepareData(
kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
new_expected_kernel_key->backend() == phi::Backend::GPU ||
new_expected_kernel_key->backend() == phi::Backend::GPUDNN) {
new_scope = TryCreateTransferScope(
cache_transfer_scope_ = TryCreateTransferScope(
kernel_type_for_var, *new_expected_kernel_key, &scope);
enable_cache_transfer_scope_ = true;
new_scope = cache_transfer_scope_;
}
} else if (kernel_type_for_var.backend() == phi::Backend::GPU ||
kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
expected_kernel_key.backend() == phi::Backend::GPU ||
expected_kernel_key.backend() == phi::Backend::GPUDNN) {
new_scope = TryCreateTransferScope(
cache_transfer_scope_ = TryCreateTransferScope(
kernel_type_for_var, expected_kernel_key, &scope);
enable_cache_transfer_scope_ = true;
new_scope = cache_transfer_scope_;
}
}

if (!new_scope) {
new_scope = &scope.NewScope();
}
// For inference, if a gpu model has an op which could only run on CPU,
// each result of different input will be the same with the first one.
// The reason is that if a gpu tensor is the input of a cpu kernel,
// we will create a new cpu tensor in new scope.
// However, if enable_cache_runtime_context_, we get the cpu tensor each
// time, not the gpu tensor. Thus, we set pre_scope_ = nullptr
// to trigger `new RuntimeContext()` in RunImpl().
if (enable_cache_runtime_context_) {
pre_scope_ = nullptr;
}

// Create new var with the same name in transfer scopes
auto* trans_var = new_scope->Var(var_name);
@@ -2678,18 +2717,13 @@ Scope* OperatorWithKernel::PrepareData(
}
}

// If pre_scope = &scope, it means that scope is cached and the op is not in
// while block. If new_scope = nullptr, it means that for each input of this
// Op, there is no need to do PrepareData. So PrepareData could be skipped at
// the rest iterations to save the elapsed time.
// We do not support skipping PrepareData in while block, because the Op's
// input may be changed by subsequent Ops, which may cause an error.

// For inference, ops behind a conditional branch aren't supported well,
// so conservatively disable the prepare-data optimization for them.
bool force_prepare_data = HasAttr("inference_force_prepare_data") &&
Attr<bool>("inference_force_prepare_data");
if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
if (enable_cache_runtime_context_ && !force_prepare_data) {
need_prepare_data_ = false;
}

3 changes: 2 additions & 1 deletion paddle/fluid/framework/operator.h
@@ -781,18 +781,19 @@ class OperatorWithKernel : public OperatorBase {
// used for IndicateOrPromoteVarDataTypes
phi::DenseTensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
const std::string& name) const;
bool NeedResetRuntimeContext(const Scope& scope) const;

protected:
mutable std::unique_ptr<OpKernelType> kernel_type_;
mutable std::unique_ptr<OpKernelFunc> kernel_func_;
mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
mutable const Scope* pre_scope_ = nullptr;
mutable bool need_prepare_data_ = true;
mutable bool need_prepare_phi_data_ = false;
mutable bool enable_cache_runtime_context_ = false;
mutable bool all_kernels_must_compute_runtime_shape_ = false;
mutable std::mutex cache_update_mutex_;
mutable bool enable_cache_transfer_scope_ = false;
mutable Scope* cache_transfer_scope_ = nullptr;
// NOTE(jiahongyu): Whether fallback to plain kernel after calling
// GetExpectedKernelType, use this bool flag to solve mkldnn and cudnn hard
// code
42 changes: 42 additions & 0 deletions paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -212,6 +212,43 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToCustomDevice(
}
#endif

#ifdef PADDLE_WITH_XPU
void IrParamsSyncAmongDevicesPass::CopyParamsToXpu(Argument *argument) {
  if (!argument->use_xpu()) return;

  PADDLE_ENFORCE_EQ(argument->xpu_device_id_valid(),
                    true,
                    platform::errors::PreconditionNotMet(
                        "The xpu_device_id field should be valid"));

  LOG(INFO) << "Sync params from CPU to XPU: "
            << "xpu_device_id - " << argument->xpu_device_id();

  platform::Place place = platform::XPUPlace(argument->xpu_device_id());
  auto *scope = argument->scope_ptr();
  std::vector<std::string> all_vars = scope->LocalVarNames();

  for (auto &var_name : all_vars) {
    auto *var = scope->FindLocalVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(
        var,
        platform::errors::PreconditionNotMet("The var should not be nullptr"));

    if (var->IsType<phi::DenseTensor>()) {
      auto *t = var->GetMutable<phi::DenseTensor>();

      // Stage the parameter in a temporary CPU tensor, free the original
      // buffer, then copy it back into the same variable on the XPU place.
      platform::CPUPlace cpu_place;
      phi::DenseTensor temp_tensor;
      temp_tensor.Resize(t->dims());

      paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
      t->clear();
      paddle::framework::TensorCopySync(temp_tensor, place, t);
    }
  }
}
#endif

void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE_EQ(
argument->scope_valid(),
@@ -231,6 +268,11 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
if (argument->use_custom_device_valid()) {
CopyParamsToCustomDevice(argument);
}
#endif
#ifdef PADDLE_WITH_XPU
if (argument->use_xpu_valid()) {
CopyParamsToXpu(argument);
}
#endif
paddle::memory::Release(platform::CPUPlace());
}
4 changes: 4 additions & 0 deletions paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -46,6 +46,10 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
void CopyParamsToCustomDevice(Argument *argument);
#endif

#ifdef PADDLE_WITH_XPU
void CopyParamsToXpu(Argument *argument);
#endif
};

} // namespace analysis
12 changes: 12 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -1281,6 +1281,18 @@ void AnalysisPredictor::PrepareArgument() {
}
#endif

#ifdef PADDLE_WITH_XPU
argument_->SetUseXpu(config_.use_xpu_);
argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
argument_->SetXpuLocked(config_.xpu_locked_);
argument_->SetXpuAutotune(config_.xpu_autotune_);
argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
argument_->SetXpuPrecision(config_.xpu_precision_);
argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
argument_->SetXpuDeviceId(config_.xpu_device_id_);
argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
#endif

auto *pass_builder = config_.pass_builder();
// TODO(inference): Need to reconstruct the pass_builder, pass should be
// processed in a single
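The XPU fields that AnalysisPredictor::PrepareArgument now forwards to the analysis Argument come from the user-facing inference Config. For reference only, here is a minimal, hedged C++ sketch of how a caller might switch an ERNIE model onto the XPU backend so the parameter-sync pass and the runtime-context reset above are exercised. The exact EnableXpu signature and defaults vary across Paddle releases, the include path depends on the install layout, and the model paths and single-input feeding are placeholders.

// Minimal sketch (not part of this commit); assumptions noted inline.
#include <cstdint>
#include <vector>

#include "paddle_inference_api.h"  // adjust the include path to your install

int main() {
  paddle_infer::Config config;
  config.SetModel("ernie/model.pdmodel", "ernie/model.pdiparams");  // placeholder paths
  config.EnableXpu(/*l3_workspace_size=*/16 * 1024 * 1024);  // populates use_xpu_, xpu_l3_workspace_size_, ...
  config.SetXpuDeviceId(0);    // populates xpu_device_id_, consumed by CopyParamsToXpu
  config.SwitchIrOptim(true);  // run analysis passes, incl. ir_params_sync_among_devices_pass

  auto predictor = paddle_infer::CreatePredictor(config);

  // Run twice with different sequence lengths: with the fix above, the changed
  // input shape triggers a fresh RuntimeContext instead of reusing a stale one.
  // Only the first input is fed for brevity; a real ERNIE model needs all inputs.
  for (int seq_len : {64, 128}) {
    auto ids = predictor->GetInputHandle(predictor->GetInputNames()[0]);
    std::vector<int64_t> data(seq_len, 1);
    ids->Reshape({1, seq_len});
    ids->CopyFromCpu(data.data());
    predictor->Run();
  }
  return 0;
}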
