Optimize ERNIE inference performance on XPU
csy0225 committed Feb 9, 2023
1 parent 7e8ef32 commit 26cd92a
Showing 4 changed files with 72 additions and 14 deletions.
28 changes: 14 additions & 14 deletions paddle/fluid/framework/operator.cc
@@ -2552,16 +2552,20 @@ Scope* OperatorWithKernel::PrepareData(
       if (new_expected_kernel_key) {
         if (kernel_type_for_var.backend() == phi::Backend::GPU ||
             kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
+            kernel_type_for_var.backend() == phi::Backend::XPU ||
             new_expected_kernel_key->backend() == phi::Backend::GPU ||
-            new_expected_kernel_key->backend() == phi::Backend::GPUDNN) {
+            new_expected_kernel_key->backend() == phi::Backend::GPUDNN ||
+            new_expected_kernel_key->backend() == phi::Backend::XPU) {
           new_scope = TryCreateTransferScope(
               kernel_type_for_var, *new_expected_kernel_key, &scope);
           enable_cache_transfer_scope_ = true;
         }
       } else if (kernel_type_for_var.backend() == phi::Backend::GPU ||
                  kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
+                 kernel_type_for_var.backend() == phi::Backend::XPU ||
                  expected_kernel_key.backend() == phi::Backend::GPU ||
-                 expected_kernel_key.backend() == phi::Backend::GPUDNN) {
+                 expected_kernel_key.backend() == phi::Backend::GPUDNN ||
+                 expected_kernel_key.backend() == phi::Backend::XPU) {
         new_scope = TryCreateTransferScope(
             kernel_type_for_var, expected_kernel_key, &scope);
         enable_cache_transfer_scope_ = true;
@@ -2571,16 +2575,6 @@ Scope* OperatorWithKernel::PrepareData(
       if (!new_scope) {
         new_scope = &scope.NewScope();
       }
-      // For inference, if a gpu model has an op which could only run on CPU,
-      // each result of different input will be the same with the first one.
-      // The reason is that if a gpu tensor is the input of a cpu kernel,
-      // we will create a new cpu tensor in new scope.
-      // However, if enable_cache_runtime_context_, we get the cpu tensor each
-      // time, not the gpu tensor. Thus, we set pre_scope_ = nullptr
-      // to trigger `new RuntimeContext()` in RunImpl().
-      if (enable_cache_runtime_context_) {
-        pre_scope_ = nullptr;
-      }
 
       // Create new var with the same name in transfer scopes
       auto* trans_var = new_scope->Var(var_name);
@@ -2676,8 +2670,14 @@ Scope* OperatorWithKernel::PrepareData(
   // so disable prepare optimization conservatively.
   bool force_prepare_data = HasAttr("inference_force_prepare_data") &&
                             Attr<bool>("inference_force_prepare_data");
-  if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
-    need_prepare_data_ = false;
+  if (enable_cache_runtime_context_) {
+    if ((pre_scope_ == &scope && !force_prepare_data)) {
+      need_prepare_data_ = false;
+    }
+  } else {
+    if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
+      need_prepare_data_ = false;
+    }
   }
 
   return new_scope;
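
The PrepareData change above is the core of the optimization: when the runtime context is cached (the inference path), the transfer scope is cached as well, so creating a transfer scope no longer forces data re-preparation on every run. A minimal sketch of the resulting skip condition, factored into a standalone predicate purely for illustration (the helper name is invented, not part of the patch):

    // Sketch: when PrepareData may be skipped on subsequent runs.
    bool CanSkipPrepareData(bool cache_runtime_context, bool same_scope,
                            bool created_transfer_scope, bool force_prepare) {
      if (cache_runtime_context) {
        // Cached runtime context: the cached transfer scope is reused, so a
        // created transfer scope no longer forces re-preparation.
        return same_scope && !force_prepare;
      }
      return same_scope && !created_transfer_scope && !force_prepare;
    }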
42 changes: 42 additions & 0 deletions paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -212,6 +212,43 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToCustomDevice(
   }
 #endif
 
+#ifdef PADDLE_WITH_XPU
+void IrParamsSyncAmongDevicesPass::CopyParamsToXpu(Argument *argument) {
+  if (!argument->use_xpu()) return;
+
+  PADDLE_ENFORCE_EQ(argument->xpu_device_id_valid(),
+                    true,
+                    platform::errors::PreconditionNotMet(
+                        "The xpu_device_id field should be valid"));
+
+  LOG(INFO) << "Sync params from CPU to XPU: "
+            << "xpu_device_id - " << argument->xpu_device_id();
+
+  platform::Place place = platform::XPUPlace(argument->xpu_device_id());
+  auto *scope = argument->scope_ptr();
+  std::vector<std::string> all_vars = scope->LocalVarNames();
+
+  for (auto &var_name : all_vars) {
+    auto *var = scope->FindLocalVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::PreconditionNotMet("The var should not be nullptr"));
+
+    if (var->IsType<phi::DenseTensor>()) {
+      auto *t = var->GetMutable<phi::DenseTensor>();
+
+      platform::CPUPlace cpu_place;
+      phi::DenseTensor temp_tensor;
+      temp_tensor.Resize(t->dims());
+
+      paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
+      t->clear();
+      paddle::framework::TensorCopySync(temp_tensor, place, t);
+    }
+  }
+}
+#endif
+
 void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   PADDLE_ENFORCE_EQ(
       argument->scope_valid(),
@@ -231,6 +268,11 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   if (argument->use_custom_device_valid()) {
     CopyParamsToCustomDevice(argument);
   }
 #endif
+#ifdef PADDLE_WITH_XPU
+  if (argument->use_xpu_valid()) {
+    CopyParamsToXpu(argument);
+  }
+#endif
   paddle::memory::Release(platform::CPUPlace());
 }
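
The per-tensor copy above stages through a CPU temporary because the source and destination are the same variable: the parameter is copied aside, cleared, and then copied back into itself at the XPU place. Condensed as a sketch, with an invented helper name for illustration:

    // Sketch of the staging pattern CopyParamsToXpu applies to each tensor.
    void MoveParamToXpu(phi::DenseTensor *t, const platform::Place &xpu_place) {
      phi::DenseTensor staging;
      staging.Resize(t->dims());
      // The parameter still lives on CPU, so this first copy is CPU-to-CPU.
      paddle::framework::TensorCopySync(*t, platform::CPUPlace(), &staging);
      t->clear();  // drop the old allocation so the next copy re-places it
      paddle::framework::TensorCopySync(staging, xpu_place, t);
    }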
4 changes: 4 additions & 0 deletions paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -46,6 +46,10 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   void CopyParamsToCustomDevice(Argument *argument);
 #endif
+
+#ifdef PADDLE_WITH_XPU
+  void CopyParamsToXpu(Argument *argument);
+#endif
 };
 
 }  // namespace analysis
12 changes: 12 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -1256,6 +1256,18 @@ void AnalysisPredictor::PrepareArgument() {
   }
 #endif
 
+#ifdef PADDLE_WITH_XPU
+  argument_->SetUseXpu(config_.use_xpu_);
+  argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
+  argument_->SetXpuLocked(config_.xpu_locked_);
+  argument_->SetXpuAutotune(config_.xpu_autotune_);
+  argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
+  argument_->SetXpuPrecision(config_.xpu_precision_);
+  argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
+  argument_->SetXpuDeviceId(config_.xpu_device_id_);
+  argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
+#endif
+
   auto *pass_builder = config_.pass_builder();
   // TODO(inference): Need to reconstruct the pass_builder, pass should be
   // processed in a single
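
These setters forward the user-facing XPU options into the analysis Argument, which is what lets the new CopyParamsToXpu pass run. A minimal usage sketch, assuming the standard paddle_infer C++ API (the model path and workspace size are illustrative values only):

    #include "paddle_inference_api.h"

    int main() {
      paddle_infer::Config config;
      config.SetModel("ernie_model_dir");  // illustrative path
      // EnableXpu sets the xpu_* fields that PrepareArgument forwards above;
      // 16 MB of L3 workspace is an arbitrary example value.
      config.EnableXpu(/*l3_workspace_size=*/16 * 1024 * 1024);
      auto predictor = paddle_infer::CreatePredictor(config);
      return 0;
    }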
