diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index ab43de2fbad09..3787c61b304bf 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1618,6 +1618,57 @@ void OperatorWithKernel::CheckWhetherPreparePhiData(
   }
 }
 
+// When do we need to reset the runtime context?
+// 1. When the runtime context cache is enabled and the program runs for the
+//    first time, runtime_ctx_.get() == nullptr, so a new runtime context
+//    needs to be created.
+// 2. When the cache is enabled and the program is not running for the first
+//    time, but the input shape or tensor layout of the operator has changed,
+//    the runtime context stored in the cache cannot be used and a new one
+//    needs to be created.
+bool OperatorWithKernel::NeedResetRuntimeContext(const Scope& scope) const {
+  if (runtime_ctx_.get() == nullptr) return true;
+  const auto& name_map = Inputs();
+  for (auto& var_name_item : name_map) {
+    auto& name_vec = var_name_item.second;
+    std::vector<Variable*>& cache_input_vars =
+        runtime_ctx_->inputs[var_name_item.first];
+    PADDLE_ENFORCE_EQ(
+        name_vec.size(),
+        cache_input_vars.size(),
+        platform::errors::InvalidArgument(
+            "The size of input variable names (%d) must be equal to "
+            "the size of cache input variable ptrs (%d).",
+            name_vec.size(),
+            cache_input_vars.size()));
+    for (size_t i = 0; i < name_vec.size(); i++) {
+      auto var_name = name_vec[i];
+      auto* cache_input_var = cache_input_vars[i];
+      if (!VarIsTensor(*cache_input_var)) continue;
+      auto* cache_input_tensor =
+          GetMutableLoDTensorOrSelectedRowsValueFromVar(cache_input_var);
+      auto cache_input_tensor_dims = cache_input_tensor->dims();
+      auto* current_input_var = scope.FindVar(var_name);
+      PADDLE_ENFORCE_NOT_NULL(
+          current_input_var,
+          platform::errors::NotFound(
+              "The variable %s is not found when "
+              "enable_cache_runtime_context_cache in origin scope.",
+              var_name));
+      auto* current_input_tensor =
+          GetMutableLoDTensorOrSelectedRowsValueFromVar(current_input_var);
+      auto current_input_tensor_dims = current_input_tensor->dims();
+      if (cache_input_tensor_dims != current_input_tensor_dims ||
+          NeedTransformLayout(current_input_tensor->layout(),
+                              cache_input_tensor->layout())) {
+        need_prepare_data_ = true;
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -1627,12 +1678,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   if (!all_kernels_must_compute_runtime_shape_ &&
       HasAttr(kAllKernelsMustComputeRuntimeShape))
     all_kernels_must_compute_runtime_shape_ = true;
-  const Scope* cur_scope = &scope;
   CheckWhetherPreparePhiData(Inputs(), Outputs(), scope);
   if (!enable_cache_runtime_context_) {
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);
-    pre_scope_ = cur_scope;
   } else if (run_phi_kernel_ && impl_ != nullptr && !need_prepare_data_ &&
              !need_prepare_phi_data_) {
     if (!all_kernels_must_compute_runtime_shape_ && impl_->NeedInferShape()) {
@@ -1640,12 +1689,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     }
     (*phi_kernel_)(impl_->getKernelContext());
   } else {
-    if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
+    if (NeedResetRuntimeContext(scope)) {
       std::lock_guard<std::mutex> lock(cache_update_mutex_);
-      if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) {
-        runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
-        pre_scope_ = cur_scope;
-      }
+      runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope));
     }
     RunImpl(scope, place, runtime_ctx_.get());
   }
@@ -2030,8 +2076,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // To solve issue #15032, have a discussion with @Luotao for cpu inference,
   // do not cache transfer scope, hence in this case delete transfer scope
   // after run to avoid memory leak
-  if (transfer_scope && !run_by_executor_ && !enable_cache_transfer_scope_) {
-    scope.DeleteScope(transfer_scope);
+  if (cache_transfer_scope_ && !run_by_executor_ &&
+      !enable_cache_transfer_scope_) {
+    scope.DeleteScope(cache_transfer_scope_);
   }
 }
 
@@ -2566,33 +2613,25 @@ Scope* OperatorWithKernel::PrepareData(
             kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
             new_expected_kernel_key->backend() == phi::Backend::GPU ||
             new_expected_kernel_key->backend() == phi::Backend::GPUDNN) {
-          new_scope = TryCreateTransferScope(
+          cache_transfer_scope_ = TryCreateTransferScope(
               kernel_type_for_var, *new_expected_kernel_key, &scope);
           enable_cache_transfer_scope_ = true;
+          new_scope = cache_transfer_scope_;
         }
       } else if (kernel_type_for_var.backend() == phi::Backend::GPU ||
                  kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
                  expected_kernel_key.backend() == phi::Backend::GPU ||
                  expected_kernel_key.backend() == phi::Backend::GPUDNN) {
-        new_scope = TryCreateTransferScope(
+        cache_transfer_scope_ = TryCreateTransferScope(
            kernel_type_for_var, expected_kernel_key, &scope);
        enable_cache_transfer_scope_ = true;
+        new_scope = cache_transfer_scope_;
      }
    }
 
    if (!new_scope) {
      new_scope = &scope.NewScope();
    }
-    // For inference, if a gpu model has an op which could only run on CPU,
-    // each result of different input will be the same with the first one.
-    // The reason is that if a gpu tensor is the input of a cpu kernel,
-    // we will create a new cpu tensor in new scope.
-    // However, if enable_cache_runtime_context_, we get the cpu tensor each
-    // time, not the gpu tensor. Thus, we set pre_scope_ = nullptr
-    // to trigger `new RuntimeContext()` in RunImpl().
-    if (enable_cache_runtime_context_) {
-      pre_scope_ = nullptr;
-    }
 
    // Create new var with the same name in transfer scopes
    auto* trans_var = new_scope->Var(var_name);
@@ -2678,18 +2717,13 @@ Scope* OperatorWithKernel::PrepareData(
     }
   }
 
-  // If pre_scope = &scope, it means that scope is cached and the op is not in
-  // while block. If new_scope = nullptr, it means that for each input of this
-  // Op, there is no need to do PrepareData. So PrepareData could be skipped at
-  // the rest iterations to save the elapsed time.
   // We do not support skipping PrepareData in while block, because the Op's
   // input may be changed by subsequent Ops, which may cause an error.
-
   // For inference, ops that behind conditional branch aren't supported well,
   // so disable prepare optimization conservatively.
   bool force_prepare_data = HasAttr("inference_force_prepare_data") &&
                             Attr<bool>("inference_force_prepare_data");
-  if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
+  if (enable_cache_runtime_context_ && !force_prepare_data) {
     need_prepare_data_ = false;
   }
 
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index f2de07db96df0..2bb5ca2c4a13f 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -781,18 +781,19 @@ class OperatorWithKernel : public OperatorBase {
   // used for IndicateOrPromoteVarDataTypes
   phi::DenseTensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
                                              const std::string& name) const;
+  bool NeedResetRuntimeContext(const Scope& scope) const;
 
 protected:
   mutable std::unique_ptr kernel_type_;
   mutable std::unique_ptr kernel_func_;
   mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
-  mutable const Scope* pre_scope_ = nullptr;
   mutable bool need_prepare_data_ = true;
   mutable bool need_prepare_phi_data_ = false;
   mutable bool enable_cache_runtime_context_ = false;
   mutable bool all_kernels_must_compute_runtime_shape_ = false;
   mutable std::mutex cache_update_mutex_;
   mutable bool enable_cache_transfer_scope_ = false;
+  mutable Scope* cache_transfer_scope_ = nullptr;
   // NOTE(jiahongyu): Whether fallback to plain kernel after calling
   // GetExpectedKernelType, use this bool flag to solve mkldnn and cudnn hard
   // code
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 7557daa49e2fd..e1b2f999d7daa 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -212,6 +212,43 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToCustomDevice(
 }
 #endif
+#ifdef PADDLE_WITH_XPU
+void IrParamsSyncAmongDevicesPass::CopyParamsToXpu(Argument *argument) {
+  if (!argument->use_xpu()) return;
+
+  PADDLE_ENFORCE_EQ(argument->xpu_device_id_valid(),
+                    true,
+                    platform::errors::PreconditionNotMet(
+                        "The xpu_device_id field should be valid"));
+
+  LOG(INFO) << "Sync params from CPU to XPU: "
+            << "xpu_device_id - " << argument->xpu_device_id();
+
+  platform::Place place = platform::XPUPlace(argument->xpu_device_id());
+  auto *scope = argument->scope_ptr();
+  std::vector<std::string> all_vars = scope->LocalVarNames();
+
+  for (auto &var_name : all_vars) {
+    auto *var = scope->FindLocalVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::PreconditionNotMet("The var should not be nullptr"));
+
+    if (var->IsType<phi::DenseTensor>()) {
+      auto *t = var->GetMutable<phi::DenseTensor>();
+
+      platform::CPUPlace cpu_place;
+      phi::DenseTensor temp_tensor;
+      temp_tensor.Resize(t->dims());
+
+      paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
+      t->clear();
+      paddle::framework::TensorCopySync(temp_tensor, place, t);
+    }
+  }
+}
+#endif
+
 void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   PADDLE_ENFORCE_EQ(
       argument->scope_valid(),
       true,
@@ -231,6 +268,11 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   if (argument->use_custom_device_valid()) {
     CopyParamsToCustomDevice(argument);
   }
+#endif
+#ifdef PADDLE_WITH_XPU
+  if (argument->use_xpu_valid()) {
+    CopyParamsToXpu(argument);
+  }
 #endif
   paddle::memory::Release(platform::CPUPlace());
 }
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
index bc91bd6a1aea1..3ffecc72a50f5 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -46,6 +46,10 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   void CopyParamsToCustomDevice(Argument *argument);
 #endif
+
+#ifdef PADDLE_WITH_XPU
+  void CopyParamsToXpu(Argument *argument);
+#endif
 };
 
 }  // namespace analysis
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index dfdbd63767c9e..a36eeb86ddb98 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1281,6 +1281,18 @@ void AnalysisPredictor::PrepareArgument() {
   }
 #endif
 
+#ifdef PADDLE_WITH_XPU
+  argument_->SetUseXpu(config_.use_xpu_);
+  argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
+  argument_->SetXpuLocked(config_.xpu_locked_);
+  argument_->SetXpuAutotune(config_.xpu_autotune_);
+  argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
+  argument_->SetXpuPrecision(config_.xpu_precision_);
+  argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
+  argument_->SetXpuDeviceId(config_.xpu_device_id_);
+  argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
+#endif
+
   auto *pass_builder = config_.pass_builder();
   // TODO(inference): Need to reconstruct the pass_builder, pass should be
   // processed in a single
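
Note on the cache-invalidation rule introduced in the operator.cc hunks: `NeedResetRuntimeContext` replaces the old `pre_scope_ != cur_scope` pointer comparison with a per-input check of dims and layout against the cached `RuntimeContext`. The sketch below is a minimal, self-contained illustration of that decision logic only; `TensorMeta` and `needs_reset` are hypothetical stand-ins rather than Paddle types, and the real implementation additionally sets `need_prepare_data_` and rebuilds the context under `cache_update_mutex_`.

```cpp
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for the metadata the real check compares
// (phi::DenseTensor dims and layout).
struct TensorMeta {
  std::vector<int64_t> dims;
  std::string layout;  // e.g. "NCHW" or "NHWC"
};

// Mirrors the decision in NeedResetRuntimeContext: the cached context is
// reused only while every cached input still matches the current scope's
// dims and layout; otherwise it must be rebuilt.
bool needs_reset(const std::unordered_map<std::string, TensorMeta>& cached,
                 const std::unordered_map<std::string, TensorMeta>& current) {
  if (cached.empty()) return true;  // first run: nothing cached yet
  for (const auto& [name, cached_meta] : cached) {
    auto it = current.find(name);
    if (it == current.end()) return true;  // input no longer in the scope
    if (it->second.dims != cached_meta.dims ||
        it->second.layout != cached_meta.layout) {
      return true;  // shape or layout changed since the context was cached
    }
  }
  return false;
}
```

Compared with the scope-pointer heuristic, this keys reuse directly to what actually invalidates the cached input pointers (input shape and layout), which is why the patch can drop `pre_scope_` entirely while keeping `need_prepare_data_` accurate.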