Optimize ERNIE inference performance on XPU
csy0225 committed Feb 9, 2023
1 parent 7e8ef32 commit 26cd92a
Showing 4 changed files with 72 additions and 14 deletions.
28 changes: 14 additions & 14 deletions paddle/fluid/framework/operator.cc
@@ -2552,16 +2552,20 @@ Scope* OperatorWithKernel::PrepareData(
       if (new_expected_kernel_key) {
         if (kernel_type_for_var.backend() == phi::Backend::GPU ||
             kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
+            kernel_type_for_var.backend() == phi::Backend::XPU ||
             new_expected_kernel_key->backend() == phi::Backend::GPU ||
-            new_expected_kernel_key->backend() == phi::Backend::GPUDNN) {
+            new_expected_kernel_key->backend() == phi::Backend::GPUDNN ||
+            new_expected_kernel_key->backend() == phi::Backend::XPU) {
           new_scope = TryCreateTransferScope(
               kernel_type_for_var, *new_expected_kernel_key, &scope);
           enable_cache_transfer_scope_ = true;
         }
       } else if (kernel_type_for_var.backend() == phi::Backend::GPU ||
                  kernel_type_for_var.backend() == phi::Backend::GPUDNN ||
+                 kernel_type_for_var.backend() == phi::Backend::XPU ||
                  expected_kernel_key.backend() == phi::Backend::GPU ||
-                 expected_kernel_key.backend() == phi::Backend::GPUDNN) {
+                 expected_kernel_key.backend() == phi::Backend::GPUDNN ||
+                 expected_kernel_key.backend() == phi::Backend::XPU) {
         new_scope = TryCreateTransferScope(
             kernel_type_for_var, expected_kernel_key, &scope);
         enable_cache_transfer_scope_ = true;
@@ -2571,16 +2575,6 @@ Scope* OperatorWithKernel::PrepareData(
       if (!new_scope) {
         new_scope = &scope.NewScope();
       }
-      // For inference, if a gpu model has an op which could only run on CPU,
-      // each result of different input will be the same with the first one.
-      // The reason is that if a gpu tensor is the input of a cpu kernel,
-      // we will create a new cpu tensor in new scope.
-      // However, if enable_cache_runtime_context_, we get the cpu tensor each
-      // time, not the gpu tensor. Thus, we set pre_scope_ = nullptr
-      // to trigger `new RuntimeContext()` in RunImpl().
-      if (enable_cache_runtime_context_) {
-        pre_scope_ = nullptr;
-      }
 
       // Create new var with the same name in transfer scopes
       auto* trans_var = new_scope->Var(var_name);
@@ -2676,8 +2670,14 @@ Scope* OperatorWithKernel::PrepareData(
   // so disable prepare optimization conservatively.
   bool force_prepare_data = HasAttr("inference_force_prepare_data") &&
                             Attr<bool>("inference_force_prepare_data");
-  if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
-    need_prepare_data_ = false;
+  if (enable_cache_runtime_context_) {
+    if ((pre_scope_ == &scope && !force_prepare_data)) {
+      need_prepare_data_ = false;
+    }
+  } else {
+    if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) {
+      need_prepare_data_ = false;
+    }
   }
 
   return new_scope;
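
The PrepareData change above is the core of the optimization: when the runtime context is cached (the inference path), the transfer scope is cached as well, so creating a transfer scope no longer forces data re-preparation on every run. A minimal sketch of the resulting skip condition, factored into a standalone predicate purely for illustration (the helper name is invented, not part of the patch):

    // Sketch: when PrepareData may be skipped on subsequent runs.
    bool CanSkipPrepareData(bool cache_runtime_context, bool same_scope,
                            bool created_transfer_scope, bool force_prepare) {
      if (cache_runtime_context) {
        // Cached runtime context: the cached transfer scope is reused, so a
        // created transfer scope no longer forces re-preparation.
        return same_scope && !force_prepare;
      }
      return same_scope && !created_transfer_scope && !force_prepare;
    }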
42 changes: 42 additions & 0 deletions paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -212,6 +212,43 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToCustomDevice(
   }
 #endif
 
+#ifdef PADDLE_WITH_XPU
+void IrParamsSyncAmongDevicesPass::CopyParamsToXpu(Argument *argument) {
+  if (!argument->use_xpu()) return;
+
+  PADDLE_ENFORCE_EQ(argument->xpu_device_id_valid(),
+                    true,
+                    platform::errors::PreconditionNotMet(
+                        "The xpu_device_id field should be valid"));
+
+  LOG(INFO) << "Sync params from CPU to XPU: "
+            << "xpu_device_id - " << argument->xpu_device_id();
+
+  platform::Place place = platform::XPUPlace(argument->xpu_device_id());
+  auto *scope = argument->scope_ptr();
+  std::vector<std::string> all_vars = scope->LocalVarNames();
+
+  for (auto &var_name : all_vars) {
+    auto *var = scope->FindLocalVar(var_name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var,
+        platform::errors::PreconditionNotMet("The var should not be nullptr"));
+
+    if (var->IsType<phi::DenseTensor>()) {
+      auto *t = var->GetMutable<phi::DenseTensor>();
+
+      platform::CPUPlace cpu_place;
+      phi::DenseTensor temp_tensor;
+      temp_tensor.Resize(t->dims());
+
+      paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor);
+      t->clear();
+      paddle::framework::TensorCopySync(temp_tensor, place, t);
+    }
+  }
+}
+#endif
+
 void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   PADDLE_ENFORCE_EQ(
       argument->scope_valid(),
@@ -231,6 +268,11 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   if (argument->use_custom_device_valid()) {
     CopyParamsToCustomDevice(argument);
   }
 #endif
+#ifdef PADDLE_WITH_XPU
+  if (argument->use_xpu_valid()) {
+    CopyParamsToXpu(argument);
+  }
+#endif
   paddle::memory::Release(platform::CPUPlace());
 }
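
The per-tensor copy above stages through a CPU temporary because the source and destination are the same variable: the parameter is copied aside, cleared, and then copied back into itself at the XPU place. Condensed as a sketch, with an invented helper name for illustration:

    // Sketch of the staging pattern CopyParamsToXpu applies to each tensor.
    void MoveParamToXpu(phi::DenseTensor *t, const platform::Place &xpu_place) {
      phi::DenseTensor staging;
      staging.Resize(t->dims());
      // The parameter still lives on CPU, so this first copy is CPU-to-CPU.
      paddle::framework::TensorCopySync(*t, platform::CPUPlace(), &staging);
      t->clear();  // drop the old allocation so the next copy re-places it
      paddle::framework::TensorCopySync(staging, xpu_place, t);
    }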
4 changes: 4 additions & 0 deletions paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -46,6 +46,10 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   void CopyParamsToCustomDevice(Argument *argument);
 #endif
+
+#ifdef PADDLE_WITH_XPU
+  void CopyParamsToXpu(Argument *argument);
+#endif
 };
 
 }  // namespace analysis
12 changes: 12 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -1256,6 +1256,18 @@ void AnalysisPredictor::PrepareArgument() {
   }
 #endif
 
+#ifdef PADDLE_WITH_XPU
+  argument_->SetUseXpu(config_.use_xpu_);
+  argument_->SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_);
+  argument_->SetXpuLocked(config_.xpu_locked_);
+  argument_->SetXpuAutotune(config_.xpu_autotune_);
+  argument_->SetXpuAutotuneFile(config_.xpu_autotune_file_);
+  argument_->SetXpuPrecision(config_.xpu_precision_);
+  argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
+  argument_->SetXpuDeviceId(config_.xpu_device_id_);
+  argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
+#endif
+
   auto *pass_builder = config_.pass_builder();
   // TODO(inference): Need to reconstruct the pass_builder, pass should be
   // processed in a single
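
These setters forward the user-facing XPU options into the analysis Argument, which is what lets the new CopyParamsToXpu pass run. A minimal usage sketch, assuming the standard paddle_infer C++ API (the model path and workspace size are illustrative values only):

    #include "paddle_inference_api.h"

    int main() {
      paddle_infer::Config config;
      config.SetModel("ernie_model_dir");  // illustrative path
      // EnableXpu sets the xpu_* fields that PrepareArgument forwards above;
      // 16 MB of L3 workspace is an arbitrary example value.
      config.EnableXpu(/*l3_workspace_size=*/16 * 1024 * 1024);
      auto predictor = paddle_infer::CreatePredictor(config);
      return 0;
    }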
