
[ControlFlow] replace executor in run method of control flow ops with standalone_executor #45696

Status: Merged, Oct 31, 2022 (31 commits). The diff below shows changes from 27 of the 31 commits.

Commits
29afcee  replace executor in conditional_block_op.run with standalone_executor (kangguangli, Sep 2, 2022)
5bb4895  add block_id as the argument of standalone executor's method run; add… (kangguangli, Sep 6, 2022)
f7de3f6  fix scope bug about conditional block op (kangguangli, Sep 13, 2022)
6b1f942  resolve conflicts (kangguangli, Sep 13, 2022)
5d61b37  Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into… (kangguangli, Sep 13, 2022)
7f7c8c9  Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into… (kangguangli, Sep 13, 2022)
e03caad  fix bug: unnecessary return of fetch value (kangguangli, Sep 14, 2022)
9577393  resolve conflicts (kangguangli, Sep 14, 2022)
fd25693  fix typo (kangguangli, Sep 15, 2022)
ff6c12e  fix: quantization will set variable persistable, and these variables … (kangguangli, Sep 16, 2022)
9080a1f  add interpretercore cache for conditional block op but not activate i… (kangguangli, Sep 16, 2022)
c5d8db7  fix bug: local scope reuse for conditional block op (kangguangli, Sep 21, 2022)
20ce6de  reset scope when conditional block op runs (kangguangli, Sep 26, 2022)
65a9610  resolve conflicts (kangguangli, Sep 26, 2022)
1aab0c4  fix typo (kangguangli, Sep 26, 2022)
c02ffcf  fix typo and code style (kangguangli, Sep 26, 2022)
6fdff03  add build scope for conditional block op (kangguangli, Sep 28, 2022)
9fd3e24  add skip for transfer_layout kernel (kangguangli, Oct 10, 2022)
cb1f389  refind code (kangguangli, Oct 12, 2022)
ebe93f9  Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into… (kangguangli, Oct 12, 2022)
c7e8e3e  resovle confict (kangguangli, Oct 12, 2022)
8989127  resovle confict (kangguangli, Oct 17, 2022)
de1bcec  fix reset_scope (kangguangli, Oct 18, 2022)
edaab42  fix reset_scope (kangguangli, Oct 19, 2022)
dc66aa3  Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into… (kangguangli, Oct 19, 2022)
54f6e04  refine code (kangguangli, Oct 19, 2022)
248c539  refine code (kangguangli, Oct 25, 2022)
e393d72  refine code (kangguangli, Oct 26, 2022)
4c03964  refine code (kangguangli, Oct 26, 2022)
61a687a  remove the use of FLAGS_control_flow_use_new_executor_cache (kangguangli, Oct 28, 2022)
98110e4  change FLAGS_control_flow_use_new_executor to false (kangguangli, Oct 28, 2022)
Files changed
@@ -26,6 +26,7 @@ namespace interpreter {
 struct ExecutionConfig {
   bool used_for_jit{false};
   bool create_local_scope{true};
+  bool used_for_control_flow_op{false};
 
   size_t host_num_threads;
   size_t deivce_num_threads;
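For context, a minimal sketch (not part of this PR) of how a caller might populate the new field; only the field names come from the struct above, the surrounding setup is assumed:

#include "paddle/fluid/framework/new_executor/interpreter/execution_config.h"

// Hypothetical setup: tag an interpreter configuration as belonging to a
// control-flow sub-block before the op list is built.
paddle::framework::interpreter::ExecutionConfig cfg;
cfg.used_for_jit = false;
cfg.create_local_scope = true;
cfg.used_for_control_flow_op = true;  // field added by this PR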
@@ -227,7 +227,14 @@ void BuildVariableScope(const framework::BlockDesc& block,
     }
 
     if (var_desc->Persistable()) {
-      auto* ptr = inner_scope->Var(var_name);
+      // In principle, we should put all trainable parameters in global scope,
+      // which means the root of the scope tree. Some cases like quantization
+      // will look up these parameters in global scope.
+      const Scope* ancestor_scope = inner_scope;
+      while (ancestor_scope->parent()) {
+        ancestor_scope = ancestor_scope->parent();
+      }
+      auto* ptr = const_cast<Scope*>(ancestor_scope)->Var(var_name);
 
       VLOG(3) << "Initialize Variable " << var_name;
       // NOTE(zhiqiu): if var exists in scope and the type is right,
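The failure mode being fixed, as a minimal sketch using the Scope API (assuming Scope::NewScope/Var/FindVar behave as in paddle/fluid/framework/scope.h, where FindVar searches upward through parent scopes, never downward):

#include "paddle/fluid/framework/scope.h"

paddle::framework::Scope root;                    // global scope
paddle::framework::Scope& sub = root.NewScope();  // sub-block scope

sub.Var("quant_weight");        // persistable created only in the sub-scope
root.FindVar("quant_weight");   // nullptr: lookups never descend into children
// After this change, persistables are created at the root of the scope tree,
// so passes like quantization that resolve parameters in the global scope
// can still find them.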
@@ -301,16 +308,17 @@ std::tuple<VariableValueMap, VariableIdMap> BuildVariableMap(
     vars.reserve(item.second.size());
 
     for (auto& var_name : item.second) {
+      auto* var = local_scope->FindVar(var_name);
+
       if (!var_scope->HasVar(var_name)) {
-        if (allow_var_not_in_program && local_scope->FindVar(var_name)) {
+        if (allow_var_not_in_program && var) {
           VLOG(3) << "Add " << var_name << " to var_scope";
           var_scope->AddVar(var_name, nullptr);
         } else if (allow_var_not_in_scope) {
           VLOG(4) << var_name << " don't exist in variable scope, skip it!";
           continue;
         }
       }
-      auto* var = local_scope->FindVar(var_name);
       auto var_id = var_scope->VarId(var_name);
       vars.push_back(var);
       ids.push_back(var_id);
@@ -420,7 +428,8 @@ void BuildOpFuncList(const platform::Place& place,
                      std::vector<OpFuncNode>* vec_func_list,
                      VariableScope* var_scope,
                      bool use_local_scope,
-                     bool used_for_jit) {
+                     bool used_for_jit,
+                     bool used_for_control_flow_op) {
   Scope* local_scope = use_local_scope ? var_scope->GetMutableLocalScope()
                                        : var_scope->GetMutableScope();
   std::vector<std::unique_ptr<OperatorBase>>
@@ -479,6 +488,11 @@ void BuildOpFuncList(const platform::Place& place,
     bool allow_var_not_in_program = ops_with_var_not_in_program.count(op_type);
     bool allow_var_not_in_scope = ops_with_var_not_in_scope.count(op_type);
 
+    // ops in the control flow block may not find its inputs or outputs
+    // in VariableScope of the sub-block, so we need search it in parent scope.
+    allow_var_not_in_program =
+        used_for_control_flow_op || allow_var_not_in_program;
+
     framework::VariableNameMap& input_name_map = op->Inputs();
     VariableValueMap ins_map;
     VariableIdMap ins_name2id;
@@ -495,7 +509,7 @@ void BuildOpFuncList(const platform::Place& place,
         BuildVariableMap(output_name_map,
                          var_scope,
                          local_scope,
-                         /*allow_var_not_in_program=*/false,
+                         allow_var_not_in_program,
                          allow_var_not_in_scope);
 
     // step 1: build OpFuncNode
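A sketch of why this relaxation is safe for control-flow blocks (scope layout assumed; upward lookup is the existing Scope semantics): the sub-block's program may not declare a variable, yet the op can still resolve it at runtime through the parent scope chain.

#include "paddle/fluid/framework/scope.h"

paddle::framework::Scope parent;   // scope of the enclosing program
parent.Var("x");                   // input produced outside the sub-block

paddle::framework::Scope& block_scope = parent.NewScope();
// "x" is absent from the sub-block's BlockDesc, but FindVar falls back
// to ancestor scopes, so the lookup still succeeds:
auto* x = block_scope.FindVar("x");  // found in parent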
@@ -76,7 +76,8 @@ void BuildOpFuncList(const platform::Place& place,
                      std::vector<OpFuncNode>* vec_func_list,
                      VariableScope* scope,
                      bool use_local_scope = true,
-                     bool used_for_jit = false);
+                     bool used_for_jit = false,
+                     bool used_for_control_flow_op = false);
 
 void AddFetch(const std::vector<std::string>& fetch_names,
               framework::BlockDesc* block);
61 changes: 42 additions & 19 deletions paddle/fluid/framework/new_executor/interpretercore.cc
@@ -16,6 +16,8 @@
 
 #include <unordered_set>
 
+#include "gflags/gflags.h"
+
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
 #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
@@ -47,6 +49,12 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                             true,
                             "Use local_scope in new executor(especially used "
                             "in UT), can turn off for better performance");
+PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor,
+                            true,
+                            "Use new executor in control flow op");
+PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor_cache,
+                            true,
+                            "Cache new executor in control flow op");
 
 DECLARE_bool(check_nan_inf);
 DECLARE_bool(benchmark);
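Since these are ordinary exported gflags, other translation units can override them before the executor is built; a minimal sketch (the calling context is assumed):

#include "gflags/gflags.h"

DECLARE_bool(control_flow_use_new_executor);
DECLARE_bool(control_flow_use_new_executor_cache);

void DisableNewExecutorForControlFlow() {
  // Fall back to the legacy executor path inside control flow ops,
  // and skip InterpreterCore caching.
  FLAGS_control_flow_use_new_executor = false;
  FLAGS_control_flow_use_new_executor_cache = false;
}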
@@ -224,7 +232,7 @@ paddle::framework::FetchList InterpreterCore::Run(
 }
 
 paddle::framework::FetchList InterpreterCore::Run(
-    const std::vector<std::string>& feed_names) {
+    const std::vector<std::string>& feed_names, bool need_fetch) {
Review comment (Contributor): Since there is no fetch var in the Program that conditional_block_op runs, is the need_fetch parameter actually required?

Reply (kangguangli, author): Without it, ConditionalBlockOp would get fetch values returned even when it does not need them, and moving them out destructs the fetched variables unexpectedly. So we need this parameter to tell the executor whether it should return values.

Reply (kangguangli, author): Maybe we can decide by whether the block has the fetch op? @zhiqiu

   SetDeviceId(place_);
 
 #ifdef PADDLE_WITH_MKLDNN
@@ -244,11 +252,12 @@ paddle::framework::FetchList InterpreterCore::Run(
                            &op_func_nodes,
                            &var_scope_,
                            HasLocalScope(),
-                           execution_config_.used_for_jit);
-    is_build_ = true;
+                           execution_config_.used_for_jit,
+                           execution_config_.used_for_control_flow_op);
     SetFeedVarsInplaceSkip(feed_names);
     // convert vec func_list to graph
     Convert(&op_func_nodes);
+    is_build_ = true;
   } else {
     // For the program that only run once, it is no need to
     // create work_queue, so the async_work_queue_ is created
@@ -281,7 +290,7 @@ paddle::framework::FetchList InterpreterCore::Run(
   Scope* inner_scope =
       HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope();
   auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
-  if (fetch_var) {
+  if (fetch_var && need_fetch) {
     return std::move(*fetch_var->GetMutable<framework::FetchList>());
   } else {
     return {};
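How a control-flow op is expected to call this overload (a sketch; `core` is assumed to be the InterpreterCore built for the sub-block): passing need_fetch=false keeps the FetchList from being moved out of the fetch variable, which is exactly the unexpected destruction discussed in the review thread above.

// Run the sub-block without returning fetch results: the sub-block's
// variables stay intact for the enclosing program to consume.
core->Run(feed_names, /*need_fetch=*/false);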
@@ -311,9 +320,18 @@ void InterpreterCore::reset_scope(Scope* new_scope) {
   var_scope_.SetScope(new_scope);
   auto& var_list = var_scope_.MutableVarList();
   for (size_t i = 0; i < var_list.size(); i++) {
-    var_list[i] = new_scope->FindVar(var_scope_.GetNameById(i));
+    const auto& var_name = var_scope_.GetNameById(i);
+    var_list[i] = new_scope->FindVar(var_name);
   }
-  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
+
+  // The index should assured valid, cause the InterpreterCore may not be fully
+  // built, but was still cached and used. For example, see unit test
+  // `test_assert.py`, it may exit before `InterpreterCore::Convert`, but still
+  // was cached and used by later tests.
+  for (size_t i = 0; i < std::min(refs_.size(), var_list.size()); i++) {
+    refs_[i]->ResetVariable(var_list[i]);
+  }
+
+  for (size_t i = 0; i < vec_instruction_.size(); i++) {
     BuildAndCacheInstructionCtx(&vec_instruction_[i]);
   }
 }
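A sketch of the caching pattern this supports in conditional_block_op (cached_core, BuildCoreForSubBlock, and last_scope are hypothetical names): when a cached core is re-run under a different scope, the op first re-points the core's variables, reference counters, and instruction contexts at the new scope.

// Hypothetical per-op cache: one InterpreterCore per sub-block.
if (cached_core == nullptr) {
  cached_core = BuildCoreForSubBlock(place, sub_block, &cur_scope);  // assumed helper
} else if (&cur_scope != last_scope) {
  cached_core->reset_scope(&cur_scope);  // re-bind vars, refs_, instruction ctxs
}
cached_core->Run(feed_names, /*need_fetch=*/false);
last_scope = &cur_scope;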
@@ -540,6 +558,10 @@ void InterpreterCore::Convert(
       if (var_desc && ins.count(item.first) &&
           !info.IsInArgBufferNeeded(var_desc->Name())) {
         continue;
+      } else if (!block_.HasVar(var_scope_.GetNameById(id))) {
+        VLOG(10) << "[gc_check_inputs] skip gc: "
+                 << var_scope_.GetNameById(id);
+        continue;
       }
       gc_check_vars.insert(id);
     }
@@ -661,9 +683,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 
 #ifdef PADDLE_WITH_ASCEND_CL
   if (platform::is_npu_place(place)) {
-    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
-    // values, but only through special `float_status` to checks whether
-    // the operation is overflow. More about `float_status`, see:
+    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the
+    // variable values, but only through special `float_status` to checks
+    // whether the operation is overflow. More about `float_status`, see:
     // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
     if (FLAGS_check_nan_inf) {
       framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
@@ -734,7 +756,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
     }
   }
 
-  VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope_);
+  VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope);
 
   if (!instr_node.InplaceBackMap().empty()) {
     platform::RecordEvent inplaceback_event(
Expand Down Expand Up @@ -965,9 +987,9 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
if (platform::is_gpu_place(place)) {
memory::RecordStream(allocation, stream);
} else if (platform::is_cuda_pinned_place(place)) {
// TODO(Ruibiao): Here should do something to make sure that the tensor is
// not freed until the H2D copies done. However, simplely launch a CUDA
// runtime callback to the H2D stream may lead a high performance
// TODO(Ruibiao): Here should do something to make sure that the tensor
// is not freed until the H2D copies done. However, simplely launch a
// CUDA runtime callback to the H2D stream may lead a high performance
// overhead. As all the cases we meet in H2D are copies from CPUPlace at
// present, we just log a WARNING here. A better design is required.
LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous "
@@ -984,8 +1006,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
  * instr.GCCheckVars.
  * 2. The stream which initializes this tensor is different from the stream
  * which the instruction run in.
- * 3. The tensor is the instruction's input, cause we assume that instruction
- * will initialize all output tensors with its running stream.
+ * 3. The tensor is the instruction's input, cause we assume that
+ * instruction will initialize all output tensors with its running stream.
  * 4. In the OP function of this instruction, the tensor is an input of a
  * async CUDA kernel.
  *
@@ -995,8 +1017,8 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
  * initialized this tensor has less time overhead. Conversely, it may take
  * more time if we try to extract those cross-stream input vars from
  * instr.GCCheckVars.
- * 2. Now the instruction has no idea of which vars involving async running in
- * OP function, and thus we can not recognize condition 4. It should be
+ * 2. Now the instruction has no idea of which vars involving async running
+ * in OP function, and thus we can not recognize condition 4. It should be
  * supported later.
  */
 for (int var_id : instr.GCCheckVars()) {
@@ -1100,11 +1122,12 @@ void InterpreterCore::Prepare(const std::vector<std::string>& feed_names,
                            &op_func_nodes,
                            &var_scope_,
                            HasLocalScope(),
-                           execution_config_.used_for_jit);
-    is_build_ = true;
+                           execution_config_.used_for_jit,
+                           execution_config_.used_for_control_flow_op);
     SetFeedVarsInplaceSkip(feed_names);
     // convert vec func_list to graph
     Convert(&op_func_nodes);
+    is_build_ = true;
   }
   // NOTE: Because feed_tensor will be GC after
   // paddle::framework::BuildOpFuncList, so we should
16 changes: 15 additions & 1 deletion paddle/fluid/framework/new_executor/interpretercore.h
@@ -34,6 +34,10 @@
 #include "paddle/fluid/memory/allocation/spin_lock.h"
 #include "paddle/fluid/platform/device_event.h"
 
+DECLARE_bool(new_executor_use_local_scope);
+DECLARE_bool(control_flow_use_new_executor);
+DECLARE_bool(control_flow_use_new_executor_cache);
+
 namespace paddle {
 namespace framework {
 
@@ -55,7 +59,8 @@ class InterpreterCore {
       const std::vector<std::string>& feed_names,
       const std::vector<phi::DenseTensor>& feed_tensors);
 
-  paddle::framework::FetchList Run(const std::vector<std::string>& feed_names);
+  paddle::framework::FetchList Run(const std::vector<std::string>& feed_names,
+                                   bool need_fetch = true);
 
   void ShareWorkQueueFrom(std::shared_ptr<InterpreterCore> src);
 
@@ … @@ class InterpreterCore {
   void reset_scope(Scope* new_scope);
 
+  const platform::Place& GetPlace() const { return place_; }
+
+  void SetUsedForControlFlowOp(bool new_value) {
+    execution_config_.used_for_control_flow_op = new_value;
+  }
+  bool UsedForControlFlowOp() const {
+    return execution_config_.used_for_control_flow_op;
+  }
+
  private:
   // build graph
   void Convert(std::vector<paddle::framework::OpFuncNode>* op_func_nodes);
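The new accessors only touch methods added in this diff; a minimal sketch of tagging a core that serves a control-flow sub-block:

void MarkAsControlFlowCore(paddle::framework::InterpreterCore* core) {
  // Relaxes variable lookup during BuildOpFuncList (see interpreter_util.cc).
  core->SetUsedForControlFlowOp(true);
  VLOG(4) << "control-flow core on " << core->GetPlace()
          << ", flag=" << core->UsedForControlFlowOp();
}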
1 change: 1 addition & 0 deletions paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -418,6 +418,7 @@ class VarRefInfo {
       dynamic_ref_ = static_ref_;
     }
   }
+  void ResetVariable(Variable* new_var) { var_ = new_var; }
   bool CheckAndDecrease() {
     return static_ref_ == 1 || (dynamic_ref_.fetch_sub(1) == 1);
   }
2 changes: 1 addition & 1 deletion paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -28,8 +28,8 @@ paddle::framework::FetchList StandaloneExecutor::Run(
     const std::vector<std::string>& fetch_names) {
   platform::RecordEvent record_event(
       "StandaloneExecutor::run", platform::TracerEventType::UserDefined, 1);
-
   auto core = GetInterpreterCore(scope, prog_, feed_names, fetch_names, false);
+
   VLOG(4) << "StandaloneExecutor: " << this << ", InterpreterCore: " << core;
   return core->Run(feed_names);
 }
4 changes: 2 additions & 2 deletions paddle/fluid/operators/controlflow/CMakeLists.txt
@@ -8,7 +8,7 @@ register_operators(EXCLUDES conditional_block_op DEPS naive_executor)
 cc_library(
   conditional_block_op
   SRCS conditional_block_op.cc
-  DEPS executor)
+  DEPS standalone_executor executor)
 cc_library(
   op_variant
   SRCS op_variant.cc
@@ -29,7 +29,7 @@ cc_library(
 cc_test(
   conditional_block_op_test
   SRCS conditional_block_op_test.cc
-  DEPS conditional_block_op executor)
+  DEPS conditional_block_op standalone_executor executor)
 
 if(WITH_UNITY_BUILD)
   target_link_libraries(paddle_operators_controlflow_unity conditional_block_op)