diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index dd1c0d885efdd..e26f45a84673a 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -22,14 +22,17 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #include "paddle/phi/core/kernel_context.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif -PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, +PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, + true, "Use inplace in new executor"); -PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, +PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, + true, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); @@ -167,8 +170,8 @@ paddle::framework::FetchList InterpreterCore::Run( // scope? } global_scope_->SetLocalScope(local_scope_); - paddle::framework::interpreter::build_variable_scope(block_, global_scope_, - create_local_scope_); + paddle::framework::interpreter::build_variable_scope( + block_, global_scope_, create_local_scope_); std::vector op_func_nodes; paddle::framework::interpreter::build_op_func_list( place_, block_, &op_func_nodes, global_scope_, create_local_scope_); @@ -490,7 +493,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { // If it is OperatorBase, InferShape do nothing. if (op_with_kernel != nullptr) { platform::RecordEvent infershape_event( - "infer_shape", platform::TracerEventType::OperatorInner, 1, + "infer_shape", + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); // see OperatorWithKernel::RunImpl in operator.cc for why @@ -499,6 +504,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { op_with_kernel->Info().infer_shape_( instr_node.InnerInferShapeContext().get()); } + infershape_event.End(); + platform::RecordOpInfoSupplement(op->Type(), + op->Attrs(), + *(instr_node.InnerInferShapeContext()), + *(instr_node.InnerRuntimeContext())); } } @@ -516,7 +526,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { { platform::RecordEvent compute_event( - "compute", platform::TracerEventType::OperatorInner, 1, + "compute", + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); @@ -571,7 +583,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { if (op_with_kernel != nullptr && FLAGS_check_nan_inf) { VLOG(4) << "Check nan/inf"; framework::details::CheckOpHasNanOrInf( - *op, *global_scope_, + *op, + *global_scope_, place); // TODO(xiongkun03) change it to inner scope. 
} } @@ -596,10 +609,14 @@ void InterpreterCore::ExecuteInstructionList( for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { - async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [ - this, i, atomic_deps = atomic_deps.get(), - atomic_var_ref = atomic_var_ref.get() - ] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); }); + async_work_queue_->AddTask(vec_instr.at(i).KernelType(), + [this, + i, + atomic_deps = atomic_deps.get(), + atomic_var_ref = atomic_var_ref.get()] { + RunInstructionAsync( + i, atomic_deps, atomic_var_ref); + }); } } @@ -615,7 +632,8 @@ void InterpreterCore::ExecuteInstructionList( } VLOG(4) << "Cancel ok"; PADDLE_ENFORCE_EQ( - main_thread_blocker_.Clear(), 0, + main_thread_blocker_.Clear(), + 0, platform::errors::PreconditionNotMet( "main_thread_blocker_.Clear() return -1, clear failed")); VLOG(4) << "clear ok"; @@ -624,7 +642,8 @@ void InterpreterCore::ExecuteInstructionList( } void InterpreterCore::RunNextInstructions( - const Instruction& instr, std::queue* reserved_next_ops, + const Instruction& instr, + std::queue* reserved_next_ops, std::vector>* atomic_deps, std::vector>* atomic_var_ref) { auto& next_instr = instr.NextInstructions(); @@ -691,7 +710,8 @@ void InterpreterCore::RunNextInstructions( } void InterpreterCore::RunInstructionAsync( - size_t instr_id, std::vector>* atomic_deps, + size_t instr_id, + std::vector>* atomic_deps, std::vector>* atomic_var_ref) { std::queue ready_ops; ready_ops.push(instr_id); @@ -700,10 +720,10 @@ void InterpreterCore::RunInstructionAsync( ready_ops.pop(); auto& instr_node = vec_instruction_.at(instr_id); VLOG(5) << __func__ << " OP id:" << instr_node.Id() - << " name:" << instr_node.OpBase()->Type() - << " type:" << (instr_node.KernelType() == OpFuncType::kQueueSync - ? "kQueueSync" - : "kQueueAsync") + << " name:" << instr_node.OpBase()->Type() << " type:" + << (instr_node.KernelType() == OpFuncType::kQueueSync + ? 
"kQueueSync" + : "kQueueAsync") << " runs on " << platform::GetCurrentThreadName(); auto* op = instr_node.OpBase(); @@ -877,12 +897,14 @@ void InterpreterCore::CheckGC( } else { static_cast(gc_.get())->Add( - var_scope.Var(var_id), &gc_event_.at(instr_id), + var_scope.Var(var_id), + &gc_event_.at(instr_id), &instr.DeviceContext()); } #else static_cast(gc_.get())->Add( - var_scope.Var(var_id), &gc_event_.at(instr_id), + var_scope.Var(var_id), + &gc_event_.at(instr_id), &instr.DeviceContext()); #endif } @@ -891,20 +913,24 @@ void InterpreterCore::CheckGC( void InterpreterCore::Prepare( const std::vector& feed_names, - const std::vector& feed_tensors, bool prepare_feed) { - PADDLE_ENFORCE_EQ(feed_names.size(), feed_tensors.size(), + const std::vector& feed_tensors, + bool prepare_feed) { + PADDLE_ENFORCE_EQ(feed_names.size(), + feed_tensors.size(), platform::errors::PreconditionNotMet( "Required feed_names.size() == feed_tensors.size(), " "but received %d != %d", - feed_names.size(), feed_tensors.size())); + feed_names.size(), + feed_tensors.size())); auto FeedInput = [&] { VLOG(4) << "Feed inputs"; for (size_t i = 0; i < feed_names.size(); ++i) { auto* feed_var = global_scope_->FindVar(feed_names[i]); PADDLE_ENFORCE_NOT_NULL( - feed_var, platform::errors::NotFound( - "Variable %s should not be nullptr.", feed_names[i])); + feed_var, + platform::errors::NotFound("Variable %s should not be nullptr.", + feed_names[i])); auto feed_tensor = feed_var->GetMutable(); feed_tensor->ShareDataWith(feed_tensors[i]); @@ -913,8 +939,8 @@ void InterpreterCore::Prepare( }; if (!is_build_) { - paddle::framework::interpreter::build_variable_scope(block_, global_scope_, - create_local_scope_); + paddle::framework::interpreter::build_variable_scope( + block_, global_scope_, create_local_scope_); FeedInput(); std::vector op_func_nodes; paddle::framework::interpreter::build_op_func_list( diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f06ed0b496e9b..140525384c3e3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -33,6 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_context.h" @@ -59,7 +60,8 @@ class DenseTensor; DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); -PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, +PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, + 0, "number of threads for inner op"); DECLARE_bool(run_kp_kernel); DECLARE_bool(enable_host_event_recorder_hook); @@ -74,7 +76,8 @@ std::vector> kKernelPriority = { std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), }; -static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name, +static DDim GetDimsDebug(const ScopeBase& scope, + const std::string& name, bool get_actual_dim = false) { Variable* var = scope.FindVar(name); if (var == nullptr) { @@ -268,7 +271,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { Type(), platform::TracerEventType::Operator, 1); auto op_name = platform::OpName(outputs_, Type()); platform::RecordEvent op_name_record_event( - op_name, platform::TracerEventType::Operator, + op_name, + platform::TracerEventType::Operator, FLAGS_enable_host_event_recorder_hook ? 20 : 1, platform::EventRole::kUniqueOp); RunImpl(scope, place); @@ -297,9 +301,11 @@ bool OperatorBase::HasInputs(const std::string& name) const { std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE( - ins.size(), 1UL, + ins.size(), + 1UL, platform::errors::InvalidArgument( - "Operator %s's input %s should contain only one variable.", type_, + "Operator %s's input %s should contain only one variable.", + type_, name)); return ins.empty() ? kEmptyVarName : ins[0]; } @@ -308,9 +314,10 @@ const std::vector& OperatorBase::Inputs( const std::string& name) const { auto it = inputs_.find(name); PADDLE_ENFORCE_NE( - it, inputs_.end(), - platform::errors::NotFound("Operator %s does not have the input %s.", - type_, name)); + it, + inputs_.end(), + platform::errors::NotFound( + "Operator %s does not have the input %s.", type_, name)); return it->second; } @@ -325,9 +332,11 @@ bool OperatorBase::HasOutputs(const std::string& name) const { std::string OperatorBase::Output(const std::string& name) const { auto& outs = Outputs(name); PADDLE_ENFORCE_LE( - outs.size(), 1UL, + outs.size(), + 1UL, platform::errors::InvalidArgument( - "Operator %s's output %s should contain only one variable.", type_, + "Operator %s's output %s should contain only one variable.", + type_, name)); return outs.empty() ? 
kEmptyVarName : outs[0]; } @@ -336,7 +345,8 @@ const std::vector& OperatorBase::Outputs( const std::string& name) const { auto it = outputs_.find(name); PADDLE_ENFORCE_NE( - it, outputs_.end(), + it, + outputs_.end(), platform::errors::NotFound( "Operator %s does not have an output called %s.", type_, name)); return it->second; @@ -484,18 +494,20 @@ void OperatorBase::CheckAllInputOutputSet() const { for (auto& in : info_->Proto().inputs()) { if (!in.dispensable() && !in.extra()) { PADDLE_ENFORCE_NE( - inputs_.find(in.name()), inputs_.end(), - platform::errors::NotFound("Operator %s's input (%s) is not set.", - Type(), in.name())); + inputs_.find(in.name()), + inputs_.end(), + platform::errors::NotFound( + "Operator %s's input (%s) is not set.", Type(), in.name())); } } for (auto& out : info_->Proto().outputs()) { if (!out.dispensable() && !out.extra()) { PADDLE_ENFORCE_NE( - outputs_.find(out.name()), outputs_.end(), - platform::errors::NotFound("Operator %s's output (%s) is not set.", - Type(), out.name())); + outputs_.find(out.name()), + outputs_.end(), + platform::errors::NotFound( + "Operator %s's output (%s) is not set.", Type(), out.name())); } } } @@ -568,10 +580,12 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { if (it == ctx_.inputs.end()) return nullptr; PADDLE_ENFORCE_LE( - it->second.size(), 1UL, + it->second.size(), + 1UL, platform::errors::InvalidArgument( "Operator %s's input %s should contain only one variable.", - op_.Type(), name)); + op_.Type(), + name)); return it->second.empty() ? nullptr : it->second[0]; } @@ -580,10 +594,12 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { if (it == ctx_.outputs.end()) return nullptr; PADDLE_ENFORCE_LE( - it->second.size(), 1UL, + it->second.size(), + 1UL, platform::errors::InvalidArgument( "Operator %s's output %s should contain only one variable.", - op_.Type(), name)); + op_.Type(), + name)); return it->second.empty() ? nullptr : it->second[0]; } @@ -598,10 +614,13 @@ const std::vector ExecutionContext::MultiInput( } std::vector res; res.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(res), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(res), [&](const Variable* var) -> const Tensor* { if (var == nullptr) return nullptr; - PADDLE_ENFORCE_EQ(var->IsType(), true, + PADDLE_ENFORCE_EQ(var->IsType(), + true, platform::errors::InvalidArgument( "Input variable should be LoDTensor, " "but the received type is %s.", @@ -621,7 +640,9 @@ std::vector ExecutionContext::MultiOutput( } std::vector res; res.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(res), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(res), [&](Variable* var) -> Tensor* { return var == nullptr ? 
nullptr : var->GetMutable(); @@ -679,7 +700,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const auto& in = it->second; if (in.size() == 0) return false; PADDLE_ENFORCE_EQ( - in.size(), 1UL, + in.size(), + 1UL, platform::errors::InvalidArgument( "Input %s should not contain more than one inputs.", name)); return in[0] != nullptr; @@ -697,7 +719,8 @@ class RuntimeInferShapeContext : public InferShapeContext { return false; } PADDLE_ENFORCE_EQ( - out.size(), 1UL, + out.size(), + 1UL, platform::errors::InvalidArgument( "Output %s should not contain more than one outputs.", name)); return out[0] != nullptr; @@ -754,11 +777,14 @@ class RuntimeInferShapeContext : public InferShapeContext { std::string GetInputNameByIdx(size_t idx) const override { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; - PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(), + PADDLE_ENFORCE_LT(idx, + op_proto->inputs().size(), platform::errors::OutOfRange( "The index should be less than the size of inputs of " "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->inputs().size())); + op_.Type(), + idx, + op_proto->inputs().size())); return op_proto->inputs()[idx].name(); } @@ -766,42 +792,55 @@ class RuntimeInferShapeContext : public InferShapeContext { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; PADDLE_ENFORCE_LT( - idx, op_proto->outputs().size(), + idx, + op_proto->outputs().size(), platform::errors::OutOfRange( "The index should be less than the size of outputs of " "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->outputs().size())); + op_.Type(), + idx, + op_proto->outputs().size())); return op_proto->outputs()[idx].name(); } - void ShareDim(const std::string& in, const std::string& out, size_t i = 0, + void ShareDim(const std::string& in, + const std::string& out, + size_t i = 0, size_t j = 0) override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), + in_it, + ctx_.inputs.end(), platform::errors::NotFound("Input %s does not exist.", in)); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), + out_it, + ctx_.outputs.end(), platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), + PADDLE_ENFORCE_LT(i, + in_it->second.size(), platform::errors::InvalidArgument( "The index of input dimension is out of range, " "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), + in_it->second.size(), + i)); + PADDLE_ENFORCE_LT(j, + out_it->second.size(), platform::errors::InvalidArgument( "The index of output dimension is out of range, " "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); + out_it->second.size(), + j)); Variable* in_var = in_it->second[i]; Variable* out_var = out_it->second[j]; PADDLE_ENFORCE_EQ( - in_var->Type(), out_var->Type(), + in_var->Type(), + out_var->Type(), platform::errors::InvalidArgument( - "The type of input (%s) and output (%s) are inconsistent.", in, + "The type of input (%s) and output (%s) are inconsistent.", + in, out)); if (in_var->IsType()) { @@ -825,19 +864,22 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::string& out) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(), + PADDLE_ENFORCE_NE(in_it, + ctx_.inputs.end(), 
platform::errors::NotFound( "Input [%s] found error in Op [%s]", in, op_.Type())); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), - platform::errors::NotFound("Output [%s] found error in Op [%s]", out, - op_.Type())); + out_it, + ctx_.outputs.end(), + platform::errors::NotFound( + "Output [%s] found error in Op [%s]", out, op_.Type())); auto& in_var_list = in_it->second; auto& out_var_list = out_it->second; PADDLE_ENFORCE_EQ( - in_var_list.size(), out_var_list.size(), + in_var_list.size(), + out_var_list.size(), platform::errors::PreconditionNotMet( "Op [%s]: Input var size should be equal with output var size", op_.Type())); @@ -852,10 +894,12 @@ class RuntimeInferShapeContext : public InferShapeContext { Variable* in_var = in_var_list[i]; if (!in_var->IsType()) return; Variable* out_var = out_var_list[i]; - PADDLE_ENFORCE_EQ(out_var->IsType(), true, + PADDLE_ENFORCE_EQ(out_var->IsType(), + true, platform::errors::PreconditionNotMet( "The %d-th output of Output(%s) must be LoDTensor.", - i, out_var_names[i])); + i, + out_var_names[i])); auto& in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); @@ -866,32 +910,41 @@ class RuntimeInferShapeContext : public InferShapeContext { } } - void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + void ShareLoD(const std::string& in, + const std::string& out, + size_t i = 0, size_t j = 0) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), + in_it, + ctx_.inputs.end(), platform::errors::NotFound("Input %s does not exist.", in)); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), + out_it, + ctx_.outputs.end(), platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), + PADDLE_ENFORCE_LT(i, + in_it->second.size(), platform::errors::InvalidArgument( "The index of input dimension is out of range, " "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), + in_it->second.size(), + i)); + PADDLE_ENFORCE_LT(j, + out_it->second.size(), platform::errors::InvalidArgument( "The index of output dimension is out of range, " "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); + out_it->second.size(), + j)); Variable* in_var = in_it->second.at(i); if (!in_var->IsType()) return; Variable* out_var = out_it->second.at(j); PADDLE_ENFORCE_EQ( - out_var->IsType(), true, + out_var->IsType(), + true, platform::errors::InvalidArgument( "The %zu-th output of Output(%s) must be LoDTensor.", j, out)); auto& in_tensor = in_var->Get(); @@ -926,7 +979,8 @@ class RuntimeInferShapeContext : public InferShapeContext { "set in the runtime kernel.")); } - void SetLoDLevel(const std::string& out, int32_t lod_level, + void SetLoDLevel(const std::string& out, + int32_t lod_level, size_t j = 0) const override { PADDLE_THROW(platform::errors::PreconditionNotMet( "SetLoDLevel is only used in compile time. 
The calculation of " @@ -969,10 +1023,12 @@ class RuntimeInferShapeContext : public InferShapeContext { DDim GetInputDim(const std::string& name) const override { const std::vector& vars = InputVars(name); PADDLE_ENFORCE_EQ( - vars.size(), 1UL, + vars.size(), + 1UL, platform::errors::InvalidArgument( "Input(%s) should hold one element, but now it holds %zu elements.", - name, vars.size())); + name, + vars.size())); return this->GetDim(vars[0]); } @@ -998,10 +1054,12 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetOutputDim(const std::string& name, const DDim& dim) override { auto& vars = OutputVars(name); PADDLE_ENFORCE_EQ( - vars.size(), 1UL, + vars.size(), + 1UL, platform::errors::InvalidArgument("Output(%s) should hold one element, " "but now it holds %zu elements.", - name, vars.size())); + name, + vars.size())); SetDim(vars[0], dim); } @@ -1038,7 +1096,9 @@ class RuntimeInferShapeContext : public InferShapeContext { std::vector GetDims(const std::vector& vars) const { std::vector ret; ret.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(ret), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(ret), [this](Variable* var) { return this->GetDim(var); }); return ret; } @@ -1064,12 +1124,14 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetDims(const std::vector& vars, const std::vector& dims) { size_t length = vars.size(); - PADDLE_ENFORCE_EQ(length, dims.size(), + PADDLE_ENFORCE_EQ(length, + dims.size(), platform::errors::InvalidArgument( "The number of input variables do not match the " "number of input dimensions, the number of variables " "is %zu, the number of dimensions is %zu.", - length, dims.size())); + length, + dims.size())); for (size_t i = 0; i < length; ++i) { if (vars[i] == nullptr) { continue; @@ -1088,9 +1150,12 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& vars) const { std::vector retv; retv.resize(vars.size()); - std::transform(vars.begin(), vars.end(), retv.begin(), + std::transform(vars.begin(), + vars.end(), + retv.begin(), std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType), - this, std::placeholders::_1)); + this, + std::placeholders::_1)); return retv; } @@ -1102,7 +1167,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& InputVars(const std::string& name) const { auto it = ctx_.inputs.find(name); PADDLE_ENFORCE_NE( - it, ctx_.inputs.end(), + it, + ctx_.inputs.end(), platform::errors::NotFound( "Operator (%s) does not have the input (%s).", op_.Type(), name)); return it->second; @@ -1111,7 +1177,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& OutputVars(const std::string& name) const { auto it = ctx_.outputs.find(name); PADDLE_ENFORCE_NE( - it, ctx_.outputs.end(), + it, + ctx_.outputs.end(), platform::errors::NotFound( "Operator (%s) does not have the outputs (%s).", op_.Type(), name)); return it->second; @@ -1132,20 +1199,23 @@ static void CheckTensorNANOrInf(const std::string& op_type, return; } PADDLE_ENFORCE_NE( - framework::TensorContainsInf(tensor), true, - platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", - op_type, name)); + framework::TensorContainsInf(tensor), + true, + platform::errors::Fatal( + "Operator %s output Tensor %s contains Inf.", op_type, name)); PADDLE_ENFORCE_NE( - framework::TensorContainsNAN(tensor), true, - platform::errors::Fatal("Operator %s output Tensor %s contains NAN.", - op_type, name)); + 
framework::TensorContainsNAN(tensor), + true, + platform::errors::Fatal( + "Operator %s output Tensor %s contains NAN.", op_type, name)); } bool OperatorWithKernel::SupportGPU() const { auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( phi::TransToPhiKernelName(type_)); auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), + std::any_of(phi_kernels.begin(), + phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::GPU; }); @@ -1158,7 +1228,8 @@ bool OperatorWithKernel::SupportGPU() const { } else { auto& op_kernels = kernel_iter->second; return std::any_of( - op_kernels.begin(), op_kernels.end(), + op_kernels.begin(), + op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { return platform::is_gpu_place(kern_pair.first.place_); }); @@ -1170,7 +1241,8 @@ bool OperatorWithKernel::SupportNPU() const { auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( phi::TransToPhiKernelName(type_)); auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), + std::any_of(phi_kernels.begin(), + phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::NPU; }); @@ -1183,7 +1255,8 @@ bool OperatorWithKernel::SupportNPU() const { } else { auto& op_kernels = kernel_iter->second; return std::any_of( - op_kernels.begin(), op_kernels.end(), + op_kernels.begin(), + op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { return platform::is_npu_place(kern_pair.first.place_); }); @@ -1195,14 +1268,16 @@ bool OperatorWithKernel::SupportsMKLDNN( const proto::VarType::Type data_type) const { auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) { - VLOG(6) << "Warning: " << type_ << " don't find its MKLDNN Kernel in Fluid " - "Registered Kernels. And We don't " - "search its kernels in phi lib, " - "SupportsMKLDNN() return false."; + VLOG(6) << "Warning: " << type_ + << " don't find its MKLDNN Kernel in Fluid " + "Registered Kernels. 
And We don't " + "search its kernels in phi lib, " + "SupportsMKLDNN() return false."; return false; } auto& op_kernels = op_kernel_iter->second; - return std::any_of(op_kernels.begin(), op_kernels.end(), + return std::any_of(op_kernels.begin(), + op_kernels.end(), [data_type](OpKernelMap::const_reference kern_pair) { return platform::is_cpu_place(kern_pair.first.place_) && kern_pair.first.library_type_ == @@ -1366,7 +1441,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport #endif - ) { + ) { run_phi_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); @@ -1399,7 +1474,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU_KP) || (is_xpu_unsupport && !is_xpu_kp_support) #endif - ) { + ) { auto pt_cpu_kernel_key = FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this); pt_kernel_.reset( @@ -1429,10 +1504,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, { platform::RecordEvent record_event("prepare_data", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); if (need_prepare_data_) { - transfer_scope = PrepareData(scope, *kernel_type_, - &transfered_inplace_vars, runtime_ctx); + transfer_scope = PrepareData( + scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx); } } // exec scope is the scope that kernel actually executed on. @@ -1442,9 +1518,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (!all_kernels_must_compute_runtime_shape_) { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); this->Info().infer_shape_(&infer_shape_ctx); + record_event.End(); + platform::RecordOpInfoSupplement( + Type(), Attrs(), infer_shape_ctx, *runtime_ctx); } if (FLAGS_enable_unused_var_check) { @@ -1456,7 +1536,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; // Do data transform before building KernelContext @@ -1584,7 +1665,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), + kernels_iter, + all_op_kernels.end(), platform::errors::Unavailable( "There are no kernels which are registered in the %s operator.", type_)); @@ -1706,10 +1788,12 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", type_, - KernelTypeToString(expected_kernel_key))); + PADDLE_ENFORCE_NE( + kernel_iter, + kernels.end(), + platform::errors::NotFound("Operator (%s) does not have kernel for %s.", + type_, + KernelTypeToString(expected_kernel_key))); std::lock_guard lock(cache_update_mutex_); if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { @@ -1719,7 +1803,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { } void OperatorWithKernel::TransferInplaceVarsBack( - const Scope& scope, const 
std::vector& inplace_vars, + const Scope& scope, + const std::vector& inplace_vars, const Scope& transfer_scope) const { for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; @@ -1730,8 +1815,9 @@ void OperatorWithKernel::TransferInplaceVarsBack( auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var); auto* var = transfer_scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument( - "The variable[%s] is nullptr.", var_name)); + PADDLE_ENFORCE_NOT_NULL(var, + platform::errors::InvalidArgument( + "The variable[%s] is nullptr.", var_name)); auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto original_dims = original_tensor->dims(); original_tensor->ShareDataWith(*transformed_tensor); @@ -1811,7 +1897,8 @@ void OperatorWithKernel::HandleComplexGradToRealGrad( } Scope* OperatorWithKernel::PrepareData( - const Scope& scope, const OpKernelType& expected_kernel_key, + const Scope& scope, + const OpKernelType& expected_kernel_key, std::vector* transfered_inplace_vars, RuntimeContext* ctx) const { Scope* new_scope = nullptr; @@ -1867,8 +1954,8 @@ Scope* OperatorWithKernel::PrepareData( input_vars[i] = trans_var; auto out = trans_var->GetMutable(); out->Resize(tensor_in->dims()); - platform::MatchShapeToLayout(out, tensor_in->layout(), - DataLayout::kNHWC); + platform::MatchShapeToLayout( + out, tensor_in->layout(), DataLayout::kNHWC); VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , " "but kNHWC layout" << var_name_item.first << " in Operator " << type_; @@ -1915,8 +2002,8 @@ Scope* OperatorWithKernel::PrepareData( if (!run_by_executor_ && (platform::is_gpu_place(kernel_type_for_var.place_) || platform::is_gpu_place(expected_kernel_key.place_))) { - new_scope = TryCreateTransferScope(kernel_type_for_var, - expected_kernel_key, &scope); + new_scope = TryCreateTransferScope( + kernel_type_for_var, expected_kernel_key, &scope); enable_cache_transfer_scope_ = true; } if (!new_scope) { @@ -1978,7 +2065,8 @@ Scope* OperatorWithKernel::PrepareData( } void OperatorWithKernel::ParseInputDataType( - const Variable* var, const std::string& name, + const Variable* var, + const std::string& name, proto::VarType::Type* data_type) const { if (var != nullptr) { const Tensor* t = nullptr; @@ -1998,17 +2086,20 @@ void OperatorWithKernel::ParseInputDataType( } if (t != nullptr) { PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " "contains uninitialized Tensor.", - Type(), name)); + Type(), + name)); *data_type = paddle::framework::TransToProtoVarType(t->dtype()); } } } void OperatorWithKernel::ParseMultiInputDataType( - const std::vector& vars, const std::string& name, + const std::vector& vars, + const std::string& name, proto::VarType::Type* data_type) const { proto::VarType::Type default_data_type = static_cast(-1); @@ -2032,10 +2123,12 @@ void OperatorWithKernel::ParseMultiInputDataType( } if (t != nullptr) { PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " "contains uninitialized Tensor.", - Type(), name)); + Type(), + name)); proto::VarType::Type tmp = paddle::framework::TransToProtoVarType(t->dtype()); PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type, @@ -2045,7 +2138,9 @@ void OperatorWithKernel::ParseMultiInputDataType( "consistent or 
reigster GetExpectedKernelType. The " "current variable type is (%s), but the " "previous variable type is (%s).", - Type(), name, DataTypeToString(tmp), + Type(), + name, + DataTypeToString(tmp), DataTypeToString(*data_type))); *data_type = tmp; } @@ -2066,7 +2161,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } } PADDLE_ENFORCE_NE( - data_type, dafault_data_type, + data_type, + dafault_data_type, platform::errors::NotFound( "DataType should be indicated by input Variable at %s.", Type())); return data_type; @@ -2083,12 +2179,14 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType( ParseMultiInputDataType(ctx.MultiInputVar(name), name, &data_type); } PADDLE_ENFORCE_NE( - data_type, dafault_data_type, + data_type, + dafault_data_type, platform::errors::InvalidArgument( "The Input Variable(%s) of (%s) Operator used to determine kernel " "data type is empty or not LoDTensor or SelectedRows or " "LoDTensorArray.", - name, Type())); + name, + Type())); return data_type; } @@ -2120,11 +2218,14 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( t, platform::errors::InvalidArgument( "The Tensor of variable %s is nullptr when promote complex types.")); - PADDLE_ENFORCE_EQ(t->IsInitialized(), true, + PADDLE_ENFORCE_EQ(t->IsInitialized(), + true, platform::errors::InvalidArgument( "The Tensor in the %s Op's Input Variable %s(%s) is " "not initialized.", - Type(), name, ctx.InputName(name))); + Type(), + name, + ctx.InputName(name))); return t; } @@ -2136,7 +2237,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( * the kernel data type. */ proto::VarType::Type OperatorWithKernel::IndicateOrPromoteVarDataTypes( - const ExecutionContext& ctx, const std::string& name1, + const ExecutionContext& ctx, + const std::string& name1, const std::string& name2) const { // 1. 
Get tensor auto* tensor_a = GetTensorFormInputSafely(ctx, name1); @@ -2158,10 +2260,11 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( } OpKernelType OperatorWithKernel::GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, + const Tensor& tensor, const OpKernelType& expected_kernel_type) const { - return OpKernelType(expected_kernel_type.data_type_, tensor.place(), - tensor.layout()); + return OpKernelType( + expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( @@ -2172,8 +2275,9 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( if (arg_map_fn) { arg_map_fn_.reset(new phi::ArgumentMappingFn(*arg_map_fn)); } else { - auto func = [this]( - const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature { + auto func = + [this]( + const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature { return phi::DefaultKernelSignatureMap::Instance().Get(type_); }; arg_map_fn_.reset(new phi::ArgumentMappingFn(func)); @@ -2183,16 +2287,19 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( } Scope* OperatorWithKernel::PreparePhiData( - const Scope& scope, const phi::Kernel& pt_kernel, + const Scope& scope, + const phi::Kernel& pt_kernel, const phi::KernelSignature& pt_kernel_signature, RuntimeContext* ctx) const { const auto& input_names = pt_kernel_signature.input_names; auto input_defs = pt_kernel.args_def().input_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), platform::errors::InvalidArgument( "The size of inputs_args names (%d) must be equal to " "the size of kernel input_defs (%d).", - input_names.size(), input_defs.size())); + input_names.size(), + input_defs.size())); Scope* new_scope = nullptr; auto& name_map = Inputs(); const std::unordered_set* no_buffer_ins = nullptr; @@ -2279,7 +2386,8 @@ Scope* OperatorWithKernel::PreparePhiData( } void OperatorWithKernel::BuildPhiKernelContext( - const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, + const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx, phi::KernelContext* pt_kernel_context) const { pt_kernel_context->SetDeviceContext(dev_ctx); @@ -2291,23 +2399,29 @@ void OperatorWithKernel::BuildPhiKernelContext( auto attr_defs = pt_kernel_->args_def().attribute_defs(); auto output_defs = pt_kernel_->args_def().output_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), platform::errors::InvalidArgument( "The size of inputs_args names (%d) must be equal to " "the size of kernel input_defs (%d).", - input_names.size(), input_defs.size())); + input_names.size(), + input_defs.size())); - PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(), + PADDLE_ENFORCE_EQ(output_names.size(), + output_defs.size(), platform::errors::InvalidArgument( "The size of outputs_args names (%d) must be equal to " "the size of kernel output_defs (%d).", - output_names.size(), output_defs.size())); + output_names.size(), + output_defs.size())); - PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(), + PADDLE_ENFORCE_EQ(attr_names.size(), + attr_defs.size(), platform::errors::InvalidArgument( "The size of attribute_args names (%d) must be equal " "to the size of kernel attribute_defs (%d).", - attr_names.size(), attr_defs.size())); + attr_names.size(), + attr_defs.size())); for (size_t i = 0; i < input_names.size(); ++i) 
{ auto it = ctx.inputs.find(input_names[i]); @@ -2489,7 +2603,8 @@ void OperatorWithKernel::BuildPhiKernelContext( break; case phi::AttributeType::SCALARS: { PADDLE_ENFORCE_NE( - attr_iter, Attrs().end(), + attr_iter, + Attrs().end(), platform::errors::NotFound("(%s) is not found in AttributeMap when " "buildind static KernelContext.", attr_names[i])); @@ -2553,7 +2668,8 @@ void OperatorWithKernel::BuildPhiKernelContext( } break; default: { PADDLE_ENFORCE_NE( - attr_iter, Attrs().end(), + attr_iter, + Attrs().end(), platform::errors::NotFound("(%s) is not found in AttributeMap when " "buildind static KernelContext.", attr_names[i])); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt old mode 100644 new mode 100755 index 5af13f76b36bd..ef9cde883fb01 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,137 +1,264 @@ -cc_library(allocator SRCS allocator.cc DEPS place stats) -cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) -cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) -cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) -cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) -cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler) -cc_test(naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc DEPS naive_best_fit_allocator) -cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) - -if (WITH_MKLDNN) +cc_library( + allocator + SRCS allocator.cc + DEPS place stats profiler) +cc_library( + cpu_allocator + SRCS cpu_allocator.cc + DEPS allocator) +cc_library( + locked_allocator + SRCS locked_allocator.cc + DEPS allocator) +cc_library( + buffered_allocator + SRCS buffered_allocator.cc + DEPS allocator) +cc_library( + best_fit_allocator + SRCS best_fit_allocator.cc + DEPS allocator) +cc_library( + naive_best_fit_allocator + SRCS naive_best_fit_allocator.cc + DEPS allocator buddy_allocator) +cc_test( + naive_best_fit_allocator_test + SRCS naive_best_fit_allocator_test.cc + DEPS naive_best_fit_allocator) +cc_test( + buffered_allocator_test + SRCS buffered_allocator_test.cc + DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) + +if(WITH_MKLDNN) set(MKLDNN_CTX_DEPS mkldnn) -else () +else() set(MKLDNN_CTX_DEPS) endif() -if (WITH_GPU) - nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats) - nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) - nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph) - nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) +if(WITH_GPU) + nv_library( + cuda_allocator + SRCS cuda_allocator.cc + DEPS allocator cuda_device_guard stats) + nv_library( + cuda_managed_allocator + SRCS cuda_managed_allocator.cc + DEPS allocator cuda_device_guard gpu_info) + nv_library( + pinned_allocator + SRCS pinned_allocator.cc + DEPS allocator) + nv_library( + stream_safe_cuda_allocator + SRCS stream_safe_cuda_allocator.cc + DEPS allocator cuda_graph) + nv_library( + thread_local_allocator + SRCS thread_local_allocator.cc + DEPS allocator) - cc_test(thread_local_allocator_test SRCS 
thread_local_allocator_test.cc DEPS thread_local_allocator) + cc_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS thread_local_allocator) if(CUDA_VERSION GREATER_EQUAL 10.2) - nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda) + nv_library( + cuda_virtual_mem_allocator + SRCS cuda_virtual_mem_allocator.cc + DEPS dynload_cuda) endif() endif() -if (WITH_ROCM) - hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats) - hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) - hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) - hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) - - cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) +if(WITH_ROCM) + hip_library( + cuda_allocator + SRCS cuda_allocator.cc + DEPS allocator cuda_device_guard stats) + hip_library( + cuda_managed_allocator + SRCS cuda_managed_allocator.cc + DEPS allocator cuda_device_guard gpu_info) + hip_library( + pinned_allocator + SRCS pinned_allocator.cc + DEPS allocator) + hip_library( + stream_safe_cuda_allocator + SRCS stream_safe_cuda_allocator.cc + DEPS allocator) + hip_library( + thread_local_allocator + SRCS thread_local_allocator.cc + DEPS allocator) + + cc_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS thread_local_allocator) endif() -if (WITH_ASCEND_CL) - cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) - cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info) +if(WITH_ASCEND_CL) + cc_library( + npu_allocator + SRCS npu_allocator.cc + DEPS allocator npu_info) + cc_library( + npu_pinned_allocator + SRCS npu_pinned_allocator.cc + DEPS allocator npu_info) endif() -cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) +cc_library( + retry_allocator + SRCS retry_allocator.cc + DEPS allocator) -if (WITH_GPU OR WITH_ROCM) - set(AllocatorFacadeDeps gpu_info cuda_allocator cuda_managed_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context) - if(CUDA_VERSION GREATER_EQUAL 10.2) - list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator) - endif() +if(WITH_GPU OR WITH_ROCM) + set(AllocatorFacadeDeps + gpu_info + cuda_allocator + cuda_managed_allocator + pinned_allocator + cuda_device_guard + thread_local_allocator + stream_safe_cuda_allocator + device_context) + if(CUDA_VERSION GREATER_EQUAL 10.2) + list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator) + endif() elseif(WITH_XPU) - set(AllocatorFacadeDeps xpu_info) + set(AllocatorFacadeDeps xpu_info) elseif(WITH_IPU) - set(AllocatorFacadeDeps ipu_info) + set(AllocatorFacadeDeps ipu_info) elseif(WITH_ASCEND) - set(AllocatorFacadeDeps ascend_npu_info) -else () - set(AllocatorFacadeDeps) + set(AllocatorFacadeDeps ascend_npu_info) +else() + set(AllocatorFacadeDeps) endif() -if (WITH_CUSTOM_DEVICE) - cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager) +if(WITH_CUSTOM_DEVICE) + cc_library( + custom_allocator + SRCS custom_allocator.cc + DEPS allocator device_manager) set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator) endif() -if (WITH_GPU) - nv_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - best_fit_allocator_test.cu 
- DEPS best_fit_allocator - locked_allocator - cpu_allocator - cuda_allocator - device_context - memcpy) -elseif (WITH_ROCM) - hip_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - best_fit_allocator_test.cu - DEPS best_fit_allocator - locked_allocator - cpu_allocator - cuda_allocator - device_context - memcpy) +if(WITH_GPU) + nv_test( + best_fit_allocator_test + SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu + DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator + device_context memcpy) +elseif(WITH_ROCM) + hip_test( + best_fit_allocator_test + SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu + DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator + device_context memcpy) else() - cc_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - DEPS best_fit_allocator - locked_allocator - cpu_allocator) + cc_test( + best_fit_allocator_test + SRCS best_fit_allocator_test.cc + DEPS best_fit_allocator locked_allocator cpu_allocator) endif() -list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator virtual_memory_auto_growth_best_fit_allocator best_fit_allocator) +list( + APPEND + AllocatorFacadeDeps + cpu_allocator + locked_allocator + aligned_allocator + retry_allocator + buffered_allocator + naive_best_fit_allocator + auto_growth_best_fit_allocator + virtual_memory_auto_growth_best_fit_allocator + best_fit_allocator) -if (WITH_ASCEND_CL) - list(APPEND AllocatorFacadeDeps npu_pinned_allocator) +if(WITH_ASCEND_CL) + list(APPEND AllocatorFacadeDeps npu_pinned_allocator) endif() +cc_library( + aligned_allocator + SRCS aligned_allocator.cc + DEPS allocator) +cc_test( + test_aligned_allocator + SRCS test_aligned_allocator.cc + DEPS aligned_allocator) +cc_library( + allocator_strategy + SRCS allocator_strategy.cc + DEPS gflags ${AllocatorFacadeDeps}) +cc_library( + allocator_facade + SRCS allocator_facade.cc + DEPS allocator_strategy stats) -cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) -cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) -cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stats) - -if (WITH_GPU) +if(WITH_GPU) target_link_libraries(allocator_facade cuda_graph) endif() -cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) -if (WITH_TESTING) - if ((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test) +cc_test( + retry_allocator_test + SRCS retry_allocator_test.cc + DEPS retry_allocator locked_allocator cpu_allocator) +if(WITH_TESTING) + if((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test) target_link_libraries(retry_allocator_test cuda_allocator) endif() - if (TEST retry_allocator_test) - set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") + if(TEST retry_allocator_test) + set_tests_properties(retry_allocator_test PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") endif() endif() -cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade) +cc_test( + allocator_facade_abs_flags_test + SRCS allocator_facade_abs_flags_test.cc + DEPS allocator_facade) -cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade) +cc_test( + 
allocator_facade_frac_flags_test + SRCS allocator_facade_frac_flags_test.cc + DEPS allocator_facade) -cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator flags) -cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator) -cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator) +cc_library( + auto_growth_best_fit_allocator + SRCS auto_growth_best_fit_allocator.cc + DEPS allocator aligned_allocator flags) +cc_test( + auto_growth_best_fit_allocator_facade_test + SRCS auto_growth_best_fit_allocator_facade_test.cc + DEPS cpu_allocator auto_growth_best_fit_allocator) +cc_test( + auto_growth_best_fit_allocator_test + SRCS auto_growth_best_fit_allocator_test.cc + DEPS auto_growth_best_fit_allocator) -cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator) +cc_library( + virtual_memory_auto_growth_best_fit_allocator + SRCS virtual_memory_auto_growth_best_fit_allocator.cc + DEPS allocator aligned_allocator) if(NOT WIN32) - cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) - cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) - if (WITH_GPU) - cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator) + cc_library( + mmap_allocator + SRCS mmap_allocator.cc + DEPS allocator) + cc_test( + mmap_allocator_test + SRCS mmap_allocator_test.cc + DEPS mmap_allocator allocator) + if(WITH_GPU) + cc_library( + cuda_ipc_allocator + SRCS cuda_ipc_allocator.cc + DEPS allocator) endif() endif(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 5efbfce7fedd6..52cb4dd18a814 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -33,7 +33,8 @@ #endif PADDLE_DEFINE_EXPORTED_bool( - init_allocated_mem, false, + init_allocated_mem, + false, "It is a mistake that the values of the memory allocated by " "BuddyAllocator are always zeroed in some op's implementation. 
" "To find this error in time, we use init_allocated_mem to indicate " @@ -78,7 +79,8 @@ BuddyAllocator *GetCPUBuddyAllocator() { std::call_once(init_flag, []() { a = new detail::BuddyAllocator( std::unique_ptr(new detail::CPUAllocator), - platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); }); return a; @@ -96,7 +98,8 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { } template <> -void Free(const platform::CPUPlace &place, void *p, +void Free(const platform::CPUPlace &place, + void *p, size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); @@ -126,7 +129,8 @@ void *Alloc(const platform::IPUPlace &place, size_t size) { return p; } template <> -void Free(const platform::IPUPlace &place, void *p, +void Free(const platform::IPUPlace &place, + void *p, size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); @@ -155,7 +159,8 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { ret = xpu_malloc(reinterpret_cast(&p), size); } PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, + ret, + XPU_SUCCESS, platform::errors::External( "XPU API return wrong value[%d], no enough memory", ret)); if (FLAGS_init_allocated_mem) { @@ -172,7 +177,8 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { } template <> -void Free(const platform::XPUPlace &place, void *p, +void Free(const platform::XPUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_XPU VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); @@ -235,11 +241,13 @@ class NPUBuddyAllocatorList { BuddyAllocator *Get(int npu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetNPUDeviceId(devices_[pos]); @@ -247,7 +255,8 @@ class NPUBuddyAllocatorList { new BuddyAllocator(std::unique_ptr( new detail::NPUAllocator(devices_[pos])), platform::NPUMinChunkSize(), - platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE)); + platform::NPUMaxChunkSize(), + EXTRA_PADDING_SIZE)); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "'FLAGS_fraction_of_gpu_memory_to_use' " @@ -313,8 +322,10 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize " "%s, NpuMaxChunkSize %s, NPU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -332,7 +343,8 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { } template <> -void Free(const platform::NPUPlace &place, void *p, +void Free(const platform::NPUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_ASCEND_CL VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -385,7 +397,8 
@@ void *Alloc(const platform::NPUPinnedPlace &place, template <> void Free(const platform::NPUPinnedPlace &place, - void *p, size_t size) { + void *p, + size_t size) { #ifdef PADDLE_WITH_ASCEND_CL GetNPUPinnedBuddyAllocator()->Free(p); #else @@ -431,18 +444,21 @@ class GPUBuddyAllocatorList { BuddyAllocator *Get(int gpu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetDeviceId(devices_[pos]); - allocators_[pos].reset(new BuddyAllocator( - std::unique_ptr( - new detail::GPUAllocator(devices_[pos])), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize())); + allocators_[pos].reset( + new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(devices_[pos])), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize())); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "'FLAGS_fraction_of_gpu_memory_to_use' " @@ -494,8 +510,10 @@ void *Alloc(const platform::CUDAPlace &place, PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " "%s, GpuMaxChunkSize %s, GPU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -516,7 +534,8 @@ void *Alloc(const platform::CUDAPlace &place, } template <> -void Free(const platform::CUDAPlace &place, void *p, +void Free(const platform::CUDAPlace &place, + void *p, size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GetGPUBuddyAllocator(place.device)->Free(p); @@ -585,7 +604,8 @@ void *Alloc(const platform::CUDAPinnedPlace &place, template <> void Free(const platform::CUDAPinnedPlace &place, - void *p, size_t size) { + void *p, + size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -631,18 +651,21 @@ class MLUBuddyAllocatorList { BuddyAllocator *Get(int mlu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetMLUDeviceId(devices_[pos]); - allocators_[pos].reset(new BuddyAllocator( - std::unique_ptr( - new detail::MLUAllocator(devices_[pos])), - platform::MLUMinChunkSize(), platform::MLUMaxChunkSize())); + allocators_[pos].reset( + new BuddyAllocator(std::unique_ptr( + new detail::MLUAllocator(devices_[pos])), + platform::MLUMinChunkSize(), + platform::MLUMaxChunkSize())); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "(mlu reuse gpu GFlags) " @@ -694,8 +717,10 @@ void *Alloc(const platform::MLUPlace &place, size_t size) { 
PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize " "%s, MLUMinChunkSize %s, MLU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -712,7 +737,8 @@ void *Alloc(const platform::MLUPlace &place, size_t size) { } template <> -void Free(const platform::MLUPlace &place, void *p, +void Free(const platform::MLUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_MLU VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -760,10 +786,12 @@ class BuddyAllocatorList { } BuddyAllocator *Get(int dev_id) { - PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(), + PADDLE_ENFORCE_NE(init_flags_.find(dev_id), + init_flags_.end(), platform::errors::OutOfRange( "Cannot find %s %d, please check visible devices.", - device_type_, dev_id)); + device_type_, + dev_id)); std::call_once(*init_flags_[dev_id], [this, dev_id] { phi::DeviceManager::SetDevice(device_type_, dev_id); @@ -774,7 +802,8 @@ class BuddyAllocatorList { new detail::CustomAllocator(device_type_, dev_id)), phi::DeviceManager::GetMinChunkSize(place), phi::DeviceManager::GetMaxChunkSize(place), - phi::DeviceManager::GetExtraPaddingSize(place), device_type_)); + phi::DeviceManager::GetExtraPaddingSize(place), + device_type_)); }); return allocators_[dev_id].get(); @@ -814,8 +843,11 @@ void *Alloc(const platform::CustomPlace &place, PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in %s:%d, avaliable %s, total %s, used " "%s. 
", - string::HumanReadableSize(size), place.GetDeviceType(), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.GetDeviceType(), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(total - avail))); } else { if (FLAGS_init_allocated_mem) { @@ -831,7 +863,8 @@ void *Alloc(const platform::CustomPlace &place, } template <> -void Free(const platform::CustomPlace &place, void *p, +void Free(const platform::CustomPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_CUSTOM_DEVICE VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -923,8 +956,6 @@ namespace allocation { phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size)); auto *tmp_alloc = new Allocation(ptr, size, place_); - platform::MemEvenRecorder::Instance().PushMemRecord( - static_cast(tmp_alloc), place_, size); return tmp_alloc; } @@ -932,8 +963,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) { paddle::platform::VisitPlace( allocation->place(), legacy::FreeVisitor(allocation->ptr(), allocation->size())); - platform::MemEvenRecorder::Instance().PopMemRecord( - static_cast(allocation), place_); delete allocation; } diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 276c6bb0e69b8..f1c0178fafc02 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/memory/allocation/pinned_allocator.h" +#include "paddle/fluid/memory/stats.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" namespace paddle { namespace memory { namespace allocation { @@ -24,6 +26,11 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size()); + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::ReservedFree); delete allocation; } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { @@ -33,6 +40,11 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + platform::RecordMemEvent(ptr, + platform::CUDAPinnedPlace(), + size, + platform::TracerMemEventType::ReservedAllocate); return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h index 71569366c2446..ef999dddf4591 100644 --- a/paddle/fluid/memory/allocation/stat_allocator.h +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -16,6 +16,7 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/stats.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" namespace paddle { namespace memory { @@ -30,16 +31,38 @@ class StatAllocator : public Allocator { protected: void FreeImpl(phi::Allocation* allocation) override { - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - -allocation->size()); + if (platform::is_cpu_place(allocation->place()) || + platform::is_cuda_pinned_place(allocation->place())) { + 
HOST_MEMORY_STAT_UPDATE( + Allocated, allocation->place().GetDeviceId(), -allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE( + Allocated, allocation->place().GetDeviceId(), -allocation->size()); + } + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::Free); underlying_allocator_->Free(allocation); } phi::Allocation* AllocateImpl(size_t size) override { phi::Allocator::AllocationPtr allocation = underlying_allocator_->Allocate(size); - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - allocation->size()); + + const platform::Place& place = allocation->place(); + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + HOST_MEMORY_STAT_UPDATE( + Allocated, place.GetDeviceId(), allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE( + Allocated, place.GetDeviceId(), allocation->size()); + } + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::Allocate); return allocation.release(); } diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 37ac0b4483291..f79b97de18414 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -39,6 +39,7 @@ limitations under the License. */ #endif #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); @@ -62,12 +63,14 @@ void* AlignedMalloc(size_t size) { #else int error = posix_memalign(&p, alignment, size); PADDLE_ENFORCE_EQ( - error, 0, + error, + 0, platform::errors::ResourceExhausted( "Fail to alloc memory of %ld size, error code is %d.", size, error)); #endif - PADDLE_ENFORCE_NOT_NULL(p, platform::errors::ResourceExhausted( - "Fail to alloc memory of %ld size.", size)); + PADDLE_ENFORCE_NOT_NULL(p, + platform::errors::ResourceExhausted( + "Fail to alloc memory of %ld size.", size)); return p; } @@ -92,6 +95,9 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { } } + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate); return p; } @@ -108,6 +114,10 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { #else free(p); #endif + + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree); } bool CPUAllocator::UseGpu() const { return false; } @@ -140,7 +150,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum GPU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -155,20 +166,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - gpu_id_, string::HumanReadableSize(size), gpu_id_, - string::HumanReadableSize(allocated), string::HumanReadableSize(avail), - gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + gpu_id_, + string::HumanReadableSize(size), + gpu_id_, + string::HumanReadableSize(allocated), + string::HumanReadableSize(avail), + gpu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void GPUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(gpu_alloc_size_, size, + PADDLE_ENFORCE_EQ(index, + 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(gpu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, gpu_alloc_size_)); + size, + gpu_alloc_size_)); gpu_alloc_size_ -= size; platform::RecordedGpuFree(p, size, gpu_id_); @@ -205,6 +225,9 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { if (result == gpuSuccess) { *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate); return p; } else { LOG(WARNING) << "cudaHostAlloc failed."; @@ -216,20 +239,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { gpuError_t err; - PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( - "The index should be 1, but got %d", index)); + PADDLE_ENFORCE_EQ(index, + 1, + platform::errors::InvalidArgument( + "The index should be 1, but got %d", index)); - PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size, + PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated cuda pinned memory (%d)", - size, cuda_pinnd_alloc_size_)); + size, + cuda_pinnd_alloc_size_)); cuda_pinnd_alloc_size_ -= size; #ifdef PADDLE_WITH_HIP err = hipHostFree(p); if (err != hipErrorDeinitialized) { PADDLE_ENFORCE_EQ( - err, hipSuccess, + err, + hipSuccess, platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } @@ -243,12 +271,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { // cudaFreeHost succeeds. if (err != cudaErrorCudartUnloading) { PADDLE_ENFORCE_EQ( - err, 0, + err, + 0, platform::errors::Fatal( "cudaFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree); } bool CUDAPinnedAllocator::UseGpu() const { return false; } @@ -279,7 +311,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum GPU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -294,21 +327,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - npu_id_, string::HumanReadableSize(size), npu_id_, - string::HumanReadableSize(avail), npu_id_, - FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + npu_id_, + string::HumanReadableSize(size), + npu_id_, + string::HumanReadableSize(avail), + npu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void NPUAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(npu_alloc_size_, size, + PADDLE_ENFORCE_EQ(index, + 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(npu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, npu_alloc_size_)); + size, + npu_alloc_size_)); npu_alloc_size_ -= size; platform::RecordedNPUFree(p, size, npu_id_); @@ -347,20 +388,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) { void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) { aclError err; - PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( - "The index should be 1, but got %d", index)); + PADDLE_ENFORCE_EQ(index, + 1, + platform::errors::InvalidArgument( + "The index should be 1, but got %d", index)); - PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size, + PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated npu pinned memory (%d)", - size, npu_pinnd_alloc_size_)); + size, + npu_pinnd_alloc_size_)); npu_pinnd_alloc_size_ -= size; err = platform::NPUHostFree(p); if (err != ACL_ERROR_NONE) { PADDLE_ENFORCE_EQ( - err, 0, + err, + 0, platform::errors::Fatal( "NPUHostFree failed in NPUPinnedAllocator, error code is %d", err)); } @@ -395,7 +441,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum MLU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -410,20 +457,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - mlu_id_, string::HumanReadableSize(size), mlu_id_, - string::HumanReadableSize(allocated), string::HumanReadableSize(avail), - mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + mlu_id_, + string::HumanReadableSize(size), + mlu_id_, + string::HumanReadableSize(allocated), + string::HumanReadableSize(avail), + mlu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void MLUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(mlu_alloc_size_, size, + PADDLE_ENFORCE_EQ(index, + 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(mlu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, mlu_alloc_size_)); + size, + mlu_alloc_size_)); mlu_alloc_size_ -= size; platform::RecordedMLUFree(p, size, mlu_id_); @@ -452,7 +508,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { "\n\nOut of memory error on %s %d. " "total memory is %s, used memory is %s, " "available memory is only %s.\n\n", - dev_type_, dev_id_, string::HumanReadableSize(total), + dev_type_, + dev_id_, + string::HumanReadableSize(total), string::HumanReadableSize(total - avail), string::HumanReadableSize(avail))); } @@ -461,13 +519,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { void CustomAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "CustomAllocator::Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(plug_alloc_size, size, + PADDLE_ENFORCE_EQ(index, + 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(plug_alloc_size, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, plug_alloc_size)); + size, + plug_alloc_size)); plug_alloc_size -= size; auto place = platform::CustomPlace(dev_type_, dev_id_); auto device = phi::DeviceManager::GetDeviceWithPlace(place); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 3198b4f8d935e..ae2c0aa612e77 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/common/place.h" #ifdef PADDLE_WITH_XPU @@ -33,8 +33,12 @@ namespace memory { #ifdef PADDLE_WITH_CUSTOM_DEVICE template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place, - const void* src, size_t num, void* stream) { + platform::CPUPlace dst_place, + void* dst, + platform::CustomPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); @@ -52,8 +56,12 @@ void Copy( template <> void Copy( - platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::CustomPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); @@ -70,8 +78,12 @@ void Copy( template <> void Copy( - platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place, - const void* src, size_t num, void* stream) { + platform::CustomPlace dst_place, + void* dst, + platform::CustomPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); @@ -102,9 +114,11 @@ void Copy( #endif // PADDLE_WITH_CUSTOM_DEVICE template <> -void Copy(platform::CPUPlace, void* dst, +void Copy(platform::CPUPlace, + void* dst, platform::CPUPlace, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); @@ -115,7 +129,8 @@ template <> void Copy(platform::IPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -123,7 +138,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::IPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -131,15 +147,18 @@ template <> void Copy(platform::IPUPlace dst_place, void* dst, platform::IPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } // NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace). template <> -void Copy(phi::IPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::IPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (src_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_src; @@ -152,8 +171,10 @@ void Copy(phi::IPUPlace dst_place, void* dst, // NOTE: only for (IPUPlace) -> (CPUPlace and IPUPlace). 
template <> -void Copy(phi::Place dst_place, void* dst, - phi::IPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::IPUPlace src_place, + const void* src, size_t num) { if (dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst; @@ -170,7 +191,8 @@ template <> void Copy(platform::XPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; return; @@ -182,7 +204,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::XPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; return; @@ -194,7 +217,8 @@ template <> void Copy(platform::XPUPlace dst_place, void* dst, platform::XPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; return; @@ -204,8 +228,10 @@ void Copy(platform::XPUPlace dst_place, // NOTE: only for (CPUPlace and XPUPlace) -> (XPUPlace). template <> -void Copy(phi::XPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::XPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (src_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_src; @@ -218,8 +244,10 @@ void Copy(phi::XPUPlace dst_place, void* dst, // NOTE: only for (XPUPlace) -> (CPUPlace and XPUPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::XPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::XPUPlace src_place, + const void* src, size_t num) { if (dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst; @@ -236,7 +264,8 @@ template <> void Copy(platform::NPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -248,7 +277,10 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event( "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation @@ -267,7 +299,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -279,7 +312,10 @@ void Copy(platform::CPUPlace dst_place, if (stream) { platform::RecordEvent record_event( "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -295,7 +331,8 @@ template <> void Copy(platform::NPUPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -307,7 +344,10 @@ void Copy(platform::NPUPlace dst_place, 
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = @@ -329,7 +369,10 @@ void Copy(platform::NPUPlace dst_place, platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = @@ -346,8 +389,11 @@ void Copy(platform::NPUPlace dst_place, template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num) { + platform::CPUPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -356,8 +402,11 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -366,8 +415,11 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, - platform::NPUPinnedPlace src_place, const void* src, size_t num) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -376,8 +428,12 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -389,7 +445,10 @@ void Copy( platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -404,8 +463,12 @@ void Copy( template <> void Copy( - platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num, void* stream) { + platform::NPUPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -417,7 +480,10 @@ void Copy( platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while 
sync operation @@ -435,9 +501,12 @@ void Copy( // NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -504,52 +573,76 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + aclrtStream stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace) template <> -void Copy(phi::NPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::NPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::NPUPlace src_place, const void* src, - size_t num, aclrtStream stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::NPUPlace src_place, + const void* src, + size_t num, + aclrtStream stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace) template <> void Copy(phi::NPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num, + void* dst, + phi::Place src_place, + const void* src, + size_t num, aclrtStream stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::NPUPinnedPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, aclrtStream stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -557,16 +650,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace) -> (NPUPinnedPlace) template <> void Copy(phi::NPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num) { + void* dst, + phi::Place src_place, + const void* src, + size_t num) { 
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); } // NOTE: only for (NPUPinnedPlace) -> (CPUPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::NPUPinnedPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); } #endif @@ -608,8 +705,12 @@ inline void SyncCUDAStream() { template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, void* stream) { + platform::CPUPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); @@ -619,10 +720,16 @@ void Copy( platform::RecordEvent record_event( "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToHost, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToHost, reinterpret_cast(stream)); #endif } else { @@ -642,8 +749,12 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::CUDAPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -653,10 +764,16 @@ void Copy( platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyHostToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyHostToDevice, reinterpret_cast(stream)); #endif } else { @@ -676,8 +793,12 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, void* stream) { + platform::CUDAPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -689,10 +810,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToDevice, reinterpret_cast(stream)); #endif } else { @@ -710,22 +837,29 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU", platform::TracerEventType::UserDefined, 1); - platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, reinterpret_cast(stream)); + platform::GpuMemcpyPeerAsync(dst, + dst_place.device, + src, + src_place.device, + num, + reinterpret_cast(stream)); } else { platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU", 
platform::TracerEventType::UserDefined, 1); - platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, - num); + platform::GpuMemcpyPeerSync( + dst, dst_place.device, src, src_place.device, num); } } } template <> void Copy( - platform::CPUPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + platform::CPUPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -734,8 +868,11 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CPUPlace src_place, const void* src, size_t num) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -744,8 +881,11 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -754,8 +894,12 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPlace src_place, const void* src, size_t num, void* stream) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -765,10 +909,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToHost, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToHost, reinterpret_cast(stream)); #endif } else { @@ -785,8 +935,11 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num, + platform::CUDAPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -798,10 +951,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyHostToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyHostToDevice, reinterpret_cast(stream)); #endif } else { @@ -818,9 +977,12 @@ void Copy( // NOTE: only for CPUPlace、CUDAPlace and CUDAPinnedPlace. 
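// ----------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] The GPU Copy specializations
// reformatted above all branch the same way: when a stream is supplied, the
// copy is enqueued asynchronously under a scoped profiler event; otherwise the
// device context is drained first and a blocking copy is used. A compact
// sketch of that control flow with hypothetical stand-ins (ScopedEvent,
// MemcpyAsync, MemcpySync, WaitDevice are placeholders, not Paddle's or
// CUDA's real API):
#include <cstddef>
#include <cstring>
#include <string>

namespace sketch_stream_branch {

// Stands in for platform::RecordEvent: begins a profiler range on
// construction and ends it when the scope closes.
struct ScopedEvent {
  explicit ScopedEvent(const std::string& name) : name_(name) {}
  std::string name_;
};

// Placeholders for the async and blocking memcpy entry points.
inline void MemcpyAsync(void* dst, const void* src, std::size_t num,
                        void* /*stream*/) {
  std::memcpy(dst, src, num);
}
inline void MemcpySync(void* dst, const void* src, std::size_t num) {
  std::memcpy(dst, src, num);
}
inline void WaitDevice() {}  // placeholder for DeviceContext::Wait()

// The recurring branch shown in the hunks above.
inline void CopyDeviceToHost(void* dst, const void* src, std::size_t num,
                             void* stream) {
  if (num == 0) return;
  if (stream != nullptr) {
    // Asynchronous path: the event brackets only the enqueue of the copy.
    ScopedEvent event("MemcpyAsync:GPU->CPU");
    MemcpyAsync(dst, src, num, stream);
  } else {
    // Synchronous path: drain outstanding work, then block on the copy.
    WaitDevice();
    ScopedEvent event("MemcpySync:GPU->CPU");
    MemcpySync(dst, src, num);
  }
}

}  // namespace sketch_stream_branch
// ----------------------------------------------------------------------------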
template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -887,52 +1049,76 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPlace) template <> -void Copy(phi::GPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::GPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (CUDAPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::GPUPlace src_place, const void* src, - size_t num, void* stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::GPUPlace src_place, + const void* src, + size_t num, + void* stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPinnedPlace) template <> void Copy(phi::GPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num, + void* dst, + phi::Place src_place, + const void* src, + size_t num, void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CUDAPinnedPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::GPUPinnedPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -940,16 +1126,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace) -> (CUDAPinnedPlace) template <> void Copy(phi::GPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num) { + void* dst, + phi::Place src_place, + const void* src, + size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); } // NOTE: only for (CUDAPinnedPlace) -> (CPUPlace) template <> -void Copy(phi::Place 
dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::GPUPinnedPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); } #endif @@ -959,7 +1149,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::MLUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -970,8 +1161,8 @@ void Copy(platform::CPUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyD2HAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyD2HAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -988,7 +1179,8 @@ template <> void Copy(platform::MLUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -999,8 +1191,8 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyH2DAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyH2DAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -1017,7 +1209,8 @@ template <> void Copy(platform::MLUPlace dst_place, void* dst, platform::MLUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -1029,8 +1222,8 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyD2DAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyD2DAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -1050,25 +1243,32 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, reinterpret_cast(stream)); + platform::MLUMemcpyPeerAsync(dst, + dst_place.device, + src, + src_place.device, + num, + reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device, - num); + platform::MLUMemcpyPeerSync( + dst, dst_place.device, src, src_place.device, num); } } } // NOTE: only for CPUPlace and MLUPlace. 
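// ----------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] The functional core of this
// patch is the bookkeeping added in the stat_allocator.h, pinned_allocator.cc
// and system_allocator.cc hunks earlier: on every allocation or free, a host
// or device byte counter is updated depending on the place, and a memory
// tracer event is emitted for the pointer. A minimal standalone sketch of
// that pattern, using invented names (StatCounter, RecordMemEvent, Place are
// stand-ins, not Paddle's HOST/DEVICE_MEMORY_STAT_UPDATE macros or API):
#include <cstddef>
#include <cstdint>

namespace sketch_mem_stats {

enum class DeviceKind { HOST, DEVICE };
enum class MemEventType { Allocate, Free };

// Stand-in counters for the host/device memory stats.
inline std::int64_t& StatCounter(DeviceKind kind) {
  static std::int64_t host_bytes = 0;
  static std::int64_t device_bytes = 0;
  return kind == DeviceKind::HOST ? host_bytes : device_bytes;
}

// Stand-in for the memory tracer; a real tracer would record an event here.
inline void RecordMemEvent(void* /*ptr*/, std::size_t /*size*/,
                           MemEventType /*type*/) {}

struct Place {
  bool is_host;  // CPU and CUDA-pinned memory both count as host memory
  int device_id;
};

// Pick the host or device counter based on the place, then emit an event.
inline void OnAllocate(const Place& place, void* ptr, std::size_t size) {
  DeviceKind kind = place.is_host ? DeviceKind::HOST : DeviceKind::DEVICE;
  StatCounter(kind) += static_cast<std::int64_t>(size);
  RecordMemEvent(ptr, size, MemEventType::Allocate);
}

inline void OnFree(const Place& place, void* ptr, std::size_t size) {
  DeviceKind kind = place.is_host ? DeviceKind::HOST : DeviceKind::DEVICE;
  StatCounter(kind) -= static_cast<std::int64_t>(size);
  RecordMemEvent(ptr, size, MemEventType::Free);
}

}  // namespace sketch_mem_stats
// ----------------------------------------------------------------------------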
template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -1110,35 +1310,55 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace) template <> -void Copy(phi::MLUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::MLUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::MLUPlace src_place, const void* src, - size_t num, void* stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::MLUPlace src_place, + const void* src, + size_t num, + void* stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream. template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream. template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -1146,8 +1366,10 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: Only for CPUPlace, XPUPlace and PinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -1224,16 +1446,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num); } // NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace). 
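// ----------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] The concrete-place overloads
// reformatted above (MLUPlace, NPUPlace, GPUPlace, ...) contain no copy logic
// of their own: each rebuilds a generic Place that preserves the allocation
// type and device id, then delegates to the single generic-Place
// implementation. A hypothetical minimal sketch of that forwarding (Place,
// MluPlace and the Copy signatures are simplified stand-ins, not Paddle's
// real types):
#include <cstddef>
#include <cstring>

namespace sketch_place_forwarding {

enum class AllocationType { CPU, MLU };

// Generic place: carries the allocation type and the device id.
struct Place {
  AllocationType type;
  int device;
  explicit Place(AllocationType t, int d = 0) : type(t), device(d) {}
  AllocationType GetType() const { return type; }
  int GetDeviceId() const { return device; }
};

// Concrete device place, analogous to phi::MLUPlace in the hunks above.
struct MluPlace {
  int device;
  AllocationType GetType() const { return AllocationType::MLU; }
  int GetDeviceId() const { return device; }
};

// The one generic implementation that actually inspects both places.
inline void Copy(const Place& /*dst_place*/, void* dst,
                 const Place& /*src_place*/, const void* src, std::size_t num,
                 void* /*stream*/) {
  if (num == 0) return;
  std::memcpy(dst, src, num);  // placeholder; real code dispatches per device
}

// Concrete-place overload: rebuild a generic Place that keeps the device id
// and forward, mirroring the reformatted MLUPlace/GPUPlace specializations.
inline void Copy(const MluPlace& dst_place, void* dst, const Place& src_place,
                 const void* src, std::size_t num, void* stream) {
  Copy(Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place,
       src, num, stream);
}

}  // namespace sketch_place_forwarding
// ----------------------------------------------------------------------------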
template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num); } @@ -1243,9 +1469,12 @@ void Copy(phi::CPUPlace dst_place, void* dst, !defined(PADDLE_WITH_MLU) template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::CUSTOM) { platform::CPUPlace place_src; @@ -1265,17 +1494,23 @@ void Copy(phi::Place dst_place, void* dst, } template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } #endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt old mode 100644 new mode 100755 index f29546c5210d9..5bd28e3a96307 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,229 +1,463 @@ -proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) +proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto + simple_threadpool) if(WITH_GPU) proto_library(external_error_proto SRCS external_error.proto) endif(WITH_GPU) -if (WITH_PYTHON) +if(WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) - add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) + add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E + touch __init__.py) add_dependencies(profiler_py_proto profiler_py_proto_init) - if (NOT WIN32) - add_custom_command(TARGET profiler_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler - COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler - COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if(NOT WIN32) + add_custom_command( + TARGET profiler_py_proto + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMENT + "Copy generated python proto into directory paddle/fluid/proto/profiler." 
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) else(NOT WIN32) - string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") - add_custom_command(TARGET profiler_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler - COMMAND copy /Y *.py ${proto_dstpath} - COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + string(REPLACE "/" "\\" proto_dstpath + "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") + add_custom_command( + TARGET profiler_py_proto + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT + "Copy generated python proto into directory paddle/fluid/proto/profiler." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) endif() -cc_library(flags SRCS flags.cc DEPS gflags boost) -cc_library(denormal SRCS denormal.cc DEPS) +cc_library( + flags + SRCS flags.cc + DEPS gflags boost) +cc_library( + denormal + SRCS denormal.cc + DEPS) -cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) +cc_test( + errors_test + SRCS errors_test.cc + DEPS errors enforce) set(enforce_deps flags errors boost flags phi_enforce) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() -cc_library(enforce INTERFACE SRCS enforce.cc DEPS ${enforce_deps}) +cc_library( + enforce INTERFACE + SRCS enforce.cc + DEPS ${enforce_deps}) cc_library(monitor SRCS monitor.cc) -cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) +cc_test( + enforce_test + SRCS enforce_test.cc + DEPS stringpiece enforce) set(CPU_INFO_DEPS gflags glog enforce) -IF(WITH_XBYAK) - list(APPEND CPU_INFO_DEPS xbyak) -ENDIF() -cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) -cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) -cc_library(os_info SRCS os_info.cc DEPS enforce) -cc_test(os_info_test SRCS os_info_test.cc DEPS os_info) - -IF(WITH_GPU) - nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) -ELSE() - cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) -ENDIF() - -cc_library(place SRCS place.cc DEPS enforce boost phi_place) -cc_test(place_test SRCS place_test.cc DEPS place glog gflags) - -IF(WITH_MKLDNN) - set(MKLDNN_CTX_DEPS mkldnn) -ELSE() - set(MKLDNN_CTX_DEPS) -ENDIF() +if(WITH_XBYAK) + list(APPEND CPU_INFO_DEPS xbyak) +endif() +cc_library( + cpu_info + SRCS cpu_info.cc + DEPS ${CPU_INFO_DEPS}) +cc_test( + cpu_info_test + SRCS cpu_info_test.cc + DEPS cpu_info) +cc_library( + os_info + SRCS os_info.cc + DEPS enforce) +cc_test( + os_info_test + SRCS os_info_test.cc + DEPS os_info) + +if(WITH_GPU) + nv_library( + cuda_graph_with_memory_pool + SRCS cuda_graph_with_memory_pool.cc + DEPS device_context allocator_facade cuda_graph) +else() + cc_library( + cuda_graph_with_memory_pool + SRCS cuda_graph_with_memory_pool.cc + DEPS device_context allocator_facade) +endif() + +cc_library( + place + SRCS place.cc + DEPS enforce boost phi_place) +cc_test( + place_test + SRCS place_test.cc + DEPS place glog gflags) + +if(WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +else() + set(MKLDNN_CTX_DEPS) +endif() add_subdirectory(device) add_subdirectory(dynload) add_subdirectory(stream) -cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce) 
-cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper) +cc_library( + cpu_helper + SRCS cpu_helper.cc + DEPS cblas enforce) +cc_test( + cpu_helper_test + SRCS cpu_helper_test.cc + DEPS cpu_helper) set(dgc_deps "") -IF(WITH_DGC) - set(dgc_deps dgc) -ENDIF() - -IF(WITH_GPU OR WITH_ROCM) - set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) -ENDIF() - -IF(WITH_IPU) - set(IPU_CTX_DEPS ipu_info) -ELSE() - set(IPU_CTX_DEPS) -ENDIF(WITH_IPU) - -IF(WITH_ASCEND_CL) - set(NPU_CTX_DEPS npu_stream npu_info) -ENDIF() - -IF(WITH_MLU) - set(MLU_CTX_DEPS mlu_device_context) -ENDIF() - -IF(WITH_ASCEND_CL OR WITH_MLU) -cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) -ENDIF() - -IF(WITH_GPU) - nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) -ENDIF() -IF(WITH_ROCM) - hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) -ENDIF() - -IF(WITH_GPU OR WITH_ROCM) +if(WITH_DGC) + set(dgc_deps dgc) +endif() + +if(WITH_GPU OR WITH_ROCM) + set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) +endif() + +if(WITH_IPU) + set(IPU_CTX_DEPS ipu_info) +else() + set(IPU_CTX_DEPS) +endif(WITH_IPU) + +if(WITH_ASCEND_CL) + set(NPU_CTX_DEPS npu_stream npu_info) +endif() + +if(WITH_MLU) + set(MLU_CTX_DEPS mlu_device_context) +endif() + +if(WITH_ASCEND_CL OR WITH_MLU) + cc_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() + +if(WITH_GPU) + nv_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() +if(WITH_ROCM) + hip_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() + +if(WITH_GPU OR WITH_ROCM) set(STREAM_CALLBACK_DEPS stream_callback_manager) -ELSEIF(WITH_ASCEND_CL) +elseif(WITH_ASCEND_CL) set(STREAM_CALLBACK_DEPS stream_callback_manager) -ELSE() +else() set(STREAM_CALLBACK_DEPS) -ENDIF() +endif() if(WITH_GLOO) - cc_library(gloo_context SRCS gloo_context.cc DEPS framework_proto gloo_wrapper enforce) + cc_library( + gloo_context + SRCS gloo_context.cc + DEPS framework_proto gloo_wrapper enforce) endif() -cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) +cc_library( + cudnn_workspace_helper + SRCS cudnn_workspace_helper.cc + DEPS boost) # seperate init from device_context to avoid cycle dependencies -cc_library(init SRCS init.cc DEPS device_context custom_kernel context_pool) +cc_library( + init + SRCS init.cc + DEPS device_context custom_kernel context_pool) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} - ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator) +cc_library( + device_context + SRCS device_context.cc + DEPS simple_threadpool + malloc + xxhash + ${STREAM_CALLBACK_DEPS} + place + phi_place + eigen3 + stringpiece + cpu_helper + cpu_info + framework_proto + ${IPU_CTX_DEPS} + ${GPU_CTX_DEPS} + ${NPU_CTX_DEPS} + ${MKLDNN_CTX_DEPS} + ${dgc_deps} + dlpack + cudnn_workspace_helper + ${XPU_CTX_DEPS} + ${MLU_CTX_DEPS} + eigen3 + cpu_context + generator) if(WITH_XPU) target_link_libraries(device_context xpu_context) 
endif() -cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) +cc_library( + collective_helper + SRCS collective_helper.cc gen_comm_id_helper.cc + DEPS framework_proto device_context enforce) if(WITH_ASCEND_CL) - target_link_libraries(collective_helper npu_collective_helper) + target_link_libraries(collective_helper npu_collective_helper) endif() if(WITH_CNCL) - target_link_libraries(collective_helper mlu_collective_helper) + target_link_libraries(collective_helper mlu_collective_helper) endif() if(WITH_GPU OR WITH_ROCM) - target_link_libraries(device_context gpu_info gpu_context phi_gpu_info) - target_link_libraries(device_context gpu_resource_pool) + target_link_libraries(device_context gpu_info gpu_context phi_gpu_info) + target_link_libraries(device_context gpu_resource_pool) endif() -if (WITH_CUSTOM_DEVICE) - target_link_libraries(device_context custom_context) +if(WITH_CUSTOM_DEVICE) + target_link_libraries(device_context custom_context) endif() if(WITH_ASCEND_CL) - target_link_libraries(device_context npu_resource_pool) + target_link_libraries(device_context npu_resource_pool) endif() if(WITH_MLU) - target_link_libraries(device_context mlu_resource_pool) + target_link_libraries(device_context mlu_resource_pool) endif() if(WITH_CUSTOM_DEVICE) - target_link_libraries(device_context custom_context) + target_link_libraries(device_context custom_context) endif() -cc_test(init_test SRCS init_test.cc DEPS device_context) +cc_test( + init_test + SRCS init_test.cc + DEPS device_context) # Manage all device event library set(DEVICE_EVENT_LIBS) -cc_library(device_event_base SRCS device_event_base.cc DEPS place enforce device_context op_registry) -set(DEVICE_EVENT_LIBS device_event_base CACHE INTERNAL "device event libs") - +cc_library( + device_event_base + SRCS device_event_base.cc + DEPS place enforce device_context op_registry) +set(DEVICE_EVENT_LIBS + device_event_base + CACHE INTERNAL "device event libs") if(WITH_GPU) - nv_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event_base) - set(DEVICE_EVENT_LIBS device_event_gpu CACHE INTERNAL "device event libs") - nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) - - nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) + nv_library( + device_event_gpu + SRCS device_event_gpu.cc + DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") + nv_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu) + + nv_test( + device_context_test + SRCS device_context_test.cu + DEPS device_context gpu_info) + nv_test( + transform_test + SRCS transform_test.cu + DEPS memory place device_context) endif() if(WITH_ROCM) - hip_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event_base) - set(DEVICE_EVENT_LIBS device_event_gpu CACHE INTERNAL "device event libs") - hip_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) - - hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context) + hip_library( + device_event_gpu + SRCS device_event_gpu.cc + DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") + hip_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu) + + 
hip_test( + device_context_test + SRCS device_context_test.cu + DEPS device_context gpu_info) + hip_test( + transform_test + SRCS transform_test.cu + DEPS memory place device_context) endif() cc_library(timer SRCS timer.cc) -cc_test(timer_test SRCS timer_test.cc DEPS timer) - -cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto) -cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer) +cc_test( + timer_test + SRCS timer_test.cc + DEPS timer) + +cc_library( + lodtensor_printer + SRCS lodtensor_printer.cc + DEPS ddim + place + tensor + scope + lod_tensor + variable_helper + framework_proto) +cc_test( + lodtensor_printer_test + SRCS lodtensor_printer_test.cc + DEPS lodtensor_printer) add_subdirectory(profiler) -cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) +cc_library( + device_tracer + SRCS device_tracer.cc + DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler stats) - nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) + nv_library( + profiler + SRCS profiler.cc profiler.cu + DEPS os_info + device_tracer + gpu_info + enforce + dynload_cuda + new_profiler + stats + op_proto_maker + shape_inference) + nv_library( + device_memory_aligment + SRCS device_memory_aligment.cc + DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler stats) - hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) + hip_library( + profiler + SRCS profiler.cc profiler.cu + DEPS os_info + device_tracer + gpu_info + enforce + new_profiler + stats + op_proto_maker + shape_inference) + hip_library( + device_memory_aligment + SRCS device_memory_aligment.cc + DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler stats) - cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) + cc_library( + profiler + SRCS profiler.cc + DEPS os_info + device_tracer + enforce + new_profiler + stats + op_proto_maker + shape_inference) + cc_library( + device_memory_aligment + SRCS device_memory_aligment.cc + DEPS cpu_info place) endif() -cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) -cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) -cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) -cc_test(complex_test SRCS complex_test.cc DEPS lod_tensor) +cc_test( + profiler_test + SRCS profiler_test.cc + DEPS profiler) +cc_test( + float16_test + SRCS float16_test.cc + DEPS lod_tensor) +cc_test( + bfloat16_test + SRCS bfloat16_test.cc + DEPS lod_tensor) +cc_test( + complex_test + SRCS complex_test.cc + DEPS lod_tensor) -IF(WITH_GPU) - nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) - nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor) - nv_test(complex_gpu_test SRCS complex_test.cu DEPS lod_tensor) - nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) - nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) -ENDIF() +if(WITH_GPU) + nv_test( + float16_gpu_test + SRCS float16_test.cu + DEPS lod_tensor) + nv_test( + bfloat16_gpu_test + SRCS bfloat16_test.cu + DEPS lod_tensor) 
+ nv_test( + complex_gpu_test + SRCS complex_test.cu + DEPS lod_tensor) + nv_test( + test_limit_gpu_memory + SRCS test_limit_gpu_memory.cu + DEPS gpu_info flags) + nv_library( + cuda_device_guard + SRCS cuda_device_guard.cc + DEPS gpu_info) +endif() -IF(WITH_ROCM) - hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) - hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) - hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) -ENDIF() +if(WITH_ROCM) + hip_test( + float16_gpu_test + SRCS float16_test.cu + DEPS lod_tensor) + hip_test( + test_limit_gpu_memory + SRCS test_limit_gpu_memory.cu + DEPS gpu_info flags) + hip_library( + cuda_device_guard + SRCS cuda_device_guard.cc + DEPS gpu_info) +endif() if(NOT APPLE AND NOT WIN32) - cc_library(device_code SRCS device_code.cc DEPS device_context) + cc_library( + device_code + SRCS device_code.cc + DEPS device_context) if(WITH_GPU OR WITH_ROCM) - cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor) + cc_test( + device_code_test + SRCS device_code_test.cc + DEPS device_code lod_tensor) endif() endif() diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 89e3b74bb3aca..179c04b75f8f4 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -27,6 +27,7 @@ limitations under the License. */ #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/gpu/gpu_info.h" @@ -49,6 +50,15 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_uint64(gpu_memory_limit_mb); +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, + false, + "Whether to print the message of gpu memory usage " + "at exit, mainly used for UT and CI."); +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, + true, + "Whether to print the message of gpu memory usage " + "MB as a unit of measurement."); + constexpr static float fraction_reserve_gpu_memory = 0.05f; USE_GPU_MEM_STAT; @@ -57,7 +67,10 @@ namespace platform { void GpuMemoryUsage(size_t *available, size_t *total) { size_t actual_available, actual_total; - RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total, + RecordedGpuMemGetInfo(available, + total, + &actual_available, + &actual_total, platform::GetCurrentDeviceId()); } @@ -85,17 +98,20 @@ size_t GpuMaxAllocSize() { static size_t GpuAllocSize(bool realloc) { size_t available_to_alloc = GpuAvailableMemToAlloc(); PADDLE_ENFORCE_GT( - available_to_alloc, 0, + available_to_alloc, + 0, platform::errors::ResourceExhausted("Not enough available GPU memory.")); // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be // allocated by fraction size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb : FLAGS_initial_gpu_memory_in_mb; size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); + (flag_mb > 0ul + ? 
flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE( - available_to_alloc, alloc_bytes, + available_to_alloc, + alloc_bytes, platform::errors::ResourceExhausted("Not enough available GPU memory.")); VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) << " MiB, is it Re-alloc: " << realloc; @@ -153,13 +169,16 @@ class RecordedGpuMallocHelper { }); PADDLE_ENFORCE_GE( - dev_id, 0, + dev_id, + 0, platform::errors::OutOfRange( "Device id must be not less than 0, but got %d.", dev_id)); PADDLE_ENFORCE_LT( - dev_id, instances_.size(), + dev_id, + instances_.size(), platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", - dev_id, instances_.size())); + dev_id, + instances_.size())); return instances_[dev_id].get(); } @@ -168,7 +187,8 @@ class RecordedGpuMallocHelper { * or cudaSuccess would be returned, and the cudaGetLastError() flag * would be clear. */ - gpuError_t Malloc(void **ptr, size_t size, + gpuError_t Malloc(void **ptr, + size_t size, bool malloc_managed_memory = false) { LockGuardPtr lock(mtx_); if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { @@ -196,8 +216,11 @@ class RecordedGpuMallocHelper { if (result == gpuSuccess) { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, size); - + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); + platform::RecordMemEvent(ptr, + GPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedAllocate); #ifdef PADDLE_WITH_TESTING gpu_ptrs.insert(*ptr); #endif @@ -235,7 +258,11 @@ class RecordedGpuMallocHelper { PADDLE_ENFORCE_GPU_SUCCESS(err); cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + platform::RecordMemEvent(ptr, + GPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedFree); } else { platform::GpuGetLastError(); // clear the error flag when // cudaErrorCudartUnloading / @@ -261,7 +288,9 @@ class RecordedGpuMallocHelper { #endif } - bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, + bool GetMemInfo(size_t *avail, + size_t *total, + size_t *actual_avail, size_t *actual_total) { { CUDADeviceGuard guard(dev_id_); @@ -296,7 +325,8 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 - CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, + CUresult MemCreate(CUmemGenericAllocationHandle *handle, + size_t size, const CUmemAllocationProp *prop, unsigned long long flags) { // NOLINT auto result = @@ -335,7 +365,9 @@ std::once_flag RecordedGpuMallocHelper::once_flag_; std::vector> RecordedGpuMallocHelper::instances_; -gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id, +gpuError_t RecordedGpuMalloc(void **ptr, + size_t size, + int dev_id, bool malloc_managed_memory) { return RecordedGpuMallocHelper::Instance(dev_id)->Malloc( ptr, size, malloc_managed_memory); @@ -347,22 +379,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) { #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 -CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, +CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, + size_t size, const CUmemAllocationProp *prop, - unsigned long long flags, int dev_id) { // NOLINT - return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size, - prop, flags); + unsigned 
long long flags, + int dev_id) { // NOLINT + return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate( + handle, size, prop, flags); } -CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, +CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, + size_t size, int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); } #endif #endif -bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total, int dev_id) { +bool RecordedGpuMemGetInfo(size_t *avail, + size_t *total, + size_t *actual_avail, + size_t *actual_total, + int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo( avail, total, actual_avail, actual_total); } @@ -457,26 +495,35 @@ void GpuDestroyStream(gpuStream_t stream) { void GpuDeviceSync() { phi::backends::gpu::GpuDeviceSync(); } -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - gpuMemcpyKind kind, gpuStream_t stream) { +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream); } -void GpuMemcpySync(void *dst, const void *src, size_t count, +void GpuMemcpySync(void *dst, + const void *src, + size_t count, gpuMemcpyKind kind) { phi::backends::gpu::GpuMemcpySync(dst, src, count, kind); } -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { - phi::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device, - count, stream); +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + phi::backends::gpu::GpuMemcpyPeerAsync( + dst, dst_device, src, src_device, count, stream); } -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { - phi::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device, - count); +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + phi::backends::gpu::GpuMemcpyPeerSync( + dst, dst_device, src, src_device, count); } void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 8fa48ffcfb158..6467c4fdc403c 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -29,12 +29,16 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/nvtx.h" #endif +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" -PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, +PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, + false, "Enable rpc profiler or not."); -DEFINE_bool(enable_host_event_recorder_hook, false, +DEFINE_bool(enable_host_event_recorder_hook, + false, "enable HostEventRecorder, hook Profiler"); namespace paddle { @@ -42,8 +46,11 @@ namespace platform { MemEvenRecorder MemEvenRecorder::recorder; -Event::Event(EventType type, std::string name, uint32_t thread_id, - EventRole role, std::string attr) +Event::Event(EventType type, + std::string name, + uint32_t thread_id, + EventRole role, + std::string attr) : type_(type), name_(name), thread_id_(thread_id), @@ -67,8 +74,10 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -RecordEvent::RecordEvent(const char *name, const TracerEventType type, - uint32_t level, const EventRole role) { +RecordEvent::RecordEvent(const char *name, + const TracerEventType type, + uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -99,8 +108,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type, start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, - uint32_t level, const EventRole role) { +RecordEvent::RecordEvent(const std::string &name, + const TracerEventType type, + uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -129,8 +140,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const std::string &attr, - const TracerEventType type, uint32_t level, +RecordEvent::RecordEvent(const std::string &name, + const std::string &attr, + const TracerEventType type, + uint32_t level, const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA @@ -195,11 +208,11 @@ void RecordEvent::End() { shallow_copy_name_, start_ns_, end_ns, role_, type_); } else if (name_ != nullptr) { if (attr_ == nullptr) { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, type_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, type_); } else { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, type_, *attr_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, type_, *attr_); delete attr_; } delete name_; @@ -214,8 +227,8 @@ void RecordEvent::End() { DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { uint64_t end_ns = PosixInNsec(); - tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), - g_thread_id); + tracer->AddCPURecords( + CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id); } ClearCurAnnotation(); PopEvent(*name_, role_); @@ -225,30 +238,217 @@ void RecordEvent::End() { is_enabled_ = false; } -RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, +RecordInstantEvent::RecordInstantEvent(const char *name, + TracerEventType type, uint32_t level) { if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { return; } auto start_end_ns = PosixInNsec(); - HostEventRecorder::GetInstance().RecordEvent(name, start_end_ns, start_end_ns, - 
EventRole::kOrdinary, type); + HostEventRecorder::GetInstance().RecordEvent( + name, start_end_ns, start_end_ns, EventRole::kOrdinary, type); +} + +RecordOpInfoSupplement::RecordOpInfoSupplement( + const std::string &type, + const framework::AttributeMap &attrs, + const framework::InferShapeContext &shape_ctx, + const framework::RuntimeContext &ctx) { + if (FLAGS_enable_host_event_recorder_hook == false) { + return; + } + std::map> input_shapes; + std::map> dtypes; + for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) { + input_shapes[it->first] = shape_ctx.GetInputsDim(it->first); + dtypes[it->first] = shape_ctx.GetInputsVarType(it->first); + } + + const std::vector *callstack_ptr = nullptr; + std::vector callstack; + auto iter = attrs.find( + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack_ptr = &BOOST_GET_CONST(std::vector, iter->second); + callstack = *callstack_ptr; + } + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), type, input_shapes, dtypes, callstack); +} + +RecordMemEvent::RecordMemEvent(const void *ptr, + const phi::Place &place, + size_t size, + const TracerMemEventType type) { + if (g_state == ProfilerState::kDisabled && + FLAGS_enable_host_event_recorder_hook == false) { + return; + } + if (type == TracerMemEventType::Allocate) { + uint64_t current_allocated; + uint64_t peak_allocated; + uint64_t current_reserved = 0; // 0 means keep the same as before + uint64_t peak_reserved = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } else { + current_allocated = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PushMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::ReservedAllocate) { + uint64_t current_reserved; + uint64_t peak_reserved; + uint64_t current_allocated = 0; // 0 means keep the same as before + uint64_t peak_allocated = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PushMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::Free) { + uint64_t current_allocated; + uint64_t peak_allocated; + uint64_t current_reserved = 0; // 0 means keep the same as before + uint64_t peak_reserved = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } else { + current_allocated = + 
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PopMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::ReservedFree) { + uint64_t current_reserved; + uint64_t peak_reserved; + uint64_t current_allocated = 0; // 0 means keep the same as before + uint64_t peak_allocated = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PopMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } } -void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, +void MemEvenRecorder::PushMemRecord(const void *ptr, + const Place &place, size_t size) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled) { + return; + } std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; - PADDLE_ENFORCE_EQ(events.count(ptr), 0, + PADDLE_ENFORCE_EQ(events.count(ptr), + 0, platform::errors::InvalidArgument( "The Place can't exist in the stage of PushMemRecord")); - events.emplace(ptr, std::unique_ptr( - new MemEvenRecorder::RecordMemEvent(place, size))); + events.emplace(ptr, + std::unique_ptr( + new MemEvenRecorder::RecordMemEvent(place, size))); +} + +void MemEvenRecorder::PushMemRecord(const void *ptr, + const Place &place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) { + std::lock_guard guard(mtx_); + if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), + reinterpret_cast(ptr), + type, + size, + place, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + return; + } + if (type == TracerMemEventType::ReservedAllocate) { + // old profiler only analyse memory managed by paddle. 
+ return; + } + if (g_state == ProfilerState::kDisabled) return; + auto &events = address_memevent_[place]; + PADDLE_ENFORCE_EQ(events.count(ptr), + 0, + platform::errors::InvalidArgument( + "The Place can't exist in the stage of PushMemRecord")); + events.emplace(ptr, + std::unique_ptr( + new MemEvenRecorder::RecordMemEvent(place, size))); } void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled) { + return; + } std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; auto iter = events.find(ptr); @@ -258,6 +458,41 @@ void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { } } +void MemEvenRecorder::PopMemRecord(const void *ptr, + const Place &place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) { + std::lock_guard guard(mtx_); + if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), + reinterpret_cast(ptr), + type, + -size, + place, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + return; + } + if (type == TracerMemEventType::ReservedFree) { + // old profiler only analyse memory managed by paddle. + return; + } + if (g_state == ProfilerState::kDisabled) return; + auto &events = address_memevent_[place]; + auto iter = events.find(ptr); + // The ptr maybe not in address_memevent + if (iter != events.end()) { + events.erase(iter); + } +} + void MemEvenRecorder::Flush() { std::lock_guard guard(mtx_); address_memevent_.clear(); @@ -278,8 +513,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { auto annotation_free = CurAnnotationName(); if (tracer) { - tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_, - annotation_free, g_mem_thread_id); + tracer->AddMemInfoRecord(start_ns_, + end_ns_, + bytes_, + place_, + alloc_in_, + annotation_free, + g_mem_thread_id); } PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); } @@ -306,22 +546,38 @@ RecordBlock::~RecordBlock() { if (tracer) { // We try to put all blocks at the same nested depth in the // same timeline lane. and distinguish the using thread_id. 
- tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), - g_thread_id); + tracer->AddCPURecords( + name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id); } ClearCurBlock(); } -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, - place, g_mem_thread_id, annotation); -} - -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, - g_mem_thread_id, annotation); +void PushMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place &place, + const std::string &annotation) { + GetMemEventList().Record(EventType::kPushRange, + start_ns, + end_ns, + bytes, + place, + g_mem_thread_id, + annotation); +} + +void PopMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place &place, + const std::string &annotation) { + GetMemEventList().Record(EventType::kPopRange, + start_ns, + end_ns, + bytes, + place, + g_mem_thread_id, + annotation); } void Mark(const std::string &name) { @@ -333,17 +589,19 @@ void Mark(const std::string &name) { GetEventList().Record(EventType::kMark, name, g_thread_id); } -Event *PushEvent(const std::string &name, const EventRole role, +Event *PushEvent(const std::string &name, + const EventRole role, std::string attr) { - return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role, - attr); + return GetEventList().Record( + EventType::kPushRange, name, g_thread_id, role, attr); } void PopEvent(const std::string &name, const EventRole role, std::string attr) { GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr); } void EnableProfiler(ProfilerState state) { - PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, + PADDLE_ENFORCE_NE(state, + ProfilerState::kDisabled, platform::errors::InvalidArgument( "Can't enable profiling, since the input state is" "ProfilerState::kDisabled")); @@ -379,7 +637,8 @@ void ResetProfiler() { (*it)->Clear(); } for (auto it = g_all_mem_event_lists.begin(); - it != g_all_mem_event_lists.end(); ++it) { + it != g_all_mem_event_lists.end(); + ++it) { (*it)->Clear(); } } @@ -573,8 +832,8 @@ static void EmulateEventPushAndPop(const HostEventSection &host_sec, std::string name = prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name; const char *attr = (evt.attr == nullptr ? "none" : evt.attr); - Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid, - evt.role, attr); + Event *orig_evt = cur_thr_list->Record( + EventType::kPushRange, name, tid, evt.role, attr); (*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns); cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr); } @@ -589,8 +848,8 @@ static void EmulateCPURecordsAdd(const HostEventSection &host_sec) { for (const auto &thr_sec : host_sec.thr_sections) { uint64_t tid = thr_sec.thread_id; for (const auto &evt : thr_sec.events) { - tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(), - tid); + tracer->AddCPURecords( + evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid); } } } diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 78275341cbbf7..4773b1a177ba0 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -30,6 +30,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -102,6 +104,22 @@ struct MemEvenRecorder { public: void PushMemRecord(const void* ptr, const Place& place, size_t size); void PopMemRecord(const void* ptr, const Place& place); + void PushMemRecord(const void* ptr, + const Place& place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved); + void PopMemRecord(const void* ptr, + const Place& place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved); void Flush(); static MemEvenRecorder& Instance() { return recorder; } @@ -160,7 +178,8 @@ struct EventList { std::vector Reduce() { std::vector result; for (auto& block : event_blocks) { - result.insert(result.begin(), std::make_move_iterator(block.begin()), + result.insert(result.begin(), + std::make_move_iterator(block.begin()), std::make_move_iterator(block.end())); } event_blocks.clear(); @@ -173,13 +192,21 @@ struct EventList { }; void Mark(const std::string& name); -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place, const std::string& annotation); -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place, const std::string& annotation); -Event* PushEvent(const std::string& name, const EventRole role, +void PushMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place& place, + const std::string& annotation); +void PopMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place& place, + const std::string& annotation); +Event* PushEvent(const std::string& name, + const EventRole role, const std::string attr = "none"); -void PopEvent(const std::string& name, const EventRole role, +void PopEvent(const std::string& name, + const EventRole role, const std::string attr = "none"); // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. 
diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 084bc44dbc78b..1daed7db1e701 100755 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,14 +1,52 @@ -cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) -cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) +cc_library( + host_tracer + SRCS host_tracer.cc + DEPS enforce ddim var_type_traits) +cc_library( + cuda_tracer + SRCS cuda_tracer.cc cupti_data_process.cc + DEPS workqueue_utils enforce glog) add_subdirectory(mlu) -cc_library(event_node SRCS event_node.cc DEPS enforce) -cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) +cc_library( + event_node + SRCS event_node.cc + DEPS enforce place) +cc_library( + profiler_utils + SRCS utils.cc + DEPS enforce glog) add_subdirectory(dump) -cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) -cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) -cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer) -cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) -cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) -cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler) +cc_library( + profiler_logger + SRCS chrometracing_logger.cc dump/serialization_logger.cc + dump/deserialization_reader.cc + DEPS nodetreeproto event_node profiler_utils) +cc_library( + event_bind + SRCS event_python.cc + DEPS profiler_logger) +cc_library( + cpu_utilization + SRCS cpu_utilization.cc + DEPS cpu_info os_info enforce glog) +cc_library( + new_profiler + SRCS profiler.cc + DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind + mlu_tracer) +cc_test( + test_event_node + SRCS test_event_node.cc + DEPS event_node profiler_logger) +cc_test( + test_extra_info + SRCS test_extra_info.cc + DEPS profiler_utils) +cc_test( + test_serialization_logger + SRCS dump/test_serialization_logger.cc + DEPS event_bind) +cc_test( + new_profiler_test + SRCS profiler_test.cc + DEPS new_profiler) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 1e26c0a94408c..e8fe541272137 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include #include +#include #include "glog/logging.h" @@ -128,27 +129,32 @@ void ChromeTracingLogger::LogMemTraceEventNode( std::string( R"JSON( { - "name": "[memory]", "pid": %lld, "tid": "%lld", + "name": "[memory]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "ph": "i", "cat": "%s", "args": { "place": "%s", "addr": "%llu", + "increase_bytes": %lld, "current_allocated": %llu, "current_reserved": %llu, - "increase_bytes": %lld + "peak_allocated": %llu, + "peak_reserved": %llu } }, )JSON"), mem_node.ProcessId(), mem_node.ThreadId(), - mem_node.TimeStampNs(), + nsToUs(mem_node.TimeStampNs()), StringTracerMemEventType(mem_node.Type()), mem_node.Place().c_str(), mem_node.Addr(), + mem_node.IncreaseBytes(), mem_node.CurrentAllocated(), mem_node.CurrentReserved(), - mem_node.IncreaseBytes()); + mem_node.PeakAllocated(), + mem_node.PeakReserved()); + pid_tid_set_.insert({mem_node.ProcessId(), mem_node.ThreadId()}); } void ChromeTracingLogger::LogHostTraceEventNode( @@ -172,6 +178,8 @@ void ChromeTracingLogger::LogHostTraceEventNode( input_shapes = op_supplement_node->InputShapes(); input_dtypes = op_supplement_node->Dtypes(); callstack = op_supplement_node->CallStack(); + callstack = std::regex_replace(callstack, std::regex("\""), "\'"); + callstack = std::regex_replace(callstack, std::regex("\n"), "\\n"); } switch (host_node.Type()) { case TracerEventType::ProfileStep: diff --git a/paddle/fluid/platform/profiler/common_event.h b/paddle/fluid/platform/profiler/common_event.h index cfdc3be110a5b..3e166d1d04db9 100644 --- a/paddle/fluid/platform/profiler/common_event.h +++ b/paddle/fluid/platform/profiler/common_event.h @@ -17,16 +17,22 @@ #include #include #include + +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later #include "paddle/fluid/platform/profiler/trace_event.h" +#include "paddle/phi/core/ddim.h" namespace paddle { namespace platform { struct CommonEvent { public: - CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type) + CommonEvent(const char *name, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type) : name(name), start_ns(start_ns), end_ns(end_ns), @@ -34,8 +40,12 @@ struct CommonEvent { type(type) {} CommonEvent(std::function arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type, const std::string &attr_str) + const std::string &name_str, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type, + const std::string &attr_str) : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { auto buf = static_cast(arena_allocator(name_str.length() + 1)); strncpy(buf, name_str.c_str(), name_str.length() + 1); @@ -46,8 +56,11 @@ struct CommonEvent { } CommonEvent(std::function arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type) + const std::string &name_str, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type) : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { auto buf = static_cast(arena_allocator(name_str.length() + 1)); strncpy(buf, name_str.c_str(), name_str.length() + 1); @@ -62,5 +75,61 @@ struct CommonEvent { const char *attr = nullptr; // not owned, designed for performance }; +struct CommonMemEvent { + public: + CommonMemEvent(uint64_t timestamp_ns, + uint64_t addr, + TracerMemEventType type, + int64_t 
increase_bytes, + const Place &place, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) + : timestamp_ns(timestamp_ns), + addr(addr), + type(type), + increase_bytes(increase_bytes), + place(place), + peak_allocated(peak_allocated), + peak_reserved(peak_reserved) {} + uint64_t timestamp_ns; + uint64_t addr; + TracerMemEventType type; + int64_t increase_bytes; + Place place; + uint64_t current_allocated; + uint64_t current_reserved; + uint64_t peak_allocated; + uint64_t peak_reserved; +}; + +struct OperatorSupplementOriginEvent { + public: + OperatorSupplementOriginEvent( + std::function arena_allocator, + uint64_t timestamp_ns, + const std::string &type_name, + const std::map> &input_shapes, + const std::map> + &dtypes, + const std::vector callstack) + : timestamp_ns(timestamp_ns), + input_shapes(input_shapes), + dtypes(dtypes), + callstack(callstack) { + auto buf = static_cast(arena_allocator(type_name.length() + 1)); + strncpy(buf, type_name.c_str(), type_name.length() + 1); + op_type = buf; + } + uint64_t timestamp_ns; + const char *op_type = nullptr; // not owned, designed for performance + // input shapes + std::map> input_shapes; + std::map> dtypes; + // call stack + const std::vector callstack; +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index de3411579d3e9..d17aa9e9ce2aa 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -45,7 +45,8 @@ std::unique_ptr DeserializationReader::Parse() { ExtraInfo extrainfo; for (auto indx = 0; indx < node_trees_proto_->extra_info_size(); indx++) { ExtraInfoMap extra_info_map = node_trees_proto_->extra_info(indx); - extrainfo.AddExtraInfo(extra_info_map.key(), std::string("%s"), + extrainfo.AddExtraInfo(extra_info_map.key(), + std::string("%s"), extra_info_map.value().c_str()); } // restore NodeTrees @@ -90,6 +91,26 @@ std::unique_ptr DeserializationReader::Parse() { device_node); // insert into runtime_node } } + // handle mem node + for (int mem_node_index = 0; + mem_node_index < host_node_proto.mem_nodes_size(); + mem_node_index++) { + const MemTraceEventNodeProto& mem_node_proto = + host_node_proto.mem_nodes(mem_node_index); + MemTraceEventNode* mem_node = RestoreMemTraceEventNode(mem_node_proto); + host_node->AddMemNode(mem_node); + } + // handle op supplement node + for (int op_supplement_node_index = 0; + op_supplement_node_index < + host_node_proto.op_supplement_nodes_size(); + op_supplement_node_index++) { + const OperatorSupplementEventNodeProto& op_supplement_node_proto = + host_node_proto.op_supplement_nodes(op_supplement_node_index); + OperatorSupplementEventNode* op_supplement_node = + RestoreOperatorSupplementEventNode(op_supplement_node_proto); + host_node->SetOperatorSupplementNode(op_supplement_node); + } } // restore parent-child relationship for (auto it = child_parent_map.begin(); it != child_parent_map.end(); @@ -174,6 +195,64 @@ HostTraceEventNode* DeserializationReader::RestoreHostTraceEventNode( return new HostTraceEventNode(host_event); } +MemTraceEventNode* DeserializationReader::RestoreMemTraceEventNode( + const MemTraceEventNodeProto& mem_node_proto) { + const MemTraceEventProto& mem_event_proto = mem_node_proto.mem_event(); + MemTraceEvent mem_event; + mem_event.timestamp_ns = mem_event_proto.timestamp_ns(); + mem_event.addr 
= mem_event_proto.addr(); + mem_event.type = static_cast(mem_event_proto.type()); + mem_event.process_id = mem_event_proto.process_id(); + mem_event.thread_id = mem_event_proto.thread_id(); + mem_event.increase_bytes = mem_event_proto.increase_bytes(); + mem_event.place = mem_event_proto.place(); + mem_event.current_allocated = mem_event_proto.current_allocated(); + mem_event.current_reserved = mem_event_proto.current_reserved(); + mem_event.peak_allocated = mem_event_proto.peak_allocated(); + mem_event.peak_reserved = mem_event_proto.peak_reserved(); + return new MemTraceEventNode(mem_event); +} + +OperatorSupplementEventNode* +DeserializationReader::RestoreOperatorSupplementEventNode( + const OperatorSupplementEventNodeProto& op_supplement_node_proto) { + const OperatorSupplementEventProto& op_supplement_event_proto = + op_supplement_node_proto.op_supplement_event(); + OperatorSupplementEvent op_supplement_event; + op_supplement_event.timestamp_ns = op_supplement_event_proto.timestamp_ns(); + op_supplement_event.op_type = op_supplement_event_proto.op_type(); + op_supplement_event.callstack = op_supplement_event_proto.callstack(); + op_supplement_event.process_id = op_supplement_event_proto.process_id(); + op_supplement_event.thread_id = op_supplement_event_proto.thread_id(); + std::map>> input_shapes; + std::map> dtypes; + auto input_shape_proto = op_supplement_event_proto.input_shapes(); + for (int i = 0; i < input_shape_proto.key_size(); i++) { + auto input_shape_vec = input_shapes[input_shape_proto.key(i)]; + auto shape_vectors_proto = input_shape_proto.shape_vecs(i); + for (int j = 0; j < shape_vectors_proto.shapes_size(); j++) { + auto shape_vector_proto = shape_vectors_proto.shapes(j); + std::vector shape; + for (int k = 0; k < shape_vector_proto.size_size(); k++) { + shape.push_back(shape_vector_proto.size(k)); + } + input_shape_vec.push_back(shape); + } + } + op_supplement_event.input_shapes = input_shapes; + auto dtype_proto = op_supplement_event_proto.dtypes(); + for (int i = 0; i < dtype_proto.key_size(); i++) { + auto dtype_vec = dtypes[dtype_proto.key(i)]; + auto dtype_vec_proto = dtype_proto.dtype_vecs(i); + for (int j = 0; j < dtype_vec_proto.dtype_size(); j++) { + auto dtype_string = dtype_vec_proto.dtype(j); + dtype_vec.push_back(dtype_string); + } + } + op_supplement_event.dtypes = dtypes; + return new OperatorSupplementEventNode(op_supplement_event); +} + KernelEventInfo DeserializationReader::HandleKernelEventInfoProto( const DeviceTraceEventProto& device_event_proto) { const KernelEventInfoProto& kernel_info_proto = @@ -203,11 +282,14 @@ MemcpyEventInfo DeserializationReader::HandleMemcpyEventInfoProto( device_event_proto.memcpy_info(); MemcpyEventInfo memcpy_info; memcpy_info.num_bytes = memcpy_info_proto.num_bytes(); - std::strncpy(memcpy_info.copy_kind, memcpy_info_proto.copy_kind().c_str(), + std::strncpy(memcpy_info.copy_kind, + memcpy_info_proto.copy_kind().c_str(), kMemKindMaxLen - 1); - std::strncpy(memcpy_info.src_kind, memcpy_info_proto.src_kind().c_str(), + std::strncpy(memcpy_info.src_kind, + memcpy_info_proto.src_kind().c_str(), kMemKindMaxLen - 1); - std::strncpy(memcpy_info.dst_kind, memcpy_info_proto.dst_kind().c_str(), + std::strncpy(memcpy_info.dst_kind, + memcpy_info_proto.dst_kind().c_str(), kMemKindMaxLen - 1); return memcpy_info; } @@ -218,7 +300,8 @@ MemsetEventInfo DeserializationReader::HandleMemsetEventInfoProto( device_event_proto.memset_info(); MemsetEventInfo memset_info; memset_info.num_bytes = memset_info_proto.num_bytes(); - 
std::strncpy(memset_info.memory_kind, memset_info_proto.memory_kind().c_str(), + std::strncpy(memset_info.memory_kind, + memset_info_proto.memory_kind().c_str(), kMemKindMaxLen - 1); memset_info.value = memset_info_proto.value(); return memset_info; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index e6feb4f9489e8..7df93b7703c32 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -36,6 +36,9 @@ class DeserializationReader { KernelEventInfo HandleKernelEventInfoProto(const DeviceTraceEventProto&); MemcpyEventInfo HandleMemcpyEventInfoProto(const DeviceTraceEventProto&); MemsetEventInfo HandleMemsetEventInfoProto(const DeviceTraceEventProto&); + MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); + OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( + const OperatorSupplementEventNodeProto&); std::string filename_; std::ifstream input_file_stream_; NodeTreesProto* node_trees_proto_; diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index 7016745059d40..4ebfb6e73b331 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -46,6 +46,19 @@ enum TracerEventTypeProto { PythonOp = 13; // Used to mark python level userdefined PythonUserDefined = 14; + // Used to mark mlu runtime record returned by cnpapi + MluRuntime = 15; +}; + +enum TracerMemEventTypeProto { + // Used to mark memory allocation which is managed by paddle + Allocate = 0; + // Used to mark memory free which is managed by paddle + Free = 1; + // Used to mark reserved memory allocation which is applied from device. + ReservedAllocate = 2; + // Used to mark reserved memory free which is released to device. 
+ ReservedFree = 3; }; message KernelEventInfoProto { @@ -121,6 +134,62 @@ message HostTraceEventProto { required uint64 thread_id = 6; } +message MemTraceEventProto { + // timestamp of the record + required uint64 timestamp_ns = 1; + // memory manipulation type + required TracerMemEventTypeProto type = 2; + // memory addr of allocation or free + required uint64 addr = 3; + // process id of the record + required uint64 process_id = 4; + // thread id of the record + required uint64 thread_id = 5; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + required int64 increase_bytes = 6; + // place + required string place = 7; + // current total allocated memory + required uint64 current_allocated = 8; + // current total reserved memory + required uint64 current_reserved = 9; + // current peak allocated memory + required uint64 peak_allocated = 10; + // current peak reserved memory + required uint64 peak_reserved = 11; +} + +message OperatorSupplementEventProto { + // timestamp of the record + required uint64 timestamp_ns = 1; + // op type name + required string op_type = 2; + // process id of the record + required uint64 process_id = 3; + // thread id of the record + required uint64 thread_id = 4; + // input shapes + message input_shape_proto { + repeated string key = 1; + message shape_vector { + message shape { repeated uint64 size = 1; } + repeated shape shapes = 1; + } + repeated shape_vector shape_vecs = 2; + } + required input_shape_proto input_shapes = 5; + // dtypes + message dtype_proto { + repeated string key = 1; + message dtype_vector { repeated string dtype = 1; } + repeated dtype_vector dtype_vecs = 2; + } + required dtype_proto dtypes = 6; + // call stack + required string callstack = 7; +} + message CudaRuntimeTraceEventProto { // record name required string name = 1; @@ -166,6 +235,12 @@ message DeviceTraceEventProto { } } +message OperatorSupplementEventNodeProto { + required OperatorSupplementEventProto op_supplement_event = 1; +} + +message MemTraceEventNodeProto { required MemTraceEventProto mem_event = 1; } + message DeviceTraceEventNodeProto { required DeviceTraceEventProto device_event = 1; } @@ -180,6 +255,9 @@ message HostTraceEventNodeProto { required int64 parentid = 2; required HostTraceEventProto host_trace_event = 3; repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4; + // below is added in version 1.0.1 + repeated MemTraceEventNodeProto mem_nodes = 5; + repeated OperatorSupplementEventNodeProto op_supplement_nodes = 6; } message ThreadNodeTreeProto { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index 73021f4362af5..cbb86e76d3a1e 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -20,19 +20,19 @@ namespace paddle { namespace platform { static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; -static const char* version = "1.0.0"; +static const char* version = "1.0.1"; static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); - return string_format(std::string(kDefaultFilename), pid, - GetStringFormatLocalTime().c_str()); + return string_format( + std::string(kDefaultFilename), pid, GetStringFormatLocalTime().c_str()); } void SerializationLogger::OpenFile() { - output_file_stream_.open(filename_, std::ofstream::out | - std::ofstream::trunc | - std::ofstream::binary); + 
output_file_stream_.open( + filename_, + std::ofstream::out | std::ofstream::trunc | std::ofstream::binary); if (!output_file_stream_) { LOG(WARNING) << "Unable to open file for writing profiling data." << std::endl; @@ -50,7 +50,8 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { thread2host_event_nodes = node_trees.Traverse(true); for (auto it = thread2host_event_nodes.begin(); - it != thread2host_event_nodes.end(); ++it) { + it != thread2host_event_nodes.end(); + ++it) { // 1. order every node an index, every node a parent std::map node_index_map; std::map node_parent_map; @@ -64,7 +65,8 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { for (auto hostnode = it->second.begin(); hostnode != it->second.end(); ++hostnode) { for (auto childnode = (*hostnode)->GetChildren().begin(); - childnode != (*hostnode)->GetChildren().end(); ++childnode) { + childnode != (*hostnode)->GetChildren().end(); + ++childnode) { node_parent_map[(*childnode)] = node_index_map[(*hostnode)]; // mark each node's parent } @@ -106,10 +108,36 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { (*devicenode)->LogMe(this); // fill detail information } } + for (auto memnode = (*hostnode)->GetMemTraceEventNodes().begin(); + memnode != (*hostnode)->GetMemTraceEventNodes().end(); + ++memnode) { + MemTraceEventNodeProto* mem_node_proto = + current_host_trace_event_node_proto_->add_mem_nodes(); + current_mem_trace_event_node_proto_ = mem_node_proto; + (*memnode)->LogMe(this); + } } } } +void SerializationLogger::LogMemTraceEventNode( + const MemTraceEventNode& mem_node) { + MemTraceEventProto* mem_trace_event = new MemTraceEventProto(); + mem_trace_event->set_timestamp_ns(mem_node.TimeStampNs()); + mem_trace_event->set_type( + static_cast(mem_node.Type())); + mem_trace_event->set_addr(mem_node.Addr()); + mem_trace_event->set_process_id(mem_node.ProcessId()); + mem_trace_event->set_thread_id(mem_node.ThreadId()); + mem_trace_event->set_increase_bytes(mem_node.IncreaseBytes()); + mem_trace_event->set_place(mem_node.Place()); + mem_trace_event->set_current_allocated(mem_node.CurrentAllocated()); + mem_trace_event->set_current_reserved(mem_node.CurrentReserved()); + mem_trace_event->set_peak_allocated(mem_node.PeakAllocated()); + mem_trace_event->set_peak_reserved(mem_node.PeakReserved()); + current_mem_trace_event_node_proto_->set_allocated_mem_event(mem_trace_event); +} + void SerializationLogger::LogHostTraceEventNode( const HostTraceEventNode& host_node) { HostTraceEventProto* host_trace_event = new HostTraceEventProto(); @@ -122,6 +150,63 @@ void SerializationLogger::LogHostTraceEventNode( host_trace_event->set_thread_id(host_node.ThreadId()); current_host_trace_event_node_proto_->set_allocated_host_trace_event( host_trace_event); + OperatorSupplementEventNode* op_supplement_event_node = + host_node.GetOperatorSupplementEventNode(); + if (op_supplement_event_node != nullptr) { + current_op_supplement_event_node_proto_ = + current_host_trace_event_node_proto_->add_op_supplement_nodes(); + OperatorSupplementEventProto* op_supplement_event_proto = + new OperatorSupplementEventProto(); + op_supplement_event_proto->set_op_type(op_supplement_event_node->Name()); + op_supplement_event_proto->set_timestamp_ns( + op_supplement_event_node->TimeStampNs()); + op_supplement_event_proto->set_process_id( + op_supplement_event_node->ProcessId()); + op_supplement_event_proto->set_thread_id( + op_supplement_event_node->ThreadId()); + op_supplement_event_proto->set_callstack( + 
op_supplement_event_node->CallStack()); + + OperatorSupplementEventProto::input_shape_proto* input_shape_proto = + op_supplement_event_proto->mutable_input_shapes(); + for (auto it = op_supplement_event_node->InputShapes().begin(); + it != op_supplement_event_node->InputShapes().end(); + it++) { + input_shape_proto->add_key(it->first); + OperatorSupplementEventProto::input_shape_proto::shape_vector* + shape_vectors_proto = input_shape_proto->add_shape_vecs(); + auto shape_vectors = it->second; + for (auto shape_vecs_it = shape_vectors.begin(); + shape_vecs_it != shape_vectors.end(); + shape_vecs_it++) { + auto shape_vector = *shape_vecs_it; + OperatorSupplementEventProto::input_shape_proto::shape_vector::shape* + shape_proto = shape_vectors_proto->add_shapes(); + for (auto shape_it = shape_vector.begin(); + shape_it != shape_vector.end(); + shape_it++) { + shape_proto->add_size(*shape_it); + } + } + } + + OperatorSupplementEventProto::dtype_proto* dtype_proto = + op_supplement_event_proto->mutable_dtypes(); + for (auto it = op_supplement_event_node->Dtypes().begin(); + it != op_supplement_event_node->Dtypes().end(); + it++) { + dtype_proto->add_key(it->first); + OperatorSupplementEventProto::dtype_proto::dtype_vector* + dtype_vector_proto = dtype_proto->add_dtype_vecs(); + auto dtype_vector = it->second; + for (auto dtype_it = dtype_vector.begin(); dtype_it != dtype_vector.end(); + dtype_it++) { + dtype_vector_proto->add_dtype(*dtype_it); + } + } + current_op_supplement_event_node_proto_->set_allocated_op_supplement_event( + op_supplement_event_proto); + } } void SerializationLogger::LogRuntimeTraceEventNode( diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 378834cff590d..31910cb68c5d7 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -34,6 +34,7 @@ class SerializationLogger : public BaseLogger { void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; void LogMetaInfo(const std::unordered_map); + void LogMemTraceEventNode(const MemTraceEventNode&) override; private: void OpenFile(); @@ -48,6 +49,8 @@ class SerializationLogger : public BaseLogger { HostTraceEventNodeProto* current_host_trace_event_node_proto_; CudaRuntimeTraceEventNodeProto* current_runtime_trace_event_node_proto_; DeviceTraceEventNodeProto* current_device_trace_event_node_proto_; + MemTraceEventNodeProto* current_mem_trace_event_node_proto_; + OperatorSupplementEventNodeProto* current_op_supplement_event_node_proto_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index 9380a26dbc3b4..a49d799c78521 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -35,6 +35,7 @@ using paddle::platform::ProfilerResult; using paddle::platform::RuntimeTraceEvent; using paddle::platform::SerializationLogger; using paddle::platform::TracerEventType; +using paddle::platform::TracerMemEventType; TEST(SerializationLoggerTest, dump_case0) { std::list host_events; @@ -54,6 +55,36 @@ TEST(SerializationLoggerTest, dump_case0) { std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); host_events.push_back(HostTraceEvent( std::string("op3"), TracerEventType::Operator, 31000, 
40000, 10, 11)); + mem_events.push_back(MemTraceEvent(11500, + 0x1000, + TracerMemEventType::Allocate, + 10, + 10, + 50, + "GPU:0", + 50, + 50, + 100, + 100)); + mem_events.push_back(MemTraceEvent(11900, + 0x1000, + TracerMemEventType::Free, + 10, + 10, + -50, + "GPU:0", + 0, + 50, + 100, + 100)); + std::map>> input_shapes; + std::map> dtypes; + input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); + input_shapes[std::string("X")].push_back(std::vector{4, 5, 6, 7}); + dtypes[std::string("X")].push_back(std::string("int8")); + dtypes[std::string("X")].push_back(std::string("float32")); + op_supplement_events.push_back(OperatorSupplementEvent( + 11600, "op1", input_shapes, dtypes, "op1()", 10, 10)); runtime_events.push_back(RuntimeTraceEvent( std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0)); runtime_events.push_back(RuntimeTraceEvent( @@ -128,6 +159,8 @@ TEST(SerializationLoggerTest, dump_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { @@ -137,6 +170,7 @@ TEST(SerializationLoggerTest, dump_case0) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(SerializationLoggerTest, dump_case1) { @@ -224,6 +258,7 @@ TEST(SerializationLoggerTest, dump_case1) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(DeserializationReaderTest, restore_case0) { @@ -243,6 +278,8 @@ TEST(DeserializationReaderTest, restore_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { diff --git a/paddle/fluid/platform/profiler/event_node.h b/paddle/fluid/platform/profiler/event_node.h index 13ec115100505..34e6556f7f47a 100644 --- a/paddle/fluid/platform/profiler/event_node.h +++ b/paddle/fluid/platform/profiler/event_node.h @@ -47,6 +47,8 @@ class MemTraceEventNode { std::string Place() const { return mem_event_.place; } uint64_t CurrentAllocated() const { return mem_event_.current_allocated; } uint64_t CurrentReserved() const { return mem_event_.current_reserved; } + uint64_t PeakAllocated() const { return mem_event_.peak_allocated; } + uint64_t PeakReserved() const { return mem_event_.peak_reserved; } // member function void LogMe(BaseLogger* logger) { logger->LogMemTraceEventNode(*this); } diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index 1a6f19d2f93af..028d666f35537 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -31,6 +31,9 @@ HostPythonNode::~HostPythonNode() { for (auto it = device_node_ptrs.begin(); it != device_node_ptrs.end(); ++it) { delete *it; } + for (auto it = mem_node_ptrs.begin(); it != mem_node_ptrs.end(); ++it) { + delete *it; + } } HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { @@ -52,7 +55,8 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { } // copy its CudaRuntimeTraceEventNode for (auto runtimenode = root->GetRuntimeTraceEventNodes().begin(); - runtimenode != 
root->GetRuntimeTraceEventNodes().end(); ++runtimenode) { + runtimenode != root->GetRuntimeTraceEventNodes().end(); + ++runtimenode) { HostPythonNode* runtime_python_node = new HostPythonNode(); runtime_python_node->name = (*runtimenode)->Name(); runtime_python_node->type = (*runtimenode)->Type(); @@ -76,6 +80,32 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { runtime_python_node->device_node_ptrs.push_back(device_python_node); } } + // copy MemTraceEventNode + for (auto memnode = root->GetMemTraceEventNodes().begin(); + memnode != root->GetMemTraceEventNodes().end(); + memnode++) { + MemPythonNode* mem_python_node = new MemPythonNode(); + mem_python_node->timestamp_ns = (*memnode)->TimeStampNs(); + mem_python_node->addr = (*memnode)->Addr(); + mem_python_node->type = (*memnode)->Type(); + mem_python_node->process_id = (*memnode)->ProcessId(); + mem_python_node->thread_id = (*memnode)->ThreadId(); + mem_python_node->increase_bytes = (*memnode)->IncreaseBytes(); + mem_python_node->place = (*memnode)->Place(); + mem_python_node->current_allocated = (*memnode)->CurrentAllocated(); + mem_python_node->current_reserved = (*memnode)->CurrentReserved(); + mem_python_node->peak_allocated = (*memnode)->PeakAllocated(); + mem_python_node->peak_reserved = (*memnode)->PeakReserved(); + host_python_node->mem_node_ptrs.push_back(mem_python_node); + } + // copy OperatorSupplementEventNode's information if exists + OperatorSupplementEventNode* op_supplement_node = + root->GetOperatorSupplementEventNode(); + if (op_supplement_node != nullptr) { + host_python_node->input_shapes = op_supplement_node->InputShapes(); + host_python_node->dtypes = op_supplement_node->Dtypes(); + host_python_node->callstack = op_supplement_node->CallStack(); + } return host_python_node; } @@ -93,7 +123,8 @@ ProfilerResult::ProfilerResult(std::unique_ptr tree, ProfilerResult::~ProfilerResult() { // delete all root nodes for (auto it = thread_event_trees_map_.begin(); - it != thread_event_trees_map_.end(); ++it) { + it != thread_event_trees_map_.end(); + ++it) { delete it->second; } } diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index 12ecb9fde32aa..44f6e61fd3737 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -43,6 +43,35 @@ struct DevicePythonNode { uint64_t stream_id; }; +struct MemPythonNode { + MemPythonNode() = default; + ~MemPythonNode() {} + + // timestamp of the record + uint64_t timestamp_ns; + // memory addr of allocation or free + uint64_t addr; + // memory manipulation type + TracerMemEventType type; + // process id of the record + uint64_t process_id; + // thread id of the record + uint64_t thread_id; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + int64_t increase_bytes; + // place + std::string place; + // current total allocated memory + uint64_t current_allocated; + // current total reserved memory + uint64_t current_reserved; + // peak allocated memory + uint64_t peak_allocated; + // peak reserved memory + uint64_t peak_reserved; +}; + struct HostPythonNode { HostPythonNode() = default; ~HostPythonNode(); @@ -58,12 +87,19 @@ struct HostPythonNode { uint64_t process_id; // thread id of the record uint64_t thread_id; + // input shapes + std::map>> input_shapes; + std::map> dtypes; + // call stack + std::string callstack; // children node std::vector children_node_ptrs; // runtime node std::vector runtime_node_ptrs; // device 
node std::vector device_node_ptrs; + // mem node + std::vector mem_node_ptrs; }; class ProfilerResult { diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index b7eb53331b793..7923a8fba0051 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -11,8 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/platform/profiler/host_tracer.h" + +#include + #include "glog/logging.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" @@ -20,7 +22,8 @@ // Used to filter events, works like glog VLOG(level). // RecordEvent will works if host_trace_level >= level. -PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1, +PADDLE_DEFINE_EXPORTED_int64(host_trace_level, + 1, "RecordEvent will works " "if host_trace_level >= level."); @@ -49,6 +52,79 @@ void ProcessHostEvents(const HostEventSection& host_events, } } +void ProcessHostMemEvents( + const HostEventSection& host_mem_events, + TraceEventCollector* collector) { + for (const auto& thr_sec : host_mem_events.thr_sections) { + uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } + for (const auto& evt : thr_sec.events) { + MemTraceEvent event; + event.timestamp_ns = evt.timestamp_ns; + event.addr = evt.addr; + event.type = evt.type; + event.increase_bytes = evt.increase_bytes; + event.place = evt.place.DebugString(); + event.current_allocated = evt.current_allocated; + event.current_reserved = evt.current_reserved; + event.peak_allocated = evt.peak_allocated; + event.peak_reserved = evt.peak_reserved; + event.process_id = host_mem_events.process_id; + event.thread_id = tid; + collector->AddMemEvent(std::move(event)); + } + } +} + +void ProcessOperatorSupplementEvents( + const HostEventSection& op_supplement_events, + TraceEventCollector* collector) { + for (const auto& thr_sec : op_supplement_events.thr_sections) { + uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } + for (const auto& evt : thr_sec.events) { + OperatorSupplementEvent event; + event.timestamp_ns = evt.timestamp_ns; + event.op_type = evt.op_type; + std::map>> input_shapes; + std::map> dtypes; + std::string callstack; + for (auto it = evt.input_shapes.begin(); it != evt.input_shapes.end(); + it++) { + for (auto idx = 0lu; idx < it->second.size(); idx++) { + input_shapes[it->first].push_back(std::vector()); + for (auto dim_idx = 0; dim_idx < it->second.at(idx).size(); + dim_idx++) { + input_shapes[it->first][idx].push_back( + it->second.at(idx).at(dim_idx)); + } + } + } + for (auto it = evt.dtypes.begin(); it != evt.dtypes.end(); it++) { + for (auto idx = 0lu; idx < it->second.size(); idx++) { + dtypes[it->first].push_back( + framework::proto::VarType::Type_Name(it->second.at(idx))); + } + } + + std::ostringstream result_string; + for (auto it = evt.callstack.begin(); it != evt.callstack.end(); it++) { + result_string << (*it) << std::endl; + } + event.input_shapes = input_shapes; + event.dtypes = dtypes; + event.callstack = result_string.str(); + event.process_id = op_supplement_events.process_id; + event.thread_id = tid; + collector->AddOperatorSupplementEvent(std::move(event)); + } + } +} + } 
// namespace void HostTracer::PrepareTracing() { @@ -59,16 +135,21 @@ void HostTracer::PrepareTracing() { void HostTracer::StartTracing() { PADDLE_ENFORCE_EQ( - state_ == TracerState::READY || state_ == TracerState::STOPED, true, + state_ == TracerState::READY || state_ == TracerState::STOPED, + true, platform::errors::PreconditionNotMet("TracerState must be READY")); - HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance() + .GatherEvents(); HostTraceLevel::GetInstance().SetLevel(options_.trace_level); state_ = TracerState::STARTED; } void HostTracer::StopTracing() { PADDLE_ENFORCE_EQ( - state_, TracerState::STARTED, + state_, + TracerState::STARTED, platform::errors::PreconditionNotMet("TracerState must be STARTED")); HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled); state_ = TracerState::STOPED; @@ -76,11 +157,19 @@ void HostTracer::StopTracing() { void HostTracer::CollectTraceData(TraceEventCollector* collector) { PADDLE_ENFORCE_EQ( - state_, TracerState::STOPED, + state_, + TracerState::STOPED, platform::errors::PreconditionNotMet("TracerState must be STOPED")); HostEventSection host_events = HostEventRecorder::GetInstance().GatherEvents(); ProcessHostEvents(host_events, collector); + HostEventSection host_mem_events = + HostEventRecorder::GetInstance().GatherEvents(); + ProcessHostMemEvents(host_mem_events, collector); + HostEventSection op_supplement_events = + HostEventRecorder::GetInstance() + .GatherEvents(); + ProcessOperatorSupplementEvents(op_supplement_events, collector); } } // namespace platform diff --git a/paddle/fluid/platform/profiler/mem_tracing.h b/paddle/fluid/platform/profiler/mem_tracing.h new file mode 100644 index 0000000000000..3d3508c7bd570 --- /dev/null +++ b/paddle/fluid/platform/profiler/mem_tracing.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { +namespace platform { +// Memory event tracing. A trace marks memory manipulation such as allocation +// and free. +// The events can be used to draw memory variation curve. +class RecordMemEvent { + public: + /** + * @param ptr: Pointer address allocated or free. + * @param place: Device for this memory event. + * @param size: Memory size allocated or free. + * @param type: Denote manipulation type for this memory event. 
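+   *
+   * Illustrative call pattern only (a sketch; `buf`, `place` and `nbytes`
+   * are placeholder names, and real call sites would be allocator hooks
+   * that already know them -- see profiler_test.cc in this change):
+   *
+   *   platform::RecordMemEvent(buf, place, nbytes,
+   *                            platform::TracerMemEventType::Allocate);
+   *   // ... use the buffer ...
+   *   platform::RecordMemEvent(buf, place, nbytes,
+   *                            platform::TracerMemEventType::Free);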
+ */ + explicit RecordMemEvent( + const void* ptr, + const Place& place, + size_t size, + const TracerMemEventType type = TracerMemEventType::Allocate); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 32310b9e86228..ab9da63c1165c 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -22,16 +22,18 @@ #ifdef PADDLE_WITH_HIP #include #endif +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/profiler.h" TEST(ProfilerTest, TestHostTracer) { - using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; + using paddle::platform::ProfilerResult; using paddle::platform::RecordInstantEvent; using paddle::platform::TracerEventType; - using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 2; options.trace_switch = 3; @@ -40,10 +42,10 @@ TEST(ProfilerTest, TestHostTracer) { profiler->Prepare(); profiler->Start(); { - RecordInstantEvent("TestTraceLevel_record1", TracerEventType::UserDefined, - 2); - RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined, - 3); + RecordInstantEvent( + "TestTraceLevel_record1", TracerEventType::UserDefined, 2); + RecordInstantEvent( + "TestTraceLevel_record2", TracerEventType::UserDefined, 3); } auto profiler_result = profiler->Stop(); auto& nodetree = profiler_result->GetNodeTrees(); @@ -58,8 +60,8 @@ TEST(ProfilerTest, TestHostTracer) { } TEST(ProfilerTest, TestCudaTracer) { - using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 0; @@ -92,3 +94,49 @@ TEST(ProfilerTest, TestCudaTracer) { EXPECT_GT(runtime_events.size(), 0u); #endif } + +TEST(ProfilerTest, TestHostTracerForMem) { + using paddle::platform::CPUPlace; + using paddle::platform::EnableHostEventRecorder; + using paddle::platform::MemTraceEventNode; + using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; + using paddle::platform::ProfilerResult; + using paddle::platform::RecordEvent; + using paddle::platform::RecordInstantEvent; + using paddle::platform::RecordMemEvent; + using paddle::platform::TracerEventType; + using paddle::platform::TracerMemEventType; + ProfilerOptions options; + options.trace_level = 1; + options.trace_switch = 3; + auto profiler = Profiler::Create(options); + EXPECT_TRUE(profiler); + EnableHostEventRecorder(); + profiler->Prepare(); + profiler->Start(); + { + RecordEvent event1( + "TestTracerForMem_phase1", TracerEventType::UserDefined, 1); + RecordMemEvent(reinterpret_cast(0), + CPUPlace(), + 1024, + TracerMemEventType::Allocate); + RecordMemEvent( + reinterpret_cast(0), CPUPlace(), 1024, TracerMemEventType::Free); + } + { + RecordEvent event2( + "TestTracerForMem_phase2", TracerEventType::UserDefined, 1); + RecordMemEvent(reinterpret_cast(1024), + CPUPlace(), + 1024, + TracerMemEventType::Allocate); + RecordMemEvent(reinterpret_cast(1024), + CPUPlace(), + 1024, + TracerMemEventType::Free); + } + auto profiler_result = profiler->Stop(); + auto nodetree = profiler_result->GetNodeTrees(); +} diff --git 
a/paddle/fluid/platform/profiler/supplement_tracing.h b/paddle/fluid/platform/profiler/supplement_tracing.h new file mode 100644 index 0000000000000..46b1616d71cc3 --- /dev/null +++ b/paddle/fluid/platform/profiler/supplement_tracing.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { + +namespace framework { +class RuntimeContext; +} +namespace platform { + +class RecordOpInfoSupplement { + public: + /** + * @param type: Operator type name. + * @param attrs: Attribute map of op. + * @param shape_ctx: Infershape context object. + * @param ctx: Runtime context object. + */ + explicit RecordOpInfoSupplement(const std::string& type, + const framework::AttributeMap& attrs, + const framework::InferShapeContext& shape_ctx, + const framework::RuntimeContext& ctx); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/test_event_node.cc b/paddle/fluid/platform/profiler/test_event_node.cc index 41a5ebce023a0..dcf6dd56d74af 100644 --- a/paddle/fluid/platform/profiler/test_event_node.cc +++ b/paddle/fluid/platform/profiler/test_event_node.cc @@ -60,9 +60,20 @@ TEST(NodeTreesTest, LogMe_case0) { 50, "GPU:0", 50, - 50)); - mem_events.push_back(MemTraceEvent( - 11900, 0x1000, TracerMemEventType::Free, 10, 10, -50, "GPU:0", 0, 50)); + 50, + 100, + 100)); + mem_events.push_back(MemTraceEvent(11900, + 0x1000, + TracerMemEventType::Free, + 10, + 10, + -50, + "GPU:0", + 0, + 50, + 100, + 100)); std::map>> input_shapes; std::map> dtypes; input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); @@ -267,9 +278,20 @@ TEST(NodeTreesTest, HandleTrees_case0) { 50, "GPU:0", 50, - 50)); - mem_events.push_back(MemTraceEvent( - 11900, 0x1000, TracerMemEventType::Free, 10, 10, -50, "GPU:0", 0, 50)); + 50, + 100, + 100)); + mem_events.push_back(MemTraceEvent(11900, + 0x1000, + TracerMemEventType::Free, + 10, + 10, + -50, + "GPU:0", + 0, + 50, + 100, + 100)); op_supplement_events.push_back(OperatorSupplementEvent( 11600, "op1", diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index d50c5584f5c4b..62d82c19d1796 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -59,10 +59,14 @@ enum class TracerEventType { }; enum class TracerMemEventType { - // Used to mark memory allocation + // Used to mark memory allocation which is managed by paddle Allocate = 0, - // Used to mark memory free + // Used to mark memory free which is managed by paddle Free = 1, + // Used to mark reserved memory allocation which is applied from device. + ReservedAllocate = 2, + // Used to mark reserved memory free which is released to device. 
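+  // (Illustrative note: expected to correspond to an allocator pool
+  // shrinking, i.e. handing a previously reserved chunk back to the
+  // device, the counterpart of ReservedAllocate above.)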
+ ReservedFree = 3, // A flag to denote the number of current types NumTypes }; @@ -318,7 +322,9 @@ struct MemTraceEvent { int64_t increase_bytes, const std::string& place, uint64_t current_allocated, - uint64_t current_reserved) + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) : timestamp_ns(timestamp_ns), addr(addr), type(type), @@ -327,7 +333,9 @@ struct MemTraceEvent { increase_bytes(increase_bytes), place(place), current_allocated(current_allocated), - current_reserved(current_reserved) {} + current_reserved(current_reserved), + peak_allocated(peak_allocated), + peak_reserved(peak_reserved) {} // timestamp of the record uint64_t timestamp_ns; @@ -348,6 +356,10 @@ struct MemTraceEvent { uint64_t current_allocated; // current total reserved memory uint64_t current_reserved; + // current peak allocated memory + uint64_t peak_allocated; + // current peak reserved memory + uint64_t peak_reserved; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index bbfc687738dd9..11035867416b8 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -91,7 +91,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, #endif const char* StringTracerMemEventType(TracerMemEventType type) { - static const char* categary_name_[] = {"Allocate", "Free"}; + static const char* categary_name_[] = { + "Allocate", "Free", "ReservedAllocate", "ReservedFree"}; return categary_name_[static_cast(type)]; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a003de812a3ac..e3dffc6442a48 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -922,6 +922,13 @@ PYBIND11_MODULE(core_noavx, m) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CustomPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::XPUPlace &place, @@ -963,6 +970,11 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), @@ -998,6 +1010,11 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), @@ -2200,9 +2217,9 @@ All parameter, weight, gradient are variables in Paddle. #endif return devices; }); - py::class_(m, - "CustomPlace", - R"DOC( + py::class_ customplace(m, + "CustomPlace", + R"DOC( CustomPlace is a descriptor of a device. It represents a custom device on which a tensor will be allocated and a model will run. @@ -2852,6 +2869,13 @@ All parameter, weight, gradient are variables in Paddle. 
pybind11::gil_scoped_release release; self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, + const Scope &scope, + const platform::CustomPlace &place) { + pybind11::gil_scoped_release release; + self.Run(scope, place); + }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", @@ -3493,6 +3517,26 @@ All parameter, weight, gradient are variables in Paddle. .def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo); + py::class_(m, "MemPythonNode") + .def(py::init<>()) + .def_readwrite("timestamp_ns", + &paddle::platform::MemPythonNode::timestamp_ns) + .def_readwrite("addr", &paddle::platform::MemPythonNode::addr) + .def_readwrite("type", &paddle::platform::MemPythonNode::type) + .def_readwrite("process_id", &paddle::platform::MemPythonNode::process_id) + .def_readwrite("thread_id", &paddle::platform::MemPythonNode::thread_id) + .def_readwrite("increase_bytes", + &paddle::platform::MemPythonNode::increase_bytes) + .def_readwrite("place", &paddle::platform::MemPythonNode::place) + .def_readwrite("current_allocated", + &paddle::platform::MemPythonNode::current_allocated) + .def_readwrite("current_reserved", + &paddle::platform::MemPythonNode::current_reserved) + .def_readwrite("peak_allocated", + &paddle::platform::MemPythonNode::peak_allocated) + .def_readwrite("peak_reserved", + &paddle::platform::MemPythonNode::peak_reserved); + py::class_(m, "DevicePythonNode") .def(py::init<>()) .def_readwrite("name", &paddle::platform::DevicePythonNode::name) @@ -3515,12 +3559,18 @@ All parameter, weight, gradient are variables in Paddle. .def_readwrite("process_id", &paddle::platform::HostPythonNode::process_id) .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id) + .def_readwrite("input_shapes", + &paddle::platform::HostPythonNode::input_shapes) + .def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes) + .def_readwrite("callstack", &paddle::platform::HostPythonNode::callstack) .def_readwrite("children_node", &paddle::platform::HostPythonNode::children_node_ptrs) .def_readwrite("runtime_node", &paddle::platform::HostPythonNode::runtime_node_ptrs) .def_readwrite("device_node", - &paddle::platform::HostPythonNode::device_node_ptrs); + &paddle::platform::HostPythonNode::device_node_ptrs) + .def_readwrite("mem_node", + &paddle::platform::HostPythonNode::mem_node_ptrs); py::class_(m, "_Profiler") .def("create", @@ -3555,6 +3605,14 @@ All parameter, weight, gradient are variables in Paddle. })) .def("end", [](platform::RecordEvent *event) { event->End(); }); + py::enum_(m, "TracerMemEventType") + .value("Allocate", paddle::platform::TracerMemEventType::Allocate) + .value("Free", paddle::platform::TracerMemEventType::Free) + .value("ReservedAllocate", + paddle::platform::TracerMemEventType::ReservedAllocate) + .value("ReservedFree", + paddle::platform::TracerMemEventType::ReservedFree); + py::enum_(m, "TracerEventType") .value("Operator", paddle::platform::TracerEventType::Operator) .value("Dataloader", paddle::platform::TracerEventType::Dataloader) @@ -4566,6 +4624,12 @@ All parameter, weight, gradient are variables in Paddle. 
                 option.first.cast(),
                 option.second.cast());
           }
+      } else if (option_name == "replicated_collectives_settings") {
+        for (auto option : element.second.cast()) {
+          self.SetReplicatedCollectivesSettings(
+              option.first.cast(),
+              option.second.cast());
+          }
       } else if (option_name == "accumulate_outer_fragment") {
         for (auto option : element.second.cast()) {
           std::vector values;