diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index dd1c0d885efdd..e26f45a84673a 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -22,14 +22,17 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #include "paddle/phi/core/kernel_context.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif -PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, +PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, + true, "Use inplace in new executor"); -PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, +PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, + true, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); @@ -167,8 +170,8 @@ paddle::framework::FetchList InterpreterCore::Run( // scope? } global_scope_->SetLocalScope(local_scope_); - paddle::framework::interpreter::build_variable_scope(block_, global_scope_, - create_local_scope_); + paddle::framework::interpreter::build_variable_scope( + block_, global_scope_, create_local_scope_); std::vector op_func_nodes; paddle::framework::interpreter::build_op_func_list( place_, block_, &op_func_nodes, global_scope_, create_local_scope_); @@ -490,7 +493,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { // If it is OperatorBase, InferShape do nothing. if (op_with_kernel != nullptr) { platform::RecordEvent infershape_event( - "infer_shape", platform::TracerEventType::OperatorInner, 1, + "infer_shape", + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); // see OperatorWithKernel::RunImpl in operator.cc for why @@ -499,6 +504,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { op_with_kernel->Info().infer_shape_( instr_node.InnerInferShapeContext().get()); } + infershape_event.End(); + platform::RecordOpInfoSupplement(op->Type(), + op->Attrs(), + *(instr_node.InnerInferShapeContext()), + *(instr_node.InnerRuntimeContext())); } } @@ -516,7 +526,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { { platform::RecordEvent compute_event( - "compute", platform::TracerEventType::OperatorInner, 1, + "compute", + platform::TracerEventType::OperatorInner, + 1, platform::EventRole::kInnerOp); if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); @@ -571,7 +583,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { if (op_with_kernel != nullptr && FLAGS_check_nan_inf) { VLOG(4) << "Check nan/inf"; framework::details::CheckOpHasNanOrInf( - *op, *global_scope_, + *op, + *global_scope_, place); // TODO(xiongkun03) change it to inner scope. 
} } @@ -596,10 +609,14 @@ void InterpreterCore::ExecuteInstructionList( for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { - async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [ - this, i, atomic_deps = atomic_deps.get(), - atomic_var_ref = atomic_var_ref.get() - ] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); }); + async_work_queue_->AddTask(vec_instr.at(i).KernelType(), + [this, + i, + atomic_deps = atomic_deps.get(), + atomic_var_ref = atomic_var_ref.get()] { + RunInstructionAsync( + i, atomic_deps, atomic_var_ref); + }); } } @@ -615,7 +632,8 @@ void InterpreterCore::ExecuteInstructionList( } VLOG(4) << "Cancel ok"; PADDLE_ENFORCE_EQ( - main_thread_blocker_.Clear(), 0, + main_thread_blocker_.Clear(), + 0, platform::errors::PreconditionNotMet( "main_thread_blocker_.Clear() return -1, clear failed")); VLOG(4) << "clear ok"; @@ -624,7 +642,8 @@ void InterpreterCore::ExecuteInstructionList( } void InterpreterCore::RunNextInstructions( - const Instruction& instr, std::queue* reserved_next_ops, + const Instruction& instr, + std::queue* reserved_next_ops, std::vector>* atomic_deps, std::vector>* atomic_var_ref) { auto& next_instr = instr.NextInstructions(); @@ -691,7 +710,8 @@ void InterpreterCore::RunNextInstructions( } void InterpreterCore::RunInstructionAsync( - size_t instr_id, std::vector>* atomic_deps, + size_t instr_id, + std::vector>* atomic_deps, std::vector>* atomic_var_ref) { std::queue ready_ops; ready_ops.push(instr_id); @@ -700,10 +720,10 @@ void InterpreterCore::RunInstructionAsync( ready_ops.pop(); auto& instr_node = vec_instruction_.at(instr_id); VLOG(5) << __func__ << " OP id:" << instr_node.Id() - << " name:" << instr_node.OpBase()->Type() - << " type:" << (instr_node.KernelType() == OpFuncType::kQueueSync - ? "kQueueSync" - : "kQueueAsync") + << " name:" << instr_node.OpBase()->Type() << " type:" + << (instr_node.KernelType() == OpFuncType::kQueueSync + ? 
"kQueueSync" + : "kQueueAsync") << " runs on " << platform::GetCurrentThreadName(); auto* op = instr_node.OpBase(); @@ -877,12 +897,14 @@ void InterpreterCore::CheckGC( } else { static_cast(gc_.get())->Add( - var_scope.Var(var_id), &gc_event_.at(instr_id), + var_scope.Var(var_id), + &gc_event_.at(instr_id), &instr.DeviceContext()); } #else static_cast(gc_.get())->Add( - var_scope.Var(var_id), &gc_event_.at(instr_id), + var_scope.Var(var_id), + &gc_event_.at(instr_id), &instr.DeviceContext()); #endif } @@ -891,20 +913,24 @@ void InterpreterCore::CheckGC( void InterpreterCore::Prepare( const std::vector& feed_names, - const std::vector& feed_tensors, bool prepare_feed) { - PADDLE_ENFORCE_EQ(feed_names.size(), feed_tensors.size(), + const std::vector& feed_tensors, + bool prepare_feed) { + PADDLE_ENFORCE_EQ(feed_names.size(), + feed_tensors.size(), platform::errors::PreconditionNotMet( "Required feed_names.size() == feed_tensors.size(), " "but received %d != %d", - feed_names.size(), feed_tensors.size())); + feed_names.size(), + feed_tensors.size())); auto FeedInput = [&] { VLOG(4) << "Feed inputs"; for (size_t i = 0; i < feed_names.size(); ++i) { auto* feed_var = global_scope_->FindVar(feed_names[i]); PADDLE_ENFORCE_NOT_NULL( - feed_var, platform::errors::NotFound( - "Variable %s should not be nullptr.", feed_names[i])); + feed_var, + platform::errors::NotFound("Variable %s should not be nullptr.", + feed_names[i])); auto feed_tensor = feed_var->GetMutable(); feed_tensor->ShareDataWith(feed_tensors[i]); @@ -913,8 +939,8 @@ void InterpreterCore::Prepare( }; if (!is_build_) { - paddle::framework::interpreter::build_variable_scope(block_, global_scope_, - create_local_scope_); + paddle::framework::interpreter::build_variable_scope( + block_, global_scope_, create_local_scope_); FeedInput(); std::vector op_func_nodes; paddle::framework::interpreter::build_op_func_list( diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index f06ed0b496e9b..140525384c3e3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -33,6 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #include "paddle/phi/common/int_array.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/kernel_context.h" @@ -59,7 +60,8 @@ class DenseTensor; DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); -PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, +PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, + 0, "number of threads for inner op"); DECLARE_bool(run_kp_kernel); DECLARE_bool(enable_host_event_recorder_hook); @@ -74,7 +76,8 @@ std::vector> kKernelPriority = { std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), }; -static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name, +static DDim GetDimsDebug(const ScopeBase& scope, + const std::string& name, bool get_actual_dim = false) { Variable* var = scope.FindVar(name); if (var == nullptr) { @@ -268,7 +271,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { Type(), platform::TracerEventType::Operator, 1); auto op_name = platform::OpName(outputs_, Type()); platform::RecordEvent op_name_record_event( - op_name, platform::TracerEventType::Operator, + op_name, + platform::TracerEventType::Operator, FLAGS_enable_host_event_recorder_hook ? 20 : 1, platform::EventRole::kUniqueOp); RunImpl(scope, place); @@ -297,9 +301,11 @@ bool OperatorBase::HasInputs(const std::string& name) const { std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE( - ins.size(), 1UL, + ins.size(), + 1UL, platform::errors::InvalidArgument( - "Operator %s's input %s should contain only one variable.", type_, + "Operator %s's input %s should contain only one variable.", + type_, name)); return ins.empty() ? kEmptyVarName : ins[0]; } @@ -308,9 +314,10 @@ const std::vector& OperatorBase::Inputs( const std::string& name) const { auto it = inputs_.find(name); PADDLE_ENFORCE_NE( - it, inputs_.end(), - platform::errors::NotFound("Operator %s does not have the input %s.", - type_, name)); + it, + inputs_.end(), + platform::errors::NotFound( + "Operator %s does not have the input %s.", type_, name)); return it->second; } @@ -325,9 +332,11 @@ bool OperatorBase::HasOutputs(const std::string& name) const { std::string OperatorBase::Output(const std::string& name) const { auto& outs = Outputs(name); PADDLE_ENFORCE_LE( - outs.size(), 1UL, + outs.size(), + 1UL, platform::errors::InvalidArgument( - "Operator %s's output %s should contain only one variable.", type_, + "Operator %s's output %s should contain only one variable.", + type_, name)); return outs.empty() ? 
kEmptyVarName : outs[0]; } @@ -336,7 +345,8 @@ const std::vector& OperatorBase::Outputs( const std::string& name) const { auto it = outputs_.find(name); PADDLE_ENFORCE_NE( - it, outputs_.end(), + it, + outputs_.end(), platform::errors::NotFound( "Operator %s does not have an output called %s.", type_, name)); return it->second; @@ -484,18 +494,20 @@ void OperatorBase::CheckAllInputOutputSet() const { for (auto& in : info_->Proto().inputs()) { if (!in.dispensable() && !in.extra()) { PADDLE_ENFORCE_NE( - inputs_.find(in.name()), inputs_.end(), - platform::errors::NotFound("Operator %s's input (%s) is not set.", - Type(), in.name())); + inputs_.find(in.name()), + inputs_.end(), + platform::errors::NotFound( + "Operator %s's input (%s) is not set.", Type(), in.name())); } } for (auto& out : info_->Proto().outputs()) { if (!out.dispensable() && !out.extra()) { PADDLE_ENFORCE_NE( - outputs_.find(out.name()), outputs_.end(), - platform::errors::NotFound("Operator %s's output (%s) is not set.", - Type(), out.name())); + outputs_.find(out.name()), + outputs_.end(), + platform::errors::NotFound( + "Operator %s's output (%s) is not set.", Type(), out.name())); } } } @@ -568,10 +580,12 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const { if (it == ctx_.inputs.end()) return nullptr; PADDLE_ENFORCE_LE( - it->second.size(), 1UL, + it->second.size(), + 1UL, platform::errors::InvalidArgument( "Operator %s's input %s should contain only one variable.", - op_.Type(), name)); + op_.Type(), + name)); return it->second.empty() ? nullptr : it->second[0]; } @@ -580,10 +594,12 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const { if (it == ctx_.outputs.end()) return nullptr; PADDLE_ENFORCE_LE( - it->second.size(), 1UL, + it->second.size(), + 1UL, platform::errors::InvalidArgument( "Operator %s's output %s should contain only one variable.", - op_.Type(), name)); + op_.Type(), + name)); return it->second.empty() ? nullptr : it->second[0]; } @@ -598,10 +614,13 @@ const std::vector ExecutionContext::MultiInput( } std::vector res; res.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(res), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(res), [&](const Variable* var) -> const Tensor* { if (var == nullptr) return nullptr; - PADDLE_ENFORCE_EQ(var->IsType(), true, + PADDLE_ENFORCE_EQ(var->IsType(), + true, platform::errors::InvalidArgument( "Input variable should be LoDTensor, " "but the received type is %s.", @@ -621,7 +640,9 @@ std::vector ExecutionContext::MultiOutput( } std::vector res; res.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(res), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(res), [&](Variable* var) -> Tensor* { return var == nullptr ? 
nullptr : var->GetMutable(); @@ -679,7 +700,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const auto& in = it->second; if (in.size() == 0) return false; PADDLE_ENFORCE_EQ( - in.size(), 1UL, + in.size(), + 1UL, platform::errors::InvalidArgument( "Input %s should not contain more than one inputs.", name)); return in[0] != nullptr; @@ -697,7 +719,8 @@ class RuntimeInferShapeContext : public InferShapeContext { return false; } PADDLE_ENFORCE_EQ( - out.size(), 1UL, + out.size(), + 1UL, platform::errors::InvalidArgument( "Output %s should not contain more than one outputs.", name)); return out[0] != nullptr; @@ -754,11 +777,14 @@ class RuntimeInferShapeContext : public InferShapeContext { std::string GetInputNameByIdx(size_t idx) const override { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; - PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(), + PADDLE_ENFORCE_LT(idx, + op_proto->inputs().size(), platform::errors::OutOfRange( "The index should be less than the size of inputs of " "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->inputs().size())); + op_.Type(), + idx, + op_proto->inputs().size())); return op_proto->inputs()[idx].name(); } @@ -766,42 +792,55 @@ class RuntimeInferShapeContext : public InferShapeContext { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; PADDLE_ENFORCE_LT( - idx, op_proto->outputs().size(), + idx, + op_proto->outputs().size(), platform::errors::OutOfRange( "The index should be less than the size of outputs of " "operator %s, but got index is %d and size is %d", - op_.Type(), idx, op_proto->outputs().size())); + op_.Type(), + idx, + op_proto->outputs().size())); return op_proto->outputs()[idx].name(); } - void ShareDim(const std::string& in, const std::string& out, size_t i = 0, + void ShareDim(const std::string& in, + const std::string& out, + size_t i = 0, size_t j = 0) override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), + in_it, + ctx_.inputs.end(), platform::errors::NotFound("Input %s does not exist.", in)); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), + out_it, + ctx_.outputs.end(), platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), + PADDLE_ENFORCE_LT(i, + in_it->second.size(), platform::errors::InvalidArgument( "The index of input dimension is out of range, " "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), + in_it->second.size(), + i)); + PADDLE_ENFORCE_LT(j, + out_it->second.size(), platform::errors::InvalidArgument( "The index of output dimension is out of range, " "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); + out_it->second.size(), + j)); Variable* in_var = in_it->second[i]; Variable* out_var = out_it->second[j]; PADDLE_ENFORCE_EQ( - in_var->Type(), out_var->Type(), + in_var->Type(), + out_var->Type(), platform::errors::InvalidArgument( - "The type of input (%s) and output (%s) are inconsistent.", in, + "The type of input (%s) and output (%s) are inconsistent.", + in, out)); if (in_var->IsType()) { @@ -825,19 +864,22 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::string& out) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); - PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(), + PADDLE_ENFORCE_NE(in_it, + ctx_.inputs.end(), 
platform::errors::NotFound( "Input [%s] found error in Op [%s]", in, op_.Type())); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), - platform::errors::NotFound("Output [%s] found error in Op [%s]", out, - op_.Type())); + out_it, + ctx_.outputs.end(), + platform::errors::NotFound( + "Output [%s] found error in Op [%s]", out, op_.Type())); auto& in_var_list = in_it->second; auto& out_var_list = out_it->second; PADDLE_ENFORCE_EQ( - in_var_list.size(), out_var_list.size(), + in_var_list.size(), + out_var_list.size(), platform::errors::PreconditionNotMet( "Op [%s]: Input var size should be equal with output var size", op_.Type())); @@ -852,10 +894,12 @@ class RuntimeInferShapeContext : public InferShapeContext { Variable* in_var = in_var_list[i]; if (!in_var->IsType()) return; Variable* out_var = out_var_list[i]; - PADDLE_ENFORCE_EQ(out_var->IsType(), true, + PADDLE_ENFORCE_EQ(out_var->IsType(), + true, platform::errors::PreconditionNotMet( "The %d-th output of Output(%s) must be LoDTensor.", - i, out_var_names[i])); + i, + out_var_names[i])); auto& in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); @@ -866,32 +910,41 @@ class RuntimeInferShapeContext : public InferShapeContext { } } - void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + void ShareLoD(const std::string& in, + const std::string& out, + size_t i = 0, size_t j = 0) const override { auto in_it = ctx_.inputs.find(in); auto out_it = ctx_.outputs.find(out); PADDLE_ENFORCE_NE( - in_it, ctx_.inputs.end(), + in_it, + ctx_.inputs.end(), platform::errors::NotFound("Input %s does not exist.", in)); PADDLE_ENFORCE_NE( - out_it, ctx_.outputs.end(), + out_it, + ctx_.outputs.end(), platform::errors::NotFound("Output %s does not exist.", out)); - PADDLE_ENFORCE_LT(i, in_it->second.size(), + PADDLE_ENFORCE_LT(i, + in_it->second.size(), platform::errors::InvalidArgument( "The index of input dimension is out of range, " "excepted index less than %zu, but received %zu.", - in_it->second.size(), i)); - PADDLE_ENFORCE_LT(j, out_it->second.size(), + in_it->second.size(), + i)); + PADDLE_ENFORCE_LT(j, + out_it->second.size(), platform::errors::InvalidArgument( "The index of output dimension is out of range, " "excepted index less than %zu, but received %zu.", - out_it->second.size(), j)); + out_it->second.size(), + j)); Variable* in_var = in_it->second.at(i); if (!in_var->IsType()) return; Variable* out_var = out_it->second.at(j); PADDLE_ENFORCE_EQ( - out_var->IsType(), true, + out_var->IsType(), + true, platform::errors::InvalidArgument( "The %zu-th output of Output(%s) must be LoDTensor.", j, out)); auto& in_tensor = in_var->Get(); @@ -926,7 +979,8 @@ class RuntimeInferShapeContext : public InferShapeContext { "set in the runtime kernel.")); } - void SetLoDLevel(const std::string& out, int32_t lod_level, + void SetLoDLevel(const std::string& out, + int32_t lod_level, size_t j = 0) const override { PADDLE_THROW(platform::errors::PreconditionNotMet( "SetLoDLevel is only used in compile time. 
The calculation of " @@ -969,10 +1023,12 @@ class RuntimeInferShapeContext : public InferShapeContext { DDim GetInputDim(const std::string& name) const override { const std::vector& vars = InputVars(name); PADDLE_ENFORCE_EQ( - vars.size(), 1UL, + vars.size(), + 1UL, platform::errors::InvalidArgument( "Input(%s) should hold one element, but now it holds %zu elements.", - name, vars.size())); + name, + vars.size())); return this->GetDim(vars[0]); } @@ -998,10 +1054,12 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetOutputDim(const std::string& name, const DDim& dim) override { auto& vars = OutputVars(name); PADDLE_ENFORCE_EQ( - vars.size(), 1UL, + vars.size(), + 1UL, platform::errors::InvalidArgument("Output(%s) should hold one element, " "but now it holds %zu elements.", - name, vars.size())); + name, + vars.size())); SetDim(vars[0], dim); } @@ -1038,7 +1096,9 @@ class RuntimeInferShapeContext : public InferShapeContext { std::vector GetDims(const std::vector& vars) const { std::vector ret; ret.reserve(vars.size()); - std::transform(vars.begin(), vars.end(), std::back_inserter(ret), + std::transform(vars.begin(), + vars.end(), + std::back_inserter(ret), [this](Variable* var) { return this->GetDim(var); }); return ret; } @@ -1064,12 +1124,14 @@ class RuntimeInferShapeContext : public InferShapeContext { void SetDims(const std::vector& vars, const std::vector& dims) { size_t length = vars.size(); - PADDLE_ENFORCE_EQ(length, dims.size(), + PADDLE_ENFORCE_EQ(length, + dims.size(), platform::errors::InvalidArgument( "The number of input variables do not match the " "number of input dimensions, the number of variables " "is %zu, the number of dimensions is %zu.", - length, dims.size())); + length, + dims.size())); for (size_t i = 0; i < length; ++i) { if (vars[i] == nullptr) { continue; @@ -1088,9 +1150,12 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& vars) const { std::vector retv; retv.resize(vars.size()); - std::transform(vars.begin(), vars.end(), retv.begin(), + std::transform(vars.begin(), + vars.end(), + retv.begin(), std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType), - this, std::placeholders::_1)); + this, + std::placeholders::_1)); return retv; } @@ -1102,7 +1167,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& InputVars(const std::string& name) const { auto it = ctx_.inputs.find(name); PADDLE_ENFORCE_NE( - it, ctx_.inputs.end(), + it, + ctx_.inputs.end(), platform::errors::NotFound( "Operator (%s) does not have the input (%s).", op_.Type(), name)); return it->second; @@ -1111,7 +1177,8 @@ class RuntimeInferShapeContext : public InferShapeContext { const std::vector& OutputVars(const std::string& name) const { auto it = ctx_.outputs.find(name); PADDLE_ENFORCE_NE( - it, ctx_.outputs.end(), + it, + ctx_.outputs.end(), platform::errors::NotFound( "Operator (%s) does not have the outputs (%s).", op_.Type(), name)); return it->second; @@ -1132,20 +1199,23 @@ static void CheckTensorNANOrInf(const std::string& op_type, return; } PADDLE_ENFORCE_NE( - framework::TensorContainsInf(tensor), true, - platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", - op_type, name)); + framework::TensorContainsInf(tensor), + true, + platform::errors::Fatal( + "Operator %s output Tensor %s contains Inf.", op_type, name)); PADDLE_ENFORCE_NE( - framework::TensorContainsNAN(tensor), true, - platform::errors::Fatal("Operator %s output Tensor %s contains NAN.", - op_type, name)); + 
framework::TensorContainsNAN(tensor), + true, + platform::errors::Fatal( + "Operator %s output Tensor %s contains NAN.", op_type, name)); } bool OperatorWithKernel::SupportGPU() const { auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( phi::TransToPhiKernelName(type_)); auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), + std::any_of(phi_kernels.begin(), + phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::GPU; }); @@ -1158,7 +1228,8 @@ bool OperatorWithKernel::SupportGPU() const { } else { auto& op_kernels = kernel_iter->second; return std::any_of( - op_kernels.begin(), op_kernels.end(), + op_kernels.begin(), + op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { return platform::is_gpu_place(kern_pair.first.place_); }); @@ -1170,7 +1241,8 @@ bool OperatorWithKernel::SupportNPU() const { auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( phi::TransToPhiKernelName(type_)); auto has_phi_kernel = - std::any_of(phi_kernels.begin(), phi_kernels.end(), + std::any_of(phi_kernels.begin(), + phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::NPU; }); @@ -1183,7 +1255,8 @@ bool OperatorWithKernel::SupportNPU() const { } else { auto& op_kernels = kernel_iter->second; return std::any_of( - op_kernels.begin(), op_kernels.end(), + op_kernels.begin(), + op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { return platform::is_npu_place(kern_pair.first.place_); }); @@ -1195,14 +1268,16 @@ bool OperatorWithKernel::SupportsMKLDNN( const proto::VarType::Type data_type) const { auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_); if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) { - VLOG(6) << "Warning: " << type_ << " don't find its MKLDNN Kernel in Fluid " - "Registered Kernels. And We don't " - "search its kernels in phi lib, " - "SupportsMKLDNN() return false."; + VLOG(6) << "Warning: " << type_ + << " don't find its MKLDNN Kernel in Fluid " + "Registered Kernels. 
And We don't " + "search its kernels in phi lib, " + "SupportsMKLDNN() return false."; return false; } auto& op_kernels = op_kernel_iter->second; - return std::any_of(op_kernels.begin(), op_kernels.end(), + return std::any_of(op_kernels.begin(), + op_kernels.end(), [data_type](OpKernelMap::const_reference kern_pair) { return platform::is_cpu_place(kern_pair.first.place_) && kern_pair.first.library_type_ == @@ -1366,7 +1441,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) && !is_xpu_unsupport #endif - ) { + ) { run_phi_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); @@ -1399,7 +1474,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #if defined(PADDLE_WITH_XPU_KP) || (is_xpu_unsupport && !is_xpu_kp_support) #endif - ) { + ) { auto pt_cpu_kernel_key = FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this); pt_kernel_.reset( @@ -1429,10 +1504,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, { platform::RecordEvent record_event("prepare_data", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); if (need_prepare_data_) { - transfer_scope = PrepareData(scope, *kernel_type_, - &transfered_inplace_vars, runtime_ctx); + transfer_scope = PrepareData( + scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx); } } // exec scope is the scope that kernel actually executed on. @@ -1442,9 +1518,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, if (!all_kernels_must_compute_runtime_shape_) { platform::RecordEvent record_event("infer_shape", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx); this->Info().infer_shape_(&infer_shape_ctx); + record_event.End(); + platform::RecordOpInfoSupplement( + Type(), Attrs(), infer_shape_ctx, *runtime_ctx); } if (FLAGS_enable_unused_var_check) { @@ -1456,7 +1536,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, { platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, - 1, platform::EventRole::kInnerOp); + 1, + platform::EventRole::kInnerOp); if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; // Do data transform before building KernelContext @@ -1584,7 +1665,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), + kernels_iter, + all_op_kernels.end(), platform::errors::Unavailable( "There are no kernels which are registered in the %s operator.", type_)); @@ -1706,10 +1788,12 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { kernel_iter = kernels.find(expected_kernel_key); } #endif - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", type_, - KernelTypeToString(expected_kernel_key))); + PADDLE_ENFORCE_NE( + kernel_iter, + kernels.end(), + platform::errors::NotFound("Operator (%s) does not have kernel for %s.", + type_, + KernelTypeToString(expected_kernel_key))); std::lock_guard lock(cache_update_mutex_); if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { @@ -1719,7 +1803,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { } void OperatorWithKernel::TransferInplaceVarsBack( - const Scope& scope, const 
std::vector& inplace_vars, + const Scope& scope, + const std::vector& inplace_vars, const Scope& transfer_scope) const { for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; @@ -1730,8 +1815,9 @@ void OperatorWithKernel::TransferInplaceVarsBack( auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var); auto* var = transfer_scope.FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument( - "The variable[%s] is nullptr.", var_name)); + PADDLE_ENFORCE_NOT_NULL(var, + platform::errors::InvalidArgument( + "The variable[%s] is nullptr.", var_name)); auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto original_dims = original_tensor->dims(); original_tensor->ShareDataWith(*transformed_tensor); @@ -1811,7 +1897,8 @@ void OperatorWithKernel::HandleComplexGradToRealGrad( } Scope* OperatorWithKernel::PrepareData( - const Scope& scope, const OpKernelType& expected_kernel_key, + const Scope& scope, + const OpKernelType& expected_kernel_key, std::vector* transfered_inplace_vars, RuntimeContext* ctx) const { Scope* new_scope = nullptr; @@ -1867,8 +1954,8 @@ Scope* OperatorWithKernel::PrepareData( input_vars[i] = trans_var; auto out = trans_var->GetMutable(); out->Resize(tensor_in->dims()); - platform::MatchShapeToLayout(out, tensor_in->layout(), - DataLayout::kNHWC); + platform::MatchShapeToLayout( + out, tensor_in->layout(), DataLayout::kNHWC); VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , " "but kNHWC layout" << var_name_item.first << " in Operator " << type_; @@ -1915,8 +2002,8 @@ Scope* OperatorWithKernel::PrepareData( if (!run_by_executor_ && (platform::is_gpu_place(kernel_type_for_var.place_) || platform::is_gpu_place(expected_kernel_key.place_))) { - new_scope = TryCreateTransferScope(kernel_type_for_var, - expected_kernel_key, &scope); + new_scope = TryCreateTransferScope( + kernel_type_for_var, expected_kernel_key, &scope); enable_cache_transfer_scope_ = true; } if (!new_scope) { @@ -1978,7 +2065,8 @@ Scope* OperatorWithKernel::PrepareData( } void OperatorWithKernel::ParseInputDataType( - const Variable* var, const std::string& name, + const Variable* var, + const std::string& name, proto::VarType::Type* data_type) const { if (var != nullptr) { const Tensor* t = nullptr; @@ -1998,17 +2086,20 @@ void OperatorWithKernel::ParseInputDataType( } if (t != nullptr) { PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " "contains uninitialized Tensor.", - Type(), name)); + Type(), + name)); *data_type = paddle::framework::TransToProtoVarType(t->dtype()); } } } void OperatorWithKernel::ParseMultiInputDataType( - const std::vector& vars, const std::string& name, + const std::vector& vars, + const std::string& name, proto::VarType::Type* data_type) const { proto::VarType::Type default_data_type = static_cast(-1); @@ -2032,10 +2123,12 @@ void OperatorWithKernel::ParseMultiInputDataType( } if (t != nullptr) { PADDLE_ENFORCE_EQ( - t->IsInitialized(), true, + t->IsInitialized(), + true, platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " "contains uninitialized Tensor.", - Type(), name)); + Type(), + name)); proto::VarType::Type tmp = paddle::framework::TransToProtoVarType(t->dtype()); PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type, @@ -2045,7 +2138,9 @@ void OperatorWithKernel::ParseMultiInputDataType( "consistent or 
reigster GetExpectedKernelType. The " "current variable type is (%s), but the " "previous variable type is (%s).", - Type(), name, DataTypeToString(tmp), + Type(), + name, + DataTypeToString(tmp), DataTypeToString(*data_type))); *data_type = tmp; } @@ -2066,7 +2161,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( } } PADDLE_ENFORCE_NE( - data_type, dafault_data_type, + data_type, + dafault_data_type, platform::errors::NotFound( "DataType should be indicated by input Variable at %s.", Type())); return data_type; @@ -2083,12 +2179,14 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType( ParseMultiInputDataType(ctx.MultiInputVar(name), name, &data_type); } PADDLE_ENFORCE_NE( - data_type, dafault_data_type, + data_type, + dafault_data_type, platform::errors::InvalidArgument( "The Input Variable(%s) of (%s) Operator used to determine kernel " "data type is empty or not LoDTensor or SelectedRows or " "LoDTensorArray.", - name, Type())); + name, + Type())); return data_type; } @@ -2120,11 +2218,14 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( t, platform::errors::InvalidArgument( "The Tensor of variable %s is nullptr when promote complex types.")); - PADDLE_ENFORCE_EQ(t->IsInitialized(), true, + PADDLE_ENFORCE_EQ(t->IsInitialized(), + true, platform::errors::InvalidArgument( "The Tensor in the %s Op's Input Variable %s(%s) is " "not initialized.", - Type(), name, ctx.InputName(name))); + Type(), + name, + ctx.InputName(name))); return t; } @@ -2136,7 +2237,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely( * the kernel data type. */ proto::VarType::Type OperatorWithKernel::IndicateOrPromoteVarDataTypes( - const ExecutionContext& ctx, const std::string& name1, + const ExecutionContext& ctx, + const std::string& name1, const std::string& name2) const { // 1. 
Get tensor auto* tensor_a = GetTensorFormInputSafely(ctx, name1); @@ -2158,10 +2260,11 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType( } OpKernelType OperatorWithKernel::GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, + const Tensor& tensor, const OpKernelType& expected_kernel_type) const { - return OpKernelType(expected_kernel_type.data_type_, tensor.place(), - tensor.layout()); + return OpKernelType( + expected_kernel_type.data_type_, tensor.place(), tensor.layout()); } phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( @@ -2172,8 +2275,9 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( if (arg_map_fn) { arg_map_fn_.reset(new phi::ArgumentMappingFn(*arg_map_fn)); } else { - auto func = [this]( - const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature { + auto func = + [this]( + const phi::ArgumentMappingContext& ctx) -> phi::KernelSignature { return phi::DefaultKernelSignatureMap::Instance().Get(type_); }; arg_map_fn_.reset(new phi::ArgumentMappingFn(func)); @@ -2183,16 +2287,19 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( } Scope* OperatorWithKernel::PreparePhiData( - const Scope& scope, const phi::Kernel& pt_kernel, + const Scope& scope, + const phi::Kernel& pt_kernel, const phi::KernelSignature& pt_kernel_signature, RuntimeContext* ctx) const { const auto& input_names = pt_kernel_signature.input_names; auto input_defs = pt_kernel.args_def().input_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), platform::errors::InvalidArgument( "The size of inputs_args names (%d) must be equal to " "the size of kernel input_defs (%d).", - input_names.size(), input_defs.size())); + input_names.size(), + input_defs.size())); Scope* new_scope = nullptr; auto& name_map = Inputs(); const std::unordered_set* no_buffer_ins = nullptr; @@ -2279,7 +2386,8 @@ Scope* OperatorWithKernel::PreparePhiData( } void OperatorWithKernel::BuildPhiKernelContext( - const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, + const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx, phi::KernelContext* pt_kernel_context) const { pt_kernel_context->SetDeviceContext(dev_ctx); @@ -2291,23 +2399,29 @@ void OperatorWithKernel::BuildPhiKernelContext( auto attr_defs = pt_kernel_->args_def().attribute_defs(); auto output_defs = pt_kernel_->args_def().output_defs(); - PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(), + PADDLE_ENFORCE_EQ(input_names.size(), + input_defs.size(), platform::errors::InvalidArgument( "The size of inputs_args names (%d) must be equal to " "the size of kernel input_defs (%d).", - input_names.size(), input_defs.size())); + input_names.size(), + input_defs.size())); - PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(), + PADDLE_ENFORCE_EQ(output_names.size(), + output_defs.size(), platform::errors::InvalidArgument( "The size of outputs_args names (%d) must be equal to " "the size of kernel output_defs (%d).", - output_names.size(), output_defs.size())); + output_names.size(), + output_defs.size())); - PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(), + PADDLE_ENFORCE_EQ(attr_names.size(), + attr_defs.size(), platform::errors::InvalidArgument( "The size of attribute_args names (%d) must be equal " "to the size of kernel attribute_defs (%d).", - attr_names.size(), attr_defs.size())); + attr_names.size(), + attr_defs.size())); for (size_t i = 0; i < input_names.size(); ++i) 
{ auto it = ctx.inputs.find(input_names[i]); @@ -2489,7 +2603,8 @@ void OperatorWithKernel::BuildPhiKernelContext( break; case phi::AttributeType::SCALARS: { PADDLE_ENFORCE_NE( - attr_iter, Attrs().end(), + attr_iter, + Attrs().end(), platform::errors::NotFound("(%s) is not found in AttributeMap when " "buildind static KernelContext.", attr_names[i])); @@ -2553,7 +2668,8 @@ void OperatorWithKernel::BuildPhiKernelContext( } break; default: { PADDLE_ENFORCE_NE( - attr_iter, Attrs().end(), + attr_iter, + Attrs().end(), platform::errors::NotFound("(%s) is not found in AttributeMap when " "buildind static KernelContext.", attr_names[i])); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt old mode 100644 new mode 100755 index 5af13f76b36bd..ef9cde883fb01 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,137 +1,264 @@ -cc_library(allocator SRCS allocator.cc DEPS place stats) -cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) -cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) -cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) -cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) -cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler) -cc_test(naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc DEPS naive_best_fit_allocator) -cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) - -if (WITH_MKLDNN) +cc_library( + allocator + SRCS allocator.cc + DEPS place stats profiler) +cc_library( + cpu_allocator + SRCS cpu_allocator.cc + DEPS allocator) +cc_library( + locked_allocator + SRCS locked_allocator.cc + DEPS allocator) +cc_library( + buffered_allocator + SRCS buffered_allocator.cc + DEPS allocator) +cc_library( + best_fit_allocator + SRCS best_fit_allocator.cc + DEPS allocator) +cc_library( + naive_best_fit_allocator + SRCS naive_best_fit_allocator.cc + DEPS allocator buddy_allocator) +cc_test( + naive_best_fit_allocator_test + SRCS naive_best_fit_allocator_test.cc + DEPS naive_best_fit_allocator) +cc_test( + buffered_allocator_test + SRCS buffered_allocator_test.cc + DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) + +if(WITH_MKLDNN) set(MKLDNN_CTX_DEPS mkldnn) -else () +else() set(MKLDNN_CTX_DEPS) endif() -if (WITH_GPU) - nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats) - nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) - nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator cuda_graph) - nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) +if(WITH_GPU) + nv_library( + cuda_allocator + SRCS cuda_allocator.cc + DEPS allocator cuda_device_guard stats) + nv_library( + cuda_managed_allocator + SRCS cuda_managed_allocator.cc + DEPS allocator cuda_device_guard gpu_info) + nv_library( + pinned_allocator + SRCS pinned_allocator.cc + DEPS allocator) + nv_library( + stream_safe_cuda_allocator + SRCS stream_safe_cuda_allocator.cc + DEPS allocator cuda_graph) + nv_library( + thread_local_allocator + SRCS thread_local_allocator.cc + DEPS allocator) - cc_test(thread_local_allocator_test SRCS 
thread_local_allocator_test.cc DEPS thread_local_allocator) + cc_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS thread_local_allocator) if(CUDA_VERSION GREATER_EQUAL 10.2) - nv_library(cuda_virtual_mem_allocator SRCS cuda_virtual_mem_allocator.cc DEPS dynload_cuda) + nv_library( + cuda_virtual_mem_allocator + SRCS cuda_virtual_mem_allocator.cc + DEPS dynload_cuda) endif() endif() -if (WITH_ROCM) - hip_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard stats) - hip_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) - hip_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - hip_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) - hip_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) - - cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) +if(WITH_ROCM) + hip_library( + cuda_allocator + SRCS cuda_allocator.cc + DEPS allocator cuda_device_guard stats) + hip_library( + cuda_managed_allocator + SRCS cuda_managed_allocator.cc + DEPS allocator cuda_device_guard gpu_info) + hip_library( + pinned_allocator + SRCS pinned_allocator.cc + DEPS allocator) + hip_library( + stream_safe_cuda_allocator + SRCS stream_safe_cuda_allocator.cc + DEPS allocator) + hip_library( + thread_local_allocator + SRCS thread_local_allocator.cc + DEPS allocator) + + cc_test( + thread_local_allocator_test + SRCS thread_local_allocator_test.cc + DEPS thread_local_allocator) endif() -if (WITH_ASCEND_CL) - cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) - cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info) +if(WITH_ASCEND_CL) + cc_library( + npu_allocator + SRCS npu_allocator.cc + DEPS allocator npu_info) + cc_library( + npu_pinned_allocator + SRCS npu_pinned_allocator.cc + DEPS allocator npu_info) endif() -cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) +cc_library( + retry_allocator + SRCS retry_allocator.cc + DEPS allocator) -if (WITH_GPU OR WITH_ROCM) - set(AllocatorFacadeDeps gpu_info cuda_allocator cuda_managed_allocator pinned_allocator cuda_device_guard thread_local_allocator stream_safe_cuda_allocator device_context) - if(CUDA_VERSION GREATER_EQUAL 10.2) - list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator) - endif() +if(WITH_GPU OR WITH_ROCM) + set(AllocatorFacadeDeps + gpu_info + cuda_allocator + cuda_managed_allocator + pinned_allocator + cuda_device_guard + thread_local_allocator + stream_safe_cuda_allocator + device_context) + if(CUDA_VERSION GREATER_EQUAL 10.2) + list(APPEND AllocatorFacadeDeps cuda_virtual_mem_allocator) + endif() elseif(WITH_XPU) - set(AllocatorFacadeDeps xpu_info) + set(AllocatorFacadeDeps xpu_info) elseif(WITH_IPU) - set(AllocatorFacadeDeps ipu_info) + set(AllocatorFacadeDeps ipu_info) elseif(WITH_ASCEND) - set(AllocatorFacadeDeps ascend_npu_info) -else () - set(AllocatorFacadeDeps) + set(AllocatorFacadeDeps ascend_npu_info) +else() + set(AllocatorFacadeDeps) endif() -if (WITH_CUSTOM_DEVICE) - cc_library(custom_allocator SRCS custom_allocator.cc DEPS allocator device_manager) +if(WITH_CUSTOM_DEVICE) + cc_library( + custom_allocator + SRCS custom_allocator.cc + DEPS allocator device_manager) set(AllocatorFacadeDeps ${AllocatorFacadeDeps} custom_allocator) endif() -if (WITH_GPU) - nv_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - best_fit_allocator_test.cu 
- DEPS best_fit_allocator - locked_allocator - cpu_allocator - cuda_allocator - device_context - memcpy) -elseif (WITH_ROCM) - hip_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - best_fit_allocator_test.cu - DEPS best_fit_allocator - locked_allocator - cpu_allocator - cuda_allocator - device_context - memcpy) +if(WITH_GPU) + nv_test( + best_fit_allocator_test + SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu + DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator + device_context memcpy) +elseif(WITH_ROCM) + hip_test( + best_fit_allocator_test + SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu + DEPS best_fit_allocator locked_allocator cpu_allocator cuda_allocator + device_context memcpy) else() - cc_test(best_fit_allocator_test - SRCS best_fit_allocator_test.cc - DEPS best_fit_allocator - locked_allocator - cpu_allocator) + cc_test( + best_fit_allocator_test + SRCS best_fit_allocator_test.cc + DEPS best_fit_allocator locked_allocator cpu_allocator) endif() -list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator virtual_memory_auto_growth_best_fit_allocator best_fit_allocator) +list( + APPEND + AllocatorFacadeDeps + cpu_allocator + locked_allocator + aligned_allocator + retry_allocator + buffered_allocator + naive_best_fit_allocator + auto_growth_best_fit_allocator + virtual_memory_auto_growth_best_fit_allocator + best_fit_allocator) -if (WITH_ASCEND_CL) - list(APPEND AllocatorFacadeDeps npu_pinned_allocator) +if(WITH_ASCEND_CL) + list(APPEND AllocatorFacadeDeps npu_pinned_allocator) endif() +cc_library( + aligned_allocator + SRCS aligned_allocator.cc + DEPS allocator) +cc_test( + test_aligned_allocator + SRCS test_aligned_allocator.cc + DEPS aligned_allocator) +cc_library( + allocator_strategy + SRCS allocator_strategy.cc + DEPS gflags ${AllocatorFacadeDeps}) +cc_library( + allocator_facade + SRCS allocator_facade.cc + DEPS allocator_strategy stats) -cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) -cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) -cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy stats) - -if (WITH_GPU) +if(WITH_GPU) target_link_libraries(allocator_facade cuda_graph) endif() -cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) -if (WITH_TESTING) - if ((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test) +cc_test( + retry_allocator_test + SRCS retry_allocator_test.cc + DEPS retry_allocator locked_allocator cpu_allocator) +if(WITH_TESTING) + if((WITH_GPU OR WITH_ROCM) AND TARGET retry_allocator_test) target_link_libraries(retry_allocator_test cuda_allocator) endif() - if (TEST retry_allocator_test) - set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") + if(TEST retry_allocator_test) + set_tests_properties(retry_allocator_test PROPERTIES LABELS + "RUN_TYPE=EXCLUSIVE") endif() endif() -cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade) +cc_test( + allocator_facade_abs_flags_test + SRCS allocator_facade_abs_flags_test.cc + DEPS allocator_facade) -cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade) +cc_test( + 
allocator_facade_frac_flags_test + SRCS allocator_facade_frac_flags_test.cc + DEPS allocator_facade) -cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator flags) -cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator) -cc_test(auto_growth_best_fit_allocator_test SRCS auto_growth_best_fit_allocator_test.cc DEPS auto_growth_best_fit_allocator) +cc_library( + auto_growth_best_fit_allocator + SRCS auto_growth_best_fit_allocator.cc + DEPS allocator aligned_allocator flags) +cc_test( + auto_growth_best_fit_allocator_facade_test + SRCS auto_growth_best_fit_allocator_facade_test.cc + DEPS cpu_allocator auto_growth_best_fit_allocator) +cc_test( + auto_growth_best_fit_allocator_test + SRCS auto_growth_best_fit_allocator_test.cc + DEPS auto_growth_best_fit_allocator) -cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator) +cc_library( + virtual_memory_auto_growth_best_fit_allocator + SRCS virtual_memory_auto_growth_best_fit_allocator.cc + DEPS allocator aligned_allocator) if(NOT WIN32) - cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) - cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) - if (WITH_GPU) - cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator) + cc_library( + mmap_allocator + SRCS mmap_allocator.cc + DEPS allocator) + cc_test( + mmap_allocator_test + SRCS mmap_allocator_test.cc + DEPS mmap_allocator allocator) + if(WITH_GPU) + cc_library( + cuda_ipc_allocator + SRCS cuda_ipc_allocator.cc + DEPS allocator) endif() endif(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 5efbfce7fedd6..52cb4dd18a814 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -33,7 +33,8 @@ #endif PADDLE_DEFINE_EXPORTED_bool( - init_allocated_mem, false, + init_allocated_mem, + false, "It is a mistake that the values of the memory allocated by " "BuddyAllocator are always zeroed in some op's implementation. 
" "To find this error in time, we use init_allocated_mem to indicate " @@ -78,7 +79,8 @@ BuddyAllocator *GetCPUBuddyAllocator() { std::call_once(init_flag, []() { a = new detail::BuddyAllocator( std::unique_ptr(new detail::CPUAllocator), - platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); }); return a; @@ -96,7 +98,8 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { } template <> -void Free(const platform::CPUPlace &place, void *p, +void Free(const platform::CPUPlace &place, + void *p, size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); @@ -126,7 +129,8 @@ void *Alloc(const platform::IPUPlace &place, size_t size) { return p; } template <> -void Free(const platform::IPUPlace &place, void *p, +void Free(const platform::IPUPlace &place, + void *p, size_t size) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); @@ -155,7 +159,8 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { ret = xpu_malloc(reinterpret_cast(&p), size); } PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, + ret, + XPU_SUCCESS, platform::errors::External( "XPU API return wrong value[%d], no enough memory", ret)); if (FLAGS_init_allocated_mem) { @@ -172,7 +177,8 @@ void *Alloc(const platform::XPUPlace &place, size_t size) { } template <> -void Free(const platform::XPUPlace &place, void *p, +void Free(const platform::XPUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_XPU VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); @@ -235,11 +241,13 @@ class NPUBuddyAllocatorList { BuddyAllocator *Get(int npu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetNPUDeviceId(devices_[pos]); @@ -247,7 +255,8 @@ class NPUBuddyAllocatorList { new BuddyAllocator(std::unique_ptr( new detail::NPUAllocator(devices_[pos])), platform::NPUMinChunkSize(), - platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE)); + platform::NPUMaxChunkSize(), + EXTRA_PADDING_SIZE)); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "'FLAGS_fraction_of_gpu_memory_to_use' " @@ -313,8 +322,10 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize " "%s, NpuMaxChunkSize %s, NPU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -332,7 +343,8 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { } template <> -void Free(const platform::NPUPlace &place, void *p, +void Free(const platform::NPUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_ASCEND_CL VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -385,7 +397,8 
@@ void *Alloc(const platform::NPUPinnedPlace &place, template <> void Free(const platform::NPUPinnedPlace &place, - void *p, size_t size) { + void *p, + size_t size) { #ifdef PADDLE_WITH_ASCEND_CL GetNPUPinnedBuddyAllocator()->Free(p); #else @@ -431,18 +444,21 @@ class GPUBuddyAllocatorList { BuddyAllocator *Get(int gpu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetDeviceId(devices_[pos]); - allocators_[pos].reset(new BuddyAllocator( - std::unique_ptr( - new detail::GPUAllocator(devices_[pos])), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize())); + allocators_[pos].reset( + new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(devices_[pos])), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize())); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "'FLAGS_fraction_of_gpu_memory_to_use' " @@ -494,8 +510,10 @@ void *Alloc(const platform::CUDAPlace &place, PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " "%s, GpuMaxChunkSize %s, GPU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -516,7 +534,8 @@ void *Alloc(const platform::CUDAPlace &place, } template <> -void Free(const platform::CUDAPlace &place, void *p, +void Free(const platform::CUDAPlace &place, + void *p, size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GetGPUBuddyAllocator(place.device)->Free(p); @@ -585,7 +604,8 @@ void *Alloc(const platform::CUDAPinnedPlace &place, template <> void Free(const platform::CUDAPinnedPlace &place, - void *p, size_t size) { + void *p, + size_t size) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GetCUDAPinnedBuddyAllocator()->Free(p); #else @@ -631,18 +651,21 @@ class MLUBuddyAllocatorList { BuddyAllocator *Get(int mlu_id) { auto pos = std::distance( devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id)); - PADDLE_ENFORCE_LT(pos, devices_.size(), + PADDLE_ENFORCE_LT(pos, + devices_.size(), platform::errors::OutOfRange( "The index exceeds the size of devices, the size of " "devices is %d, the index is %d", - devices_.size(), pos)); + devices_.size(), + pos)); std::call_once(*init_flags_[pos], [this, pos] { platform::SetMLUDeviceId(devices_[pos]); - allocators_[pos].reset(new BuddyAllocator( - std::unique_ptr( - new detail::MLUAllocator(devices_[pos])), - platform::MLUMinChunkSize(), platform::MLUMaxChunkSize())); + allocators_[pos].reset( + new BuddyAllocator(std::unique_ptr( + new detail::MLUAllocator(devices_[pos])), + platform::MLUMinChunkSize(), + platform::MLUMaxChunkSize())); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " << "(mlu reuse gpu GFlags) " @@ -694,8 +717,10 @@ void *Alloc(const platform::MLUPlace &place, size_t size) { 
PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize " "%s, MLUMinChunkSize %s, MLU memory used: %s.", - string::HumanReadableSize(size), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(Used(place)))); @@ -712,7 +737,8 @@ void *Alloc(const platform::MLUPlace &place, size_t size) { } template <> -void Free(const platform::MLUPlace &place, void *p, +void Free(const platform::MLUPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_MLU VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -760,10 +786,12 @@ class BuddyAllocatorList { } BuddyAllocator *Get(int dev_id) { - PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(), + PADDLE_ENFORCE_NE(init_flags_.find(dev_id), + init_flags_.end(), platform::errors::OutOfRange( "Cannot find %s %d, please check visible devices.", - device_type_, dev_id)); + device_type_, + dev_id)); std::call_once(*init_flags_[dev_id], [this, dev_id] { phi::DeviceManager::SetDevice(device_type_, dev_id); @@ -774,7 +802,8 @@ class BuddyAllocatorList { new detail::CustomAllocator(device_type_, dev_id)), phi::DeviceManager::GetMinChunkSize(place), phi::DeviceManager::GetMaxChunkSize(place), - phi::DeviceManager::GetExtraPaddingSize(place), device_type_)); + phi::DeviceManager::GetExtraPaddingSize(place), + device_type_)); }); return allocators_[dev_id].get(); @@ -814,8 +843,11 @@ void *Alloc(const platform::CustomPlace &place, PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in %s:%d, avaliable %s, total %s, used " "%s. 
", - string::HumanReadableSize(size), place.GetDeviceType(), place.device, - string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(size), + place.GetDeviceType(), + place.device, + string::HumanReadableSize(avail), + string::HumanReadableSize(total), string::HumanReadableSize(total - avail))); } else { if (FLAGS_init_allocated_mem) { @@ -831,7 +863,8 @@ void *Alloc(const platform::CustomPlace &place, } template <> -void Free(const platform::CustomPlace &place, void *p, +void Free(const platform::CustomPlace &place, + void *p, size_t size) { #ifdef PADDLE_WITH_CUSTOM_DEVICE VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); @@ -923,8 +956,6 @@ namespace allocation { phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size)); auto *tmp_alloc = new Allocation(ptr, size, place_); - platform::MemEvenRecorder::Instance().PushMemRecord( - static_cast(tmp_alloc), place_, size); return tmp_alloc; } @@ -932,8 +963,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) { paddle::platform::VisitPlace( allocation->place(), legacy::FreeVisitor(allocation->ptr(), allocation->size())); - platform::MemEvenRecorder::Instance().PopMemRecord( - static_cast(allocation), place_); delete allocation; } diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 276c6bb0e69b8..f1c0178fafc02 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/memory/allocation/pinned_allocator.h" +#include "paddle/fluid/memory/stats.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" namespace paddle { namespace memory { namespace allocation { @@ -24,6 +26,11 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) { #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size()); + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::ReservedFree); delete allocation; } phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { @@ -33,6 +40,11 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { #else PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + platform::RecordMemEvent(ptr, + platform::CUDAPinnedPlace(), + size, + platform::TracerMemEventType::ReservedAllocate); return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/stat_allocator.h b/paddle/fluid/memory/allocation/stat_allocator.h index 71569366c2446..ef999dddf4591 100644 --- a/paddle/fluid/memory/allocation/stat_allocator.h +++ b/paddle/fluid/memory/allocation/stat_allocator.h @@ -16,6 +16,7 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/stats.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" namespace paddle { namespace memory { @@ -30,16 +31,38 @@ class StatAllocator : public Allocator { protected: void FreeImpl(phi::Allocation* allocation) override { - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - -allocation->size()); + if (platform::is_cpu_place(allocation->place()) || + platform::is_cuda_pinned_place(allocation->place())) { + 
HOST_MEMORY_STAT_UPDATE( + Allocated, allocation->place().GetDeviceId(), -allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE( + Allocated, allocation->place().GetDeviceId(), -allocation->size()); + } + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::Free); underlying_allocator_->Free(allocation); } phi::Allocation* AllocateImpl(size_t size) override { phi::Allocator::AllocationPtr allocation = underlying_allocator_->Allocate(size); - MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(), - allocation->size()); + + const platform::Place& place = allocation->place(); + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + HOST_MEMORY_STAT_UPDATE( + Allocated, place.GetDeviceId(), allocation->size()); + } else { + DEVICE_MEMORY_STAT_UPDATE( + Allocated, place.GetDeviceId(), allocation->size()); + } + platform::RecordMemEvent(allocation->ptr(), + allocation->place(), + allocation->size(), + platform::TracerMemEventType::Allocate); return allocation.release(); } diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 37ac0b4483291..f79b97de18414 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -39,6 +39,7 @@ limitations under the License. */ #endif #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" DECLARE_bool(use_pinned_memory); DECLARE_double(fraction_of_gpu_memory_to_use); @@ -62,12 +63,14 @@ void* AlignedMalloc(size_t size) { #else int error = posix_memalign(&p, alignment, size); PADDLE_ENFORCE_EQ( - error, 0, + error, + 0, platform::errors::ResourceExhausted( "Fail to alloc memory of %ld size, error code is %d.", size, error)); #endif - PADDLE_ENFORCE_NOT_NULL(p, platform::errors::ResourceExhausted( - "Fail to alloc memory of %ld size.", size)); + PADDLE_ENFORCE_NOT_NULL(p, + platform::errors::ResourceExhausted( + "Fail to alloc memory of %ld size.", size)); return p; } @@ -92,6 +95,9 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) { } } + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate); return p; } @@ -108,6 +114,10 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { #else free(p); #endif + + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree); } bool CPUAllocator::UseGpu() const { return false; } @@ -140,7 +150,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum GPU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -155,20 +166,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - gpu_id_, string::HumanReadableSize(size), gpu_id_, - string::HumanReadableSize(allocated), string::HumanReadableSize(avail), - gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + gpu_id_, + string::HumanReadableSize(size), + gpu_id_, + string::HumanReadableSize(allocated), + string::HumanReadableSize(avail), + gpu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void GPUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(gpu_alloc_size_, size, + PADDLE_ENFORCE_EQ(index, + 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(gpu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, gpu_alloc_size_)); + size, + gpu_alloc_size_)); gpu_alloc_size_ -= size; platform::RecordedGpuFree(p, size, gpu_id_); @@ -205,6 +225,9 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { if (result == gpuSuccess) { *index = 1; // PINNED memory cuda_pinnd_alloc_size_ += size; + HOST_MEMORY_STAT_UPDATE(Reserved, 0, size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate); return p; } else { LOG(WARNING) << "cudaHostAlloc failed."; @@ -216,20 +239,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { gpuError_t err; - PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( - "The index should be 1, but got %d", index)); + PADDLE_ENFORCE_EQ(index, + 1, + platform::errors::InvalidArgument( + "The index should be 1, but got %d", index)); - PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size, + PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated cuda pinned memory (%d)", - size, cuda_pinnd_alloc_size_)); + size, + cuda_pinnd_alloc_size_)); cuda_pinnd_alloc_size_ -= size; #ifdef PADDLE_WITH_HIP err = hipHostFree(p); if (err != hipErrorDeinitialized) { PADDLE_ENFORCE_EQ( - err, hipSuccess, + err, + hipSuccess, platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } @@ -243,12 +271,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { // cudaFreeHost succeeds. if (err != cudaErrorCudartUnloading) { PADDLE_ENFORCE_EQ( - err, 0, + err, + 0, platform::errors::Fatal( "cudaFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } #endif + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree); } bool CUDAPinnedAllocator::UseGpu() const { return false; } @@ -279,7 +311,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum GPU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -294,21 +327,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - npu_id_, string::HumanReadableSize(size), npu_id_, - string::HumanReadableSize(avail), npu_id_, - FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + npu_id_, + string::HumanReadableSize(size), + npu_id_, + string::HumanReadableSize(avail), + npu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void NPUAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(npu_alloc_size_, size, + PADDLE_ENFORCE_EQ(index, + 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(npu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, npu_alloc_size_)); + size, + npu_alloc_size_)); npu_alloc_size_ -= size; platform::RecordedNPUFree(p, size, npu_id_); @@ -347,20 +388,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) { void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) { aclError err; - PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( - "The index should be 1, but got %d", index)); + PADDLE_ENFORCE_EQ(index, + 1, + platform::errors::InvalidArgument( + "The index should be 1, but got %d", index)); - PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size, + PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated npu pinned memory (%d)", - size, npu_pinnd_alloc_size_)); + size, + npu_pinnd_alloc_size_)); npu_pinnd_alloc_size_ -= size; err = platform::NPUHostFree(p); if (err != ACL_ERROR_NONE) { PADDLE_ENFORCE_EQ( - err, 0, + err, + 0, platform::errors::Fatal( "NPUHostFree failed in NPUPinnedAllocator, error code is %d", err)); } @@ -395,7 +441,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " "maximum MLU memory usage is limited to %d MB.\n" " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", - limit_size, limit_size); + limit_size, + limit_size); } PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( @@ -410,20 +457,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) { "please set it to a higher value but less than 1.0.\n" " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", - mlu_id_, string::HumanReadableSize(size), mlu_id_, - string::HumanReadableSize(allocated), string::HumanReadableSize(avail), - mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + mlu_id_, + string::HumanReadableSize(size), + mlu_id_, + string::HumanReadableSize(allocated), + string::HumanReadableSize(avail), + mlu_id_, + FLAGS_fraction_of_gpu_memory_to_use, + err_msg)); } } void MLUAllocator::Free(void* p, size_t size, size_t index) { - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(mlu_alloc_size_, size, + PADDLE_ENFORCE_EQ(index, + 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(mlu_alloc_size_, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, mlu_alloc_size_)); + size, + mlu_alloc_size_)); mlu_alloc_size_ -= size; platform::RecordedMLUFree(p, size, mlu_id_); @@ -452,7 +508,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { "\n\nOut of memory error on %s %d. " "total memory is %s, used memory is %s, " "available memory is only %s.\n\n", - dev_type_, dev_id_, string::HumanReadableSize(total), + dev_type_, + dev_id_, + string::HumanReadableSize(total), string::HumanReadableSize(total - avail), string::HumanReadableSize(avail))); } @@ -461,13 +519,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { void CustomAllocator::Free(void* p, size_t size, size_t index) { VLOG(4) << "CustomAllocator::Free " << p << " size " << size; - PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( - "The index should be 0, index is %d", index)); - PADDLE_ENFORCE_GE(plug_alloc_size, size, + PADDLE_ENFORCE_EQ(index, + 0, + platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(plug_alloc_size, + size, platform::errors::InvalidArgument( "The size of memory (%d) to free exceeds the size of " "allocated gpu memory (%d)", - size, plug_alloc_size)); + size, + plug_alloc_size)); plug_alloc_size -= size; auto place = platform::CustomPlace(dev_type_, dev_id_); auto device = phi::DeviceManager::GetDeviceWithPlace(place); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 3198b4f8d935e..ae2c0aa612e77 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/common/place.h" #ifdef PADDLE_WITH_XPU @@ -33,8 +33,12 @@ namespace memory { #ifdef PADDLE_WITH_CUSTOM_DEVICE template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place, - const void* src, size_t num, void* stream) { + platform::CPUPlace dst_place, + void* dst, + platform::CustomPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); @@ -52,8 +56,12 @@ void Copy( template <> void Copy( - platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::CustomPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place); @@ -70,8 +78,12 @@ void Copy( template <> void Copy( - platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place, - const void* src, size_t num, void* stream) { + platform::CustomPlace dst_place, + void* dst, + platform::CustomPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; auto src_type = platform::PlaceHelper::GetDeviceType(src_place); @@ -102,9 +114,11 @@ void Copy( #endif // PADDLE_WITH_CUSTOM_DEVICE template <> -void Copy(platform::CPUPlace, void* dst, +void Copy(platform::CPUPlace, + void* dst, platform::CPUPlace, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); @@ -115,7 +129,8 @@ template <> void Copy(platform::IPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -123,7 +138,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::IPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -131,15 +147,18 @@ template <> void Copy(platform::IPUPlace dst_place, void* dst, platform::IPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } // NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace). template <> -void Copy(phi::IPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::IPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (src_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_src; @@ -152,8 +171,10 @@ void Copy(phi::IPUPlace dst_place, void* dst, // NOTE: only for (IPUPlace) -> (CPUPlace and IPUPlace). 
template <> -void Copy(phi::Place dst_place, void* dst, - phi::IPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::IPUPlace src_place, + const void* src, size_t num) { if (dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst; @@ -170,7 +191,8 @@ template <> void Copy(platform::XPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; return; @@ -182,7 +204,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::XPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; return; @@ -194,7 +217,8 @@ template <> void Copy(platform::XPUPlace dst_place, void* dst, platform::XPUPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { if (num <= 0) { VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; return; @@ -204,8 +228,10 @@ void Copy(platform::XPUPlace dst_place, // NOTE: only for (CPUPlace and XPUPlace) -> (XPUPlace). template <> -void Copy(phi::XPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::XPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (src_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_src; @@ -218,8 +244,10 @@ void Copy(phi::XPUPlace dst_place, void* dst, // NOTE: only for (XPUPlace) -> (CPUPlace and XPUPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::XPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::XPUPlace src_place, + const void* src, size_t num) { if (dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst; @@ -236,7 +264,8 @@ template <> void Copy(platform::NPUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -248,7 +277,10 @@ void Copy(platform::NPUPlace dst_place, if (stream) { platform::RecordEvent record_event( "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while sync operation @@ -267,7 +299,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -279,7 +312,10 @@ void Copy(platform::CPUPlace dst_place, if (stream) { platform::RecordEvent record_event( "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -295,7 +331,8 @@ template <> void Copy(platform::NPUPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -307,7 +344,10 @@ void Copy(platform::NPUPlace dst_place, 
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = @@ -329,7 +369,10 @@ void Copy(platform::NPUPlace dst_place, platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_DEVICE, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = @@ -346,8 +389,11 @@ void Copy(platform::NPUPlace dst_place, template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num) { + platform::CPUPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -356,8 +402,11 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -366,8 +415,11 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, - platform::NPUPinnedPlace src_place, const void* src, size_t num) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -376,8 +428,12 @@ void Copy( template <> void Copy( - platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::NPUPinnedPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(src_place.device); @@ -389,7 +445,10 @@ void Copy( platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_DEVICE_TO_HOST, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -404,8 +463,12 @@ void Copy( template <> void Copy( - platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, - const void* src, size_t num, void* stream) { + platform::NPUPlace dst_place, + void* dst, + platform::NPUPinnedPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetNPUDeviceId(dst_place.device); @@ -417,7 +480,10 @@ void Copy( platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU", platform::TracerEventType::UserDefined, 1); - platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, + platform::NPUMemcpyAsync(dst, + src, + num, + ACL_MEMCPY_HOST_TO_DEVICE, reinterpret_cast(stream)); } else { // On NPU, async operation after sync operation is ok, while 
sync operation @@ -435,9 +501,12 @@ void Copy( // NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -504,52 +573,76 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, aclrtStream stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + aclrtStream stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace) template <> -void Copy(phi::NPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, aclrtStream stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::NPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + aclrtStream stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::NPUPlace src_place, const void* src, - size_t num, aclrtStream stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::NPUPlace src_place, + const void* src, + size_t num, + aclrtStream stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace) template <> void Copy(phi::NPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num, + void* dst, + phi::Place src_place, + const void* src, + size_t num, aclrtStream stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::NPUPinnedPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, aclrtStream stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -557,16 +650,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace) -> (NPUPinnedPlace) template <> void Copy(phi::NPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num) { + void* dst, + phi::Place src_place, + const void* src, + size_t num) { 
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); } // NOTE: only for (NPUPinnedPlace) -> (CPUPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::NPUPinnedPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); } #endif @@ -608,8 +705,12 @@ inline void SyncCUDAStream() { template <> void Copy( - platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, void* stream) { + platform::CPUPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); @@ -619,10 +720,16 @@ void Copy( platform::RecordEvent record_event( "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToHost, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToHost, reinterpret_cast(stream)); #endif } else { @@ -642,8 +749,12 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, void* stream) { + platform::CUDAPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); @@ -653,10 +764,16 @@ void Copy( platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyHostToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyHostToDevice, reinterpret_cast(stream)); #endif } else { @@ -676,8 +793,12 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, - const void* src, size_t num, void* stream) { + platform::CUDAPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -689,10 +810,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToDevice, reinterpret_cast(stream)); #endif } else { @@ -710,22 +837,29 @@ void Copy( platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU", platform::TracerEventType::UserDefined, 1); - platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, reinterpret_cast(stream)); + platform::GpuMemcpyPeerAsync(dst, + dst_place.device, + src, + src_place.device, + num, + reinterpret_cast(stream)); } else { platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU", 
platform::TracerEventType::UserDefined, 1); - platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, - num); + platform::GpuMemcpyPeerSync( + dst, dst_place.device, src, src_place.device, num); } } } template <> void Copy( - platform::CPUPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + platform::CPUPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -734,8 +868,11 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CPUPlace src_place, const void* src, size_t num) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -744,8 +881,11 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num) { VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (UNLIKELY(num == 0)) return; @@ -754,8 +894,12 @@ void Copy( template <> void Copy( - platform::CUDAPinnedPlace dst_place, void* dst, - platform::CUDAPlace src_place, const void* src, size_t num, void* stream) { + platform::CUDAPinnedPlace dst_place, + void* dst, + platform::CUDAPlace src_place, + const void* src, + size_t num, + void* stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -765,10 +909,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyDeviceToHost, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyDeviceToHost, reinterpret_cast(stream)); #endif } else { @@ -785,8 +935,11 @@ void Copy( template <> void Copy( - platform::CUDAPlace dst_place, void* dst, - platform::CUDAPinnedPlace src_place, const void* src, size_t num, + platform::CUDAPlace dst_place, + void* dst, + platform::CUDAPinnedPlace src_place, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -798,10 +951,16 @@ void Copy( platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP - platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + hipMemcpyHostToDevice, reinterpret_cast(stream)); #else - platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, + platform::GpuMemcpyAsync(dst, + src, + num, + cudaMemcpyHostToDevice, reinterpret_cast(stream)); #endif } else { @@ -818,9 +977,12 @@ void Copy( // NOTE: only for CPUPlace、CUDAPlace and CUDAPinnedPlace. 
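// ----------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] The GPU Copy specializations
// reformatted above all branch the same way: when a stream is supplied, the
// copy is enqueued asynchronously under a scoped profiler event; otherwise the
// device context is drained first and a blocking copy is used. A compact
// sketch of that control flow with hypothetical stand-ins (ScopedEvent,
// MemcpyAsync, MemcpySync, WaitDevice are placeholders, not Paddle's or
// CUDA's real API):
#include <cstddef>
#include <cstring>
#include <string>

namespace sketch_stream_branch {

// Stands in for platform::RecordEvent: begins a profiler range on
// construction and ends it when the scope closes.
struct ScopedEvent {
  explicit ScopedEvent(const std::string& name) : name_(name) {}
  std::string name_;
};

// Placeholders for the async and blocking memcpy entry points.
inline void MemcpyAsync(void* dst, const void* src, std::size_t num,
                        void* /*stream*/) {
  std::memcpy(dst, src, num);
}
inline void MemcpySync(void* dst, const void* src, std::size_t num) {
  std::memcpy(dst, src, num);
}
inline void WaitDevice() {}  // placeholder for DeviceContext::Wait()

// The recurring branch shown in the hunks above.
inline void CopyDeviceToHost(void* dst, const void* src, std::size_t num,
                             void* stream) {
  if (num == 0) return;
  if (stream != nullptr) {
    // Asynchronous path: the event brackets only the enqueue of the copy.
    ScopedEvent event("MemcpyAsync:GPU->CPU");
    MemcpyAsync(dst, src, num, stream);
  } else {
    // Synchronous path: drain outstanding work, then block on the copy.
    WaitDevice();
    ScopedEvent event("MemcpySync:GPU->CPU");
    MemcpySync(dst, src, num);
  }
}

}  // namespace sketch_stream_branch
// ----------------------------------------------------------------------------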
template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -887,52 +1049,76 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace). template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPlace) template <> -void Copy(phi::GPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::GPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (CUDAPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::GPUPlace src_place, const void* src, - size_t num, void* stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::GPUPlace src_place, + const void* src, + size_t num, + void* stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPinnedPlace) template <> void Copy(phi::GPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num, + void* dst, + phi::Place src_place, + const void* src, + size_t num, void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CUDAPinnedPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace) template <> -void Copy(phi::Place dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::GPUPinnedPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -940,16 +1126,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace) -> (CUDAPinnedPlace) template <> void Copy(phi::GPUPinnedPlace dst_place, - void* dst, phi::Place src_place, - const void* src, size_t num) { + void* dst, + phi::Place src_place, + const void* src, + size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr); } // NOTE: only for (CUDAPinnedPlace) -> (CPUPlace) template <> -void Copy(phi::Place 
dst_place, void* dst, +void Copy(phi::Place dst_place, + void* dst, phi::GPUPinnedPlace src_place, - const void* src, size_t num) { + const void* src, + size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr); } #endif @@ -959,7 +1149,8 @@ template <> void Copy(platform::CPUPlace dst_place, void* dst, platform::MLUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -970,8 +1161,8 @@ void Copy(platform::CPUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyD2HAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyD2HAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -988,7 +1179,8 @@ template <> void Copy(platform::MLUPlace dst_place, void* dst, platform::CPUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -999,8 +1191,8 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyH2DAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyH2DAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); @@ -1017,7 +1209,8 @@ template <> void Copy(platform::MLUPlace dst_place, void* dst, platform::MLUPlace src_place, - const void* src, size_t num, + const void* src, + size_t num, void* stream) { if (UNLIKELY(num == 0)) return; @@ -1029,8 +1222,8 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyD2DAsync(dst, src, num, - reinterpret_cast(stream)); + platform::MLUMemcpyD2DAsync( + dst, src, num, reinterpret_cast(stream)); } else { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); @@ -1050,25 +1243,32 @@ void Copy(platform::MLUPlace dst_place, platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, - num, reinterpret_cast(stream)); + platform::MLUMemcpyPeerAsync(dst, + dst_place.device, + src, + src_place.device, + num, + reinterpret_cast(stream)); } else { VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU", platform::TracerEventType::UserDefined, 1); - platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device, - num); + platform::MLUMemcpyPeerSync( + dst, dst_place.device, src, src_place.device, num); } } } // NOTE: only for CPUPlace and MLUPlace. 
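// ----------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] The functional core of this
// patch is the bookkeeping added in the stat_allocator.h, pinned_allocator.cc
// and system_allocator.cc hunks earlier: on every allocation or free, a host
// or device byte counter is updated depending on the place, and a memory
// tracer event is emitted for the pointer. A minimal standalone sketch of
// that pattern, using invented names (StatCounter, RecordMemEvent, Place are
// stand-ins, not Paddle's HOST/DEVICE_MEMORY_STAT_UPDATE macros or API):
#include <cstddef>
#include <cstdint>

namespace sketch_mem_stats {

enum class DeviceKind { HOST, DEVICE };
enum class MemEventType { Allocate, Free };

// Stand-in counters for the host/device memory stats.
inline std::int64_t& StatCounter(DeviceKind kind) {
  static std::int64_t host_bytes = 0;
  static std::int64_t device_bytes = 0;
  return kind == DeviceKind::HOST ? host_bytes : device_bytes;
}

// Stand-in for the memory tracer; a real tracer would record an event here.
inline void RecordMemEvent(void* /*ptr*/, std::size_t /*size*/,
                           MemEventType /*type*/) {}

struct Place {
  bool is_host;  // CPU and CUDA-pinned memory both count as host memory
  int device_id;
};

// Pick the host or device counter based on the place, then emit an event.
inline void OnAllocate(const Place& place, void* ptr, std::size_t size) {
  DeviceKind kind = place.is_host ? DeviceKind::HOST : DeviceKind::DEVICE;
  StatCounter(kind) += static_cast<std::int64_t>(size);
  RecordMemEvent(ptr, size, MemEventType::Allocate);
}

inline void OnFree(const Place& place, void* ptr, std::size_t size) {
  DeviceKind kind = place.is_host ? DeviceKind::HOST : DeviceKind::DEVICE;
  StatCounter(kind) -= static_cast<std::int64_t>(size);
  RecordMemEvent(ptr, size, MemEventType::Free);
}

}  // namespace sketch_mem_stats
// ----------------------------------------------------------------------------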
template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && dst_place.GetType() == phi::AllocationType::CPU) { platform::CPUPlace place_dst, place_src; @@ -1110,35 +1310,55 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace) template <> -void Copy(phi::MLUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { - Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place, - src, num, stream); +void Copy(phi::MLUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { + Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), + dst, + src_place, + src, + num, + stream); } // NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace) template <> -void Copy(phi::Place dst_place, void* dst, - phi::MLUPlace src_place, const void* src, - size_t num, void* stream) { - Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()), - src, num, stream); +void Copy(phi::Place dst_place, + void* dst, + phi::MLUPlace src_place, + const void* src, + size_t num, + void* stream) { + Copy(dst_place, + dst, + phi::Place(src_place.GetType(), src_place.GetDeviceId()), + src, + num, + stream); } // NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream. template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream. template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } @@ -1146,8 +1366,10 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: Only for CPUPlace, XPUPlace and PinnedPlace. template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " @@ -1224,16 +1446,20 @@ void Copy(phi::Place dst_place, void* dst, // NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, size_t num) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num); } // NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace). 
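// ----------------------------------------------------------------------------
// [Editor's illustration, not part of the patch] The concrete-place overloads
// reformatted above (MLUPlace, NPUPlace, GPUPlace, ...) contain no copy logic
// of their own: each rebuilds a generic Place that preserves the allocation
// type and device id, then delegates to the single generic-Place
// implementation. A hypothetical minimal sketch of that forwarding (Place,
// MluPlace and the Copy signatures are simplified stand-ins, not Paddle's
// real types):
#include <cstddef>
#include <cstring>

namespace sketch_place_forwarding {

enum class AllocationType { CPU, MLU };

// Generic place: carries the allocation type and the device id.
struct Place {
  AllocationType type;
  int device;
  explicit Place(AllocationType t, int d = 0) : type(t), device(d) {}
  AllocationType GetType() const { return type; }
  int GetDeviceId() const { return device; }
};

// Concrete device place, analogous to phi::MLUPlace in the hunks above.
struct MluPlace {
  int device;
  AllocationType GetType() const { return AllocationType::MLU; }
  int GetDeviceId() const { return device; }
};

// The one generic implementation that actually inspects both places.
inline void Copy(const Place& /*dst_place*/, void* dst,
                 const Place& /*src_place*/, const void* src, std::size_t num,
                 void* /*stream*/) {
  if (num == 0) return;
  std::memcpy(dst, src, num);  // placeholder; real code dispatches per device
}

// Concrete-place overload: rebuild a generic Place that keeps the device id
// and forward, mirroring the reformatted MLUPlace/GPUPlace specializations.
inline void Copy(const MluPlace& dst_place, void* dst, const Place& src_place,
                 const void* src, std::size_t num, void* stream) {
  Copy(Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place,
       src, num, stream);
}

}  // namespace sketch_place_forwarding
// ----------------------------------------------------------------------------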
template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, size_t num) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num); } @@ -1243,9 +1469,12 @@ void Copy(phi::CPUPlace dst_place, void* dst, !defined(PADDLE_WITH_MLU) template <> -void Copy(phi::Place dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::CUSTOM) { platform::CPUPlace place_src; @@ -1265,17 +1494,23 @@ void Copy(phi::Place dst_place, void* dst, } template <> -void Copy(phi::CPUPlace dst_place, void* dst, - phi::Place src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::CPUPlace dst_place, + void* dst, + phi::Place src_place, + const void* src, + size_t num, + void* stream) { Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream); } // NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace). template <> -void Copy(phi::Place dst_place, void* dst, - phi::CPUPlace src_place, const void* src, - size_t num, void* stream) { +void Copy(phi::Place dst_place, + void* dst, + phi::CPUPlace src_place, + const void* src, + size_t num, + void* stream) { Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream); } #endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt old mode 100644 new mode 100755 index f29546c5210d9..5bd28e3a96307 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,229 +1,463 @@ -proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) +proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto + simple_threadpool) if(WITH_GPU) proto_library(external_error_proto SRCS external_error.proto) endif(WITH_GPU) -if (WITH_PYTHON) +if(WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) - add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) + add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E + touch __init__.py) add_dependencies(profiler_py_proto profiler_py_proto_init) - if (NOT WIN32) - add_custom_command(TARGET profiler_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler - COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler - COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + if(NOT WIN32) + add_custom_command( + TARGET profiler_py_proto + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMENT + "Copy generated python proto into directory paddle/fluid/proto/profiler." 
+ WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) else(NOT WIN32) - string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") - add_custom_command(TARGET profiler_py_proto POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler - COMMAND copy /Y *.py ${proto_dstpath} - COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + string(REPLACE "/" "\\" proto_dstpath + "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") + add_custom_command( + TARGET profiler_py_proto + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory + ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT + "Copy generated python proto into directory paddle/fluid/proto/profiler." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) endif() -cc_library(flags SRCS flags.cc DEPS gflags boost) -cc_library(denormal SRCS denormal.cc DEPS) +cc_library( + flags + SRCS flags.cc + DEPS gflags boost) +cc_library( + denormal + SRCS denormal.cc + DEPS) -cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) +cc_test( + errors_test + SRCS errors_test.cc + DEPS errors enforce) set(enforce_deps flags errors boost flags phi_enforce) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() -cc_library(enforce INTERFACE SRCS enforce.cc DEPS ${enforce_deps}) +cc_library( + enforce INTERFACE + SRCS enforce.cc + DEPS ${enforce_deps}) cc_library(monitor SRCS monitor.cc) -cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) +cc_test( + enforce_test + SRCS enforce_test.cc + DEPS stringpiece enforce) set(CPU_INFO_DEPS gflags glog enforce) -IF(WITH_XBYAK) - list(APPEND CPU_INFO_DEPS xbyak) -ENDIF() -cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) -cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) -cc_library(os_info SRCS os_info.cc DEPS enforce) -cc_test(os_info_test SRCS os_info_test.cc DEPS os_info) - -IF(WITH_GPU) - nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) -ELSE() - cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) -ENDIF() - -cc_library(place SRCS place.cc DEPS enforce boost phi_place) -cc_test(place_test SRCS place_test.cc DEPS place glog gflags) - -IF(WITH_MKLDNN) - set(MKLDNN_CTX_DEPS mkldnn) -ELSE() - set(MKLDNN_CTX_DEPS) -ENDIF() +if(WITH_XBYAK) + list(APPEND CPU_INFO_DEPS xbyak) +endif() +cc_library( + cpu_info + SRCS cpu_info.cc + DEPS ${CPU_INFO_DEPS}) +cc_test( + cpu_info_test + SRCS cpu_info_test.cc + DEPS cpu_info) +cc_library( + os_info + SRCS os_info.cc + DEPS enforce) +cc_test( + os_info_test + SRCS os_info_test.cc + DEPS os_info) + +if(WITH_GPU) + nv_library( + cuda_graph_with_memory_pool + SRCS cuda_graph_with_memory_pool.cc + DEPS device_context allocator_facade cuda_graph) +else() + cc_library( + cuda_graph_with_memory_pool + SRCS cuda_graph_with_memory_pool.cc + DEPS device_context allocator_facade) +endif() + +cc_library( + place + SRCS place.cc + DEPS enforce boost phi_place) +cc_test( + place_test + SRCS place_test.cc + DEPS place glog gflags) + +if(WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +else() + set(MKLDNN_CTX_DEPS) +endif() add_subdirectory(device) add_subdirectory(dynload) add_subdirectory(stream) -cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce) 
-cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper) +cc_library( + cpu_helper + SRCS cpu_helper.cc + DEPS cblas enforce) +cc_test( + cpu_helper_test + SRCS cpu_helper_test.cc + DEPS cpu_helper) set(dgc_deps "") -IF(WITH_DGC) - set(dgc_deps dgc) -ENDIF() - -IF(WITH_GPU OR WITH_ROCM) - set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) -ENDIF() - -IF(WITH_IPU) - set(IPU_CTX_DEPS ipu_info) -ELSE() - set(IPU_CTX_DEPS) -ENDIF(WITH_IPU) - -IF(WITH_ASCEND_CL) - set(NPU_CTX_DEPS npu_stream npu_info) -ENDIF() - -IF(WITH_MLU) - set(MLU_CTX_DEPS mlu_device_context) -ENDIF() - -IF(WITH_ASCEND_CL OR WITH_MLU) -cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) -ENDIF() - -IF(WITH_GPU) - nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) -ENDIF() -IF(WITH_ROCM) - hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) -ENDIF() - -IF(WITH_GPU OR WITH_ROCM) +if(WITH_DGC) + set(dgc_deps dgc) +endif() + +if(WITH_GPU OR WITH_ROCM) + set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) +endif() + +if(WITH_IPU) + set(IPU_CTX_DEPS ipu_info) +else() + set(IPU_CTX_DEPS) +endif(WITH_IPU) + +if(WITH_ASCEND_CL) + set(NPU_CTX_DEPS npu_stream npu_info) +endif() + +if(WITH_MLU) + set(MLU_CTX_DEPS mlu_device_context) +endif() + +if(WITH_ASCEND_CL OR WITH_MLU) + cc_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() + +if(WITH_GPU) + nv_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() +if(WITH_ROCM) + hip_library( + stream_callback_manager + SRCS stream_callback_manager.cc + DEPS simple_threadpool enforce) +endif() + +if(WITH_GPU OR WITH_ROCM) set(STREAM_CALLBACK_DEPS stream_callback_manager) -ELSEIF(WITH_ASCEND_CL) +elseif(WITH_ASCEND_CL) set(STREAM_CALLBACK_DEPS stream_callback_manager) -ELSE() +else() set(STREAM_CALLBACK_DEPS) -ENDIF() +endif() if(WITH_GLOO) - cc_library(gloo_context SRCS gloo_context.cc DEPS framework_proto gloo_wrapper enforce) + cc_library( + gloo_context + SRCS gloo_context.cc + DEPS framework_proto gloo_wrapper enforce) endif() -cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) +cc_library( + cudnn_workspace_helper + SRCS cudnn_workspace_helper.cc + DEPS boost) # seperate init from device_context to avoid cycle dependencies -cc_library(init SRCS init.cc DEPS device_context custom_kernel context_pool) +cc_library( + init + SRCS init.cc + DEPS device_context custom_kernel context_pool) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} - ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator) +cc_library( + device_context + SRCS device_context.cc + DEPS simple_threadpool + malloc + xxhash + ${STREAM_CALLBACK_DEPS} + place + phi_place + eigen3 + stringpiece + cpu_helper + cpu_info + framework_proto + ${IPU_CTX_DEPS} + ${GPU_CTX_DEPS} + ${NPU_CTX_DEPS} + ${MKLDNN_CTX_DEPS} + ${dgc_deps} + dlpack + cudnn_workspace_helper + ${XPU_CTX_DEPS} + ${MLU_CTX_DEPS} + eigen3 + cpu_context + generator) if(WITH_XPU) target_link_libraries(device_context xpu_context) 
endif() -cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) +cc_library( + collective_helper + SRCS collective_helper.cc gen_comm_id_helper.cc + DEPS framework_proto device_context enforce) if(WITH_ASCEND_CL) - target_link_libraries(collective_helper npu_collective_helper) + target_link_libraries(collective_helper npu_collective_helper) endif() if(WITH_CNCL) - target_link_libraries(collective_helper mlu_collective_helper) + target_link_libraries(collective_helper mlu_collective_helper) endif() if(WITH_GPU OR WITH_ROCM) - target_link_libraries(device_context gpu_info gpu_context phi_gpu_info) - target_link_libraries(device_context gpu_resource_pool) + target_link_libraries(device_context gpu_info gpu_context phi_gpu_info) + target_link_libraries(device_context gpu_resource_pool) endif() -if (WITH_CUSTOM_DEVICE) - target_link_libraries(device_context custom_context) +if(WITH_CUSTOM_DEVICE) + target_link_libraries(device_context custom_context) endif() if(WITH_ASCEND_CL) - target_link_libraries(device_context npu_resource_pool) + target_link_libraries(device_context npu_resource_pool) endif() if(WITH_MLU) - target_link_libraries(device_context mlu_resource_pool) + target_link_libraries(device_context mlu_resource_pool) endif() if(WITH_CUSTOM_DEVICE) - target_link_libraries(device_context custom_context) + target_link_libraries(device_context custom_context) endif() -cc_test(init_test SRCS init_test.cc DEPS device_context) +cc_test( + init_test + SRCS init_test.cc + DEPS device_context) # Manage all device event library set(DEVICE_EVENT_LIBS) -cc_library(device_event_base SRCS device_event_base.cc DEPS place enforce device_context op_registry) -set(DEVICE_EVENT_LIBS device_event_base CACHE INTERNAL "device event libs") - +cc_library( + device_event_base + SRCS device_event_base.cc + DEPS place enforce device_context op_registry) +set(DEVICE_EVENT_LIBS + device_event_base + CACHE INTERNAL "device event libs") if(WITH_GPU) - nv_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event_base) - set(DEVICE_EVENT_LIBS device_event_gpu CACHE INTERNAL "device event libs") - nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) - - nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) + nv_library( + device_event_gpu + SRCS device_event_gpu.cc + DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") + nv_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu) + + nv_test( + device_context_test + SRCS device_context_test.cu + DEPS device_context gpu_info) + nv_test( + transform_test + SRCS transform_test.cu + DEPS memory place device_context) endif() if(WITH_ROCM) - hip_library(device_event_gpu SRCS device_event_gpu.cc DEPS device_event_base) - set(DEVICE_EVENT_LIBS device_event_gpu CACHE INTERNAL "device event libs") - hip_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu) - - hip_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) - hip_test(transform_test SRCS transform_test.cu DEPS memory place device_context) + hip_library( + device_event_gpu + SRCS device_event_gpu.cc + DEPS device_event_base) + set(DEVICE_EVENT_LIBS + device_event_gpu + CACHE INTERNAL "device event libs") + hip_test( + device_event_test + SRCS device_event_test.cc + DEPS device_event_gpu) + + 
hip_test( + device_context_test + SRCS device_context_test.cu + DEPS device_context gpu_info) + hip_test( + transform_test + SRCS transform_test.cu + DEPS memory place device_context) endif() cc_library(timer SRCS timer.cc) -cc_test(timer_test SRCS timer_test.cc DEPS timer) - -cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto) -cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer) +cc_test( + timer_test + SRCS timer_test.cc + DEPS timer) + +cc_library( + lodtensor_printer + SRCS lodtensor_printer.cc + DEPS ddim + place + tensor + scope + lod_tensor + variable_helper + framework_proto) +cc_test( + lodtensor_printer_test + SRCS lodtensor_printer_test.cc + DEPS lodtensor_printer) add_subdirectory(profiler) -cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) +cc_library( + device_tracer + SRCS device_tracer.cc + DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) - nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler stats) - nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) + nv_library( + profiler + SRCS profiler.cc profiler.cu + DEPS os_info + device_tracer + gpu_info + enforce + dynload_cuda + new_profiler + stats + op_proto_maker + shape_inference) + nv_library( + device_memory_aligment + SRCS device_memory_aligment.cc + DEPS cpu_info gpu_info place) elseif(WITH_ROCM) - hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler stats) - hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place) + hip_library( + profiler + SRCS profiler.cc profiler.cu + DEPS os_info + device_tracer + gpu_info + enforce + new_profiler + stats + op_proto_maker + shape_inference) + hip_library( + device_memory_aligment + SRCS device_memory_aligment.cc + DEPS cpu_info gpu_info place) else() - cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler stats) - cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place) + cc_library( + profiler + SRCS profiler.cc + DEPS os_info + device_tracer + enforce + new_profiler + stats + op_proto_maker + shape_inference) + cc_library( + device_memory_aligment + SRCS device_memory_aligment.cc + DEPS cpu_info place) endif() -cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) -cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) -cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) -cc_test(complex_test SRCS complex_test.cc DEPS lod_tensor) +cc_test( + profiler_test + SRCS profiler_test.cc + DEPS profiler) +cc_test( + float16_test + SRCS float16_test.cc + DEPS lod_tensor) +cc_test( + bfloat16_test + SRCS bfloat16_test.cc + DEPS lod_tensor) +cc_test( + complex_test + SRCS complex_test.cc + DEPS lod_tensor) -IF(WITH_GPU) - nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) - nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor) - nv_test(complex_gpu_test SRCS complex_test.cu DEPS lod_tensor) - nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) - nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) -ENDIF() +if(WITH_GPU) + nv_test( + float16_gpu_test + SRCS float16_test.cu + DEPS lod_tensor) + nv_test( + bfloat16_gpu_test + SRCS bfloat16_test.cu + DEPS lod_tensor) 
+ nv_test( + complex_gpu_test + SRCS complex_test.cu + DEPS lod_tensor) + nv_test( + test_limit_gpu_memory + SRCS test_limit_gpu_memory.cu + DEPS gpu_info flags) + nv_library( + cuda_device_guard + SRCS cuda_device_guard.cc + DEPS gpu_info) +endif() -IF(WITH_ROCM) - hip_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) - hip_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) - hip_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) -ENDIF() +if(WITH_ROCM) + hip_test( + float16_gpu_test + SRCS float16_test.cu + DEPS lod_tensor) + hip_test( + test_limit_gpu_memory + SRCS test_limit_gpu_memory.cu + DEPS gpu_info flags) + hip_library( + cuda_device_guard + SRCS cuda_device_guard.cc + DEPS gpu_info) +endif() if(NOT APPLE AND NOT WIN32) - cc_library(device_code SRCS device_code.cc DEPS device_context) + cc_library( + device_code + SRCS device_code.cc + DEPS device_context) if(WITH_GPU OR WITH_ROCM) - cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor) + cc_test( + device_code_test + SRCS device_code_test.cc + DEPS device_code lod_tensor) endif() endif() diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index 89e3b74bb3aca..179c04b75f8f4 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -27,6 +27,7 @@ limitations under the License. */ #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/gpu/gpu_info.h" @@ -49,6 +50,15 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); DECLARE_bool(enable_cublas_tensor_op_math); DECLARE_uint64(gpu_memory_limit_mb); +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, + false, + "Whether to print the message of gpu memory usage " + "at exit, mainly used for UT and CI."); +PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, + true, + "Whether to print the message of gpu memory usage " + "MB as a unit of measurement."); + constexpr static float fraction_reserve_gpu_memory = 0.05f; USE_GPU_MEM_STAT; @@ -57,7 +67,10 @@ namespace platform { void GpuMemoryUsage(size_t *available, size_t *total) { size_t actual_available, actual_total; - RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total, + RecordedGpuMemGetInfo(available, + total, + &actual_available, + &actual_total, platform::GetCurrentDeviceId()); } @@ -85,17 +98,20 @@ size_t GpuMaxAllocSize() { static size_t GpuAllocSize(bool realloc) { size_t available_to_alloc = GpuAvailableMemToAlloc(); PADDLE_ENFORCE_GT( - available_to_alloc, 0, + available_to_alloc, + 0, platform::errors::ResourceExhausted("Not enough available GPU memory.")); // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be // allocated by fraction size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb : FLAGS_initial_gpu_memory_in_mb; size_t alloc_bytes = - (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * - FLAGS_fraction_of_gpu_memory_to_use); + (flag_mb > 0ul + ? 
flag_mb << 20 + : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use); PADDLE_ENFORCE_GE( - available_to_alloc, alloc_bytes, + available_to_alloc, + alloc_bytes, platform::errors::ResourceExhausted("Not enough available GPU memory.")); VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) << " MiB, is it Re-alloc: " << realloc; @@ -153,13 +169,16 @@ class RecordedGpuMallocHelper { }); PADDLE_ENFORCE_GE( - dev_id, 0, + dev_id, + 0, platform::errors::OutOfRange( "Device id must be not less than 0, but got %d.", dev_id)); PADDLE_ENFORCE_LT( - dev_id, instances_.size(), + dev_id, + instances_.size(), platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.", - dev_id, instances_.size())); + dev_id, + instances_.size())); return instances_[dev_id].get(); } @@ -168,7 +187,8 @@ class RecordedGpuMallocHelper { * or cudaSuccess would be returned, and the cudaGetLastError() flag * would be clear. */ - gpuError_t Malloc(void **ptr, size_t size, + gpuError_t Malloc(void **ptr, + size_t size, bool malloc_managed_memory = false) { LockGuardPtr lock(mtx_); if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) { @@ -196,8 +216,11 @@ class RecordedGpuMallocHelper { if (result == gpuSuccess) { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, size); - + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size); + platform::RecordMemEvent(ptr, + GPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedAllocate); #ifdef PADDLE_WITH_TESTING gpu_ptrs.insert(*ptr); #endif @@ -235,7 +258,11 @@ class RecordedGpuMallocHelper { PADDLE_ENFORCE_GPU_SUCCESS(err); cur_size_.fetch_sub(size); STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); - MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size); + platform::RecordMemEvent(ptr, + GPUPlace(dev_id_), + size, + platform::TracerMemEventType::ReservedFree); } else { platform::GpuGetLastError(); // clear the error flag when // cudaErrorCudartUnloading / @@ -261,7 +288,9 @@ class RecordedGpuMallocHelper { #endif } - bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, + bool GetMemInfo(size_t *avail, + size_t *total, + size_t *actual_avail, size_t *actual_total) { { CUDADeviceGuard guard(dev_id_); @@ -296,7 +325,8 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 - CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size, + CUresult MemCreate(CUmemGenericAllocationHandle *handle, + size_t size, const CUmemAllocationProp *prop, unsigned long long flags) { // NOLINT auto result = @@ -335,7 +365,9 @@ std::once_flag RecordedGpuMallocHelper::once_flag_; std::vector> RecordedGpuMallocHelper::instances_; -gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id, +gpuError_t RecordedGpuMalloc(void **ptr, + size_t size, + int dev_id, bool malloc_managed_memory) { return RecordedGpuMallocHelper::Instance(dev_id)->Malloc( ptr, size, malloc_managed_memory); @@ -347,22 +379,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) { #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10020 -CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, +CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, + size_t size, const CUmemAllocationProp *prop, - unsigned long long flags, int dev_id) { // NOLINT - return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size, - prop, flags); + unsigned 
long long flags, + int dev_id) { // NOLINT + return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate( + handle, size, prop, flags); } -CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size, +CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, + size_t size, int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size); } #endif #endif -bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, - size_t *actual_total, int dev_id) { +bool RecordedGpuMemGetInfo(size_t *avail, + size_t *total, + size_t *actual_avail, + size_t *actual_total, + int dev_id) { return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo( avail, total, actual_avail, actual_total); } @@ -457,26 +495,35 @@ void GpuDestroyStream(gpuStream_t stream) { void GpuDeviceSync() { phi::backends::gpu::GpuDeviceSync(); } -void GpuMemcpyAsync(void *dst, const void *src, size_t count, - gpuMemcpyKind kind, gpuStream_t stream) { +void GpuMemcpyAsync(void *dst, + const void *src, + size_t count, + gpuMemcpyKind kind, + gpuStream_t stream) { phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream); } -void GpuMemcpySync(void *dst, const void *src, size_t count, +void GpuMemcpySync(void *dst, + const void *src, + size_t count, gpuMemcpyKind kind) { phi::backends::gpu::GpuMemcpySync(dst, src, count, kind); } -void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src, - int src_device, size_t count, gpuStream_t stream) { - phi::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device, - count, stream); +void GpuMemcpyPeerAsync(void *dst, + int dst_device, + const void *src, + int src_device, + size_t count, + gpuStream_t stream) { + phi::backends::gpu::GpuMemcpyPeerAsync( + dst, dst_device, src, src_device, count, stream); } -void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src, - int src_device, size_t count) { - phi::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device, - count); +void GpuMemcpyPeerSync( + void *dst, int dst_device, const void *src, int src_device, size_t count) { + phi::backends::gpu::GpuMemcpyPeerSync( + dst, dst_device, src, src_device, count); } void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 8fa48ffcfb158..6467c4fdc403c 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -29,12 +29,16 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/dynload/nvtx.h" #endif +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/os_info.h" -PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false, +PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, + false, "Enable rpc profiler or not."); -DEFINE_bool(enable_host_event_recorder_hook, false, +DEFINE_bool(enable_host_event_recorder_hook, + false, "enable HostEventRecorder, hook Profiler"); namespace paddle { @@ -42,8 +46,11 @@ namespace platform { MemEvenRecorder MemEvenRecorder::recorder; -Event::Event(EventType type, std::string name, uint32_t thread_id, - EventRole role, std::string attr) +Event::Event(EventType type, + std::string name, + uint32_t thread_id, + EventRole role, + std::string attr) : type_(type), name_(name), thread_id_(thread_id), @@ -67,8 +74,10 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -RecordEvent::RecordEvent(const char *name, const TracerEventType type, - uint32_t level, const EventRole role) { +RecordEvent::RecordEvent(const char *name, + const TracerEventType type, + uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -99,8 +108,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type, start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, - uint32_t level, const EventRole role) { +RecordEvent::RecordEvent(const std::string &name, + const TracerEventType type, + uint32_t level, + const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -129,8 +140,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type, start_ns_ = PosixInNsec(); } -RecordEvent::RecordEvent(const std::string &name, const std::string &attr, - const TracerEventType type, uint32_t level, +RecordEvent::RecordEvent(const std::string &name, + const std::string &attr, + const TracerEventType type, + uint32_t level, const EventRole role) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA @@ -195,11 +208,11 @@ void RecordEvent::End() { shallow_copy_name_, start_ns_, end_ns, role_, type_); } else if (name_ != nullptr) { if (attr_ == nullptr) { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, type_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, type_); } else { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, type_, *attr_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, type_, *attr_); delete attr_; } delete name_; @@ -214,8 +227,8 @@ void RecordEvent::End() { DeviceTracer *tracer = GetDeviceTracer(); if (tracer) { uint64_t end_ns = PosixInNsec(); - tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(), - g_thread_id); + tracer->AddCPURecords( + CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id); } ClearCurAnnotation(); PopEvent(*name_, role_); @@ -225,30 +238,217 @@ void RecordEvent::End() { is_enabled_ = false; } -RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, +RecordInstantEvent::RecordInstantEvent(const char *name, + TracerEventType type, uint32_t level) { if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) { return; } auto start_end_ns = PosixInNsec(); - HostEventRecorder::GetInstance().RecordEvent(name, start_end_ns, start_end_ns, - 
EventRole::kOrdinary, type); + HostEventRecorder::GetInstance().RecordEvent( + name, start_end_ns, start_end_ns, EventRole::kOrdinary, type); +} + +RecordOpInfoSupplement::RecordOpInfoSupplement( + const std::string &type, + const framework::AttributeMap &attrs, + const framework::InferShapeContext &shape_ctx, + const framework::RuntimeContext &ctx) { + if (FLAGS_enable_host_event_recorder_hook == false) { + return; + } + std::map> input_shapes; + std::map> dtypes; + for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) { + input_shapes[it->first] = shape_ctx.GetInputsDim(it->first); + dtypes[it->first] = shape_ctx.GetInputsVarType(it->first); + } + + const std::vector *callstack_ptr = nullptr; + std::vector callstack; + auto iter = attrs.find( + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + if (iter != attrs.end()) { + callstack_ptr = &BOOST_GET_CONST(std::vector, iter->second); + callstack = *callstack_ptr; + } + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), type, input_shapes, dtypes, callstack); +} + +RecordMemEvent::RecordMemEvent(const void *ptr, + const phi::Place &place, + size_t size, + const TracerMemEventType type) { + if (g_state == ProfilerState::kDisabled && + FLAGS_enable_host_event_recorder_hook == false) { + return; + } + if (type == TracerMemEventType::Allocate) { + uint64_t current_allocated; + uint64_t peak_allocated; + uint64_t current_reserved = 0; // 0 means keep the same as before + uint64_t peak_reserved = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } else { + current_allocated = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PushMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::ReservedAllocate) { + uint64_t current_reserved; + uint64_t peak_reserved; + uint64_t current_allocated = 0; // 0 means keep the same as before + uint64_t peak_allocated = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PushMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::Free) { + uint64_t current_allocated; + uint64_t peak_allocated; + uint64_t current_reserved = 0; // 0 means keep the same as before + uint64_t peak_reserved = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_allocated = + HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } else { + current_allocated = + 
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId()); + peak_allocated = + DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PopMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } else if (type == TracerMemEventType::ReservedFree) { + uint64_t current_reserved; + uint64_t peak_reserved; + uint64_t current_allocated = 0; // 0 means keep the same as before + uint64_t peak_allocated = 0; // 0 means keep the same as before + if (platform::is_cpu_place(place) || + platform::is_cuda_pinned_place(place)) { + current_reserved = + HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } else { + current_reserved = + DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId()); + peak_reserved = + DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId()); + } + + platform::MemEvenRecorder::Instance().PopMemRecord(ptr, + place, + size, + type, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + } } -void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, +void MemEvenRecorder::PushMemRecord(const void *ptr, + const Place &place, size_t size) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled) { + return; + } std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; - PADDLE_ENFORCE_EQ(events.count(ptr), 0, + PADDLE_ENFORCE_EQ(events.count(ptr), + 0, platform::errors::InvalidArgument( "The Place can't exist in the stage of PushMemRecord")); - events.emplace(ptr, std::unique_ptr( - new MemEvenRecorder::RecordMemEvent(place, size))); + events.emplace(ptr, + std::unique_ptr( + new MemEvenRecorder::RecordMemEvent(place, size))); +} + +void MemEvenRecorder::PushMemRecord(const void *ptr, + const Place &place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) { + std::lock_guard guard(mtx_); + if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), + reinterpret_cast(ptr), + type, + size, + place, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + return; + } + if (type == TracerMemEventType::ReservedAllocate) { + // old profiler only analyse memory managed by paddle. 
+ return; + } + if (g_state == ProfilerState::kDisabled) return; + auto &events = address_memevent_[place]; + PADDLE_ENFORCE_EQ(events.count(ptr), + 0, + platform::errors::InvalidArgument( + "The Place can't exist in the stage of PushMemRecord")); + events.emplace(ptr, + std::unique_ptr( + new MemEvenRecorder::RecordMemEvent(place, size))); } void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { - if (g_state == ProfilerState::kDisabled) return; + if (g_state == ProfilerState::kDisabled) { + return; + } std::lock_guard guard(mtx_); auto &events = address_memevent_[place]; auto iter = events.find(ptr); @@ -258,6 +458,41 @@ void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) { } } +void MemEvenRecorder::PopMemRecord(const void *ptr, + const Place &place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) { + std::lock_guard guard(mtx_); + if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord + HostEventRecorder::GetInstance().RecordEvent( + PosixInNsec(), + reinterpret_cast(ptr), + type, + -size, + place, + current_allocated, + current_reserved, + peak_allocated, + peak_reserved); + return; + } + if (type == TracerMemEventType::ReservedFree) { + // old profiler only analyse memory managed by paddle. + return; + } + if (g_state == ProfilerState::kDisabled) return; + auto &events = address_memevent_[place]; + auto iter = events.find(ptr); + // The ptr maybe not in address_memevent + if (iter != events.end()) { + events.erase(iter); + } +} + void MemEvenRecorder::Flush() { std::lock_guard guard(mtx_); address_memevent_.clear(); @@ -278,8 +513,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() { auto annotation_free = CurAnnotationName(); if (tracer) { - tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_, - annotation_free, g_mem_thread_id); + tracer->AddMemInfoRecord(start_ns_, + end_ns_, + bytes_, + place_, + alloc_in_, + annotation_free, + g_mem_thread_id); } PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free); } @@ -306,22 +546,38 @@ RecordBlock::~RecordBlock() { if (tracer) { // We try to put all blocks at the same nested depth in the // same timeline lane. and distinguish the using thread_id. 
- tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), - g_thread_id); + tracer->AddCPURecords( + name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id); } ClearCurBlock(); } -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, - place, g_mem_thread_id, annotation); -} - -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place &place, const std::string &annotation) { - GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, - g_mem_thread_id, annotation); +void PushMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place &place, + const std::string &annotation) { + GetMemEventList().Record(EventType::kPushRange, + start_ns, + end_ns, + bytes, + place, + g_mem_thread_id, + annotation); +} + +void PopMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place &place, + const std::string &annotation) { + GetMemEventList().Record(EventType::kPopRange, + start_ns, + end_ns, + bytes, + place, + g_mem_thread_id, + annotation); } void Mark(const std::string &name) { @@ -333,17 +589,19 @@ void Mark(const std::string &name) { GetEventList().Record(EventType::kMark, name, g_thread_id); } -Event *PushEvent(const std::string &name, const EventRole role, +Event *PushEvent(const std::string &name, + const EventRole role, std::string attr) { - return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role, - attr); + return GetEventList().Record( + EventType::kPushRange, name, g_thread_id, role, attr); } void PopEvent(const std::string &name, const EventRole role, std::string attr) { GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr); } void EnableProfiler(ProfilerState state) { - PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, + PADDLE_ENFORCE_NE(state, + ProfilerState::kDisabled, platform::errors::InvalidArgument( "Can't enable profiling, since the input state is" "ProfilerState::kDisabled")); @@ -379,7 +637,8 @@ void ResetProfiler() { (*it)->Clear(); } for (auto it = g_all_mem_event_lists.begin(); - it != g_all_mem_event_lists.end(); ++it) { + it != g_all_mem_event_lists.end(); + ++it) { (*it)->Clear(); } } @@ -573,8 +832,8 @@ static void EmulateEventPushAndPop(const HostEventSection &host_sec, std::string name = prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name; const char *attr = (evt.attr == nullptr ? "none" : evt.attr); - Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid, - evt.role, attr); + Event *orig_evt = cur_thr_list->Record( + EventType::kPushRange, name, tid, evt.role, attr); (*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns); cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr); } @@ -589,8 +848,8 @@ static void EmulateCPURecordsAdd(const HostEventSection &host_sec) { for (const auto &thr_sec : host_sec.thr_sections) { uint64_t tid = thr_sec.thread_id; for (const auto &evt : thr_sec.events) { - tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(), - tid); + tracer->AddCPURecords( + evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid); } } } diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 78275341cbbf7..4773b1a177ba0 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -30,6 +30,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/platform/profiler/mem_tracing.h" +#include "paddle/fluid/platform/profiler/supplement_tracing.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -102,6 +104,22 @@ struct MemEvenRecorder { public: void PushMemRecord(const void* ptr, const Place& place, size_t size); void PopMemRecord(const void* ptr, const Place& place); + void PushMemRecord(const void* ptr, + const Place& place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved); + void PopMemRecord(const void* ptr, + const Place& place, + size_t size, + TracerMemEventType type, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved); void Flush(); static MemEvenRecorder& Instance() { return recorder; } @@ -160,7 +178,8 @@ struct EventList { std::vector Reduce() { std::vector result; for (auto& block : event_blocks) { - result.insert(result.begin(), std::make_move_iterator(block.begin()), + result.insert(result.begin(), + std::make_move_iterator(block.begin()), std::make_move_iterator(block.end())); } event_blocks.clear(); @@ -173,13 +192,21 @@ struct EventList { }; void Mark(const std::string& name); -void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place, const std::string& annotation); -void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, - const Place& place, const std::string& annotation); -Event* PushEvent(const std::string& name, const EventRole role, +void PushMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place& place, + const std::string& annotation); +void PopMemEvent(uint64_t start_ns, + uint64_t end_ns, + size_t bytes, + const Place& place, + const std::string& annotation); +Event* PushEvent(const std::string& name, + const EventRole role, const std::string attr = "none"); -void PopEvent(const std::string& name, const EventRole role, +void PopEvent(const std::string& name, + const EventRole role, const std::string attr = "none"); // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. 
diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt index 084bc44dbc78b..1daed7db1e701 100755 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,14 +1,52 @@ -cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) -cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) +cc_library( + host_tracer + SRCS host_tracer.cc + DEPS enforce ddim var_type_traits) +cc_library( + cuda_tracer + SRCS cuda_tracer.cc cupti_data_process.cc + DEPS workqueue_utils enforce glog) add_subdirectory(mlu) -cc_library(event_node SRCS event_node.cc DEPS enforce) -cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) +cc_library( + event_node + SRCS event_node.cc + DEPS enforce place) +cc_library( + profiler_utils + SRCS utils.cc + DEPS enforce glog) add_subdirectory(dump) -cc_library(profiler_logger SRCS chrometracing_logger.cc dump/serialization_logger.cc dump/deserialization_reader.cc DEPS nodetreeproto event_node profiler_utils) -cc_library(event_bind SRCS event_python.cc DEPS profiler_logger) -cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind mlu_tracer) -cc_test(test_event_node SRCS test_event_node.cc DEPS event_node profiler_logger) -cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) -cc_test(test_serialization_logger SRCS dump/test_serialization_logger.cc DEPS event_bind) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler) +cc_library( + profiler_logger + SRCS chrometracing_logger.cc dump/serialization_logger.cc + dump/deserialization_reader.cc + DEPS nodetreeproto event_node profiler_utils) +cc_library( + event_bind + SRCS event_python.cc + DEPS profiler_logger) +cc_library( + cpu_utilization + SRCS cpu_utilization.cc + DEPS cpu_info os_info enforce glog) +cc_library( + new_profiler + SRCS profiler.cc + DEPS host_tracer cuda_tracer profiler_utils cpu_utilization event_bind + mlu_tracer) +cc_test( + test_event_node + SRCS test_event_node.cc + DEPS event_node profiler_logger) +cc_test( + test_extra_info + SRCS test_extra_info.cc + DEPS profiler_utils) +cc_test( + test_serialization_logger + SRCS dump/test_serialization_logger.cc + DEPS event_bind) +cc_test( + new_profiler_test + SRCS profiler_test.cc + DEPS new_profiler) diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 1e26c0a94408c..e8fe541272137 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include #include +#include #include "glog/logging.h" @@ -128,27 +129,32 @@ void ChromeTracingLogger::LogMemTraceEventNode( std::string( R"JSON( { - "name": "[memory]", "pid": %lld, "tid": "%lld", + "name": "[memory]", "pid": %lld, "tid": "%lld(C++)", "ts": %lld, "ph": "i", "cat": "%s", "args": { "place": "%s", "addr": "%llu", + "increase_bytes": %lld, "current_allocated": %llu, "current_reserved": %llu, - "increase_bytes": %lld + "peak_allocated": %llu, + "peak_reserved": %llu } }, )JSON"), mem_node.ProcessId(), mem_node.ThreadId(), - mem_node.TimeStampNs(), + nsToUs(mem_node.TimeStampNs()), StringTracerMemEventType(mem_node.Type()), mem_node.Place().c_str(), mem_node.Addr(), + mem_node.IncreaseBytes(), mem_node.CurrentAllocated(), mem_node.CurrentReserved(), - mem_node.IncreaseBytes()); + mem_node.PeakAllocated(), + mem_node.PeakReserved()); + pid_tid_set_.insert({mem_node.ProcessId(), mem_node.ThreadId()}); } void ChromeTracingLogger::LogHostTraceEventNode( @@ -172,6 +178,8 @@ void ChromeTracingLogger::LogHostTraceEventNode( input_shapes = op_supplement_node->InputShapes(); input_dtypes = op_supplement_node->Dtypes(); callstack = op_supplement_node->CallStack(); + callstack = std::regex_replace(callstack, std::regex("\""), "\'"); + callstack = std::regex_replace(callstack, std::regex("\n"), "\\n"); } switch (host_node.Type()) { case TracerEventType::ProfileStep: diff --git a/paddle/fluid/platform/profiler/common_event.h b/paddle/fluid/platform/profiler/common_event.h index cfdc3be110a5b..3e166d1d04db9 100644 --- a/paddle/fluid/platform/profiler/common_event.h +++ b/paddle/fluid/platform/profiler/common_event.h @@ -17,16 +17,22 @@ #include #include #include + +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later #include "paddle/fluid/platform/profiler/trace_event.h" +#include "paddle/phi/core/ddim.h" namespace paddle { namespace platform { struct CommonEvent { public: - CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type) + CommonEvent(const char *name, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type) : name(name), start_ns(start_ns), end_ns(end_ns), @@ -34,8 +40,12 @@ struct CommonEvent { type(type) {} CommonEvent(std::function arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type, const std::string &attr_str) + const std::string &name_str, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type, + const std::string &attr_str) : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { auto buf = static_cast(arena_allocator(name_str.length() + 1)); strncpy(buf, name_str.c_str(), name_str.length() + 1); @@ -46,8 +56,11 @@ struct CommonEvent { } CommonEvent(std::function arena_allocator, - const std::string &name_str, uint64_t start_ns, uint64_t end_ns, - EventRole role, TracerEventType type) + const std::string &name_str, + uint64_t start_ns, + uint64_t end_ns, + EventRole role, + TracerEventType type) : start_ns(start_ns), end_ns(end_ns), role(role), type(type) { auto buf = static_cast(arena_allocator(name_str.length() + 1)); strncpy(buf, name_str.c_str(), name_str.length() + 1); @@ -62,5 +75,61 @@ struct CommonEvent { const char *attr = nullptr; // not owned, designed for performance }; +struct CommonMemEvent { + public: + CommonMemEvent(uint64_t timestamp_ns, + uint64_t addr, + TracerMemEventType type, + int64_t 
increase_bytes, + const Place &place, + uint64_t current_allocated, + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) + : timestamp_ns(timestamp_ns), + addr(addr), + type(type), + increase_bytes(increase_bytes), + place(place), + peak_allocated(peak_allocated), + peak_reserved(peak_reserved) {} + uint64_t timestamp_ns; + uint64_t addr; + TracerMemEventType type; + int64_t increase_bytes; + Place place; + uint64_t current_allocated; + uint64_t current_reserved; + uint64_t peak_allocated; + uint64_t peak_reserved; +}; + +struct OperatorSupplementOriginEvent { + public: + OperatorSupplementOriginEvent( + std::function arena_allocator, + uint64_t timestamp_ns, + const std::string &type_name, + const std::map> &input_shapes, + const std::map> + &dtypes, + const std::vector callstack) + : timestamp_ns(timestamp_ns), + input_shapes(input_shapes), + dtypes(dtypes), + callstack(callstack) { + auto buf = static_cast(arena_allocator(type_name.length() + 1)); + strncpy(buf, type_name.c_str(), type_name.length() + 1); + op_type = buf; + } + uint64_t timestamp_ns; + const char *op_type = nullptr; // not owned, designed for performance + // input shapes + std::map> input_shapes; + std::map> dtypes; + // call stack + const std::vector callstack; +}; + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index de3411579d3e9..d17aa9e9ce2aa 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -45,7 +45,8 @@ std::unique_ptr DeserializationReader::Parse() { ExtraInfo extrainfo; for (auto indx = 0; indx < node_trees_proto_->extra_info_size(); indx++) { ExtraInfoMap extra_info_map = node_trees_proto_->extra_info(indx); - extrainfo.AddExtraInfo(extra_info_map.key(), std::string("%s"), + extrainfo.AddExtraInfo(extra_info_map.key(), + std::string("%s"), extra_info_map.value().c_str()); } // restore NodeTrees @@ -90,6 +91,26 @@ std::unique_ptr DeserializationReader::Parse() { device_node); // insert into runtime_node } } + // handle mem node + for (int mem_node_index = 0; + mem_node_index < host_node_proto.mem_nodes_size(); + mem_node_index++) { + const MemTraceEventNodeProto& mem_node_proto = + host_node_proto.mem_nodes(mem_node_index); + MemTraceEventNode* mem_node = RestoreMemTraceEventNode(mem_node_proto); + host_node->AddMemNode(mem_node); + } + // handle op supplement node + for (int op_supplement_node_index = 0; + op_supplement_node_index < + host_node_proto.op_supplement_nodes_size(); + op_supplement_node_index++) { + const OperatorSupplementEventNodeProto& op_supplement_node_proto = + host_node_proto.op_supplement_nodes(op_supplement_node_index); + OperatorSupplementEventNode* op_supplement_node = + RestoreOperatorSupplementEventNode(op_supplement_node_proto); + host_node->SetOperatorSupplementNode(op_supplement_node); + } } // restore parent-child relationship for (auto it = child_parent_map.begin(); it != child_parent_map.end(); @@ -174,6 +195,64 @@ HostTraceEventNode* DeserializationReader::RestoreHostTraceEventNode( return new HostTraceEventNode(host_event); } +MemTraceEventNode* DeserializationReader::RestoreMemTraceEventNode( + const MemTraceEventNodeProto& mem_node_proto) { + const MemTraceEventProto& mem_event_proto = mem_node_proto.mem_event(); + MemTraceEvent mem_event; + mem_event.timestamp_ns = mem_event_proto.timestamp_ns(); + mem_event.addr 
= mem_event_proto.addr(); + mem_event.type = static_cast(mem_event_proto.type()); + mem_event.process_id = mem_event_proto.process_id(); + mem_event.thread_id = mem_event_proto.thread_id(); + mem_event.increase_bytes = mem_event_proto.increase_bytes(); + mem_event.place = mem_event_proto.place(); + mem_event.current_allocated = mem_event_proto.current_allocated(); + mem_event.current_reserved = mem_event_proto.current_reserved(); + mem_event.peak_allocated = mem_event_proto.peak_allocated(); + mem_event.peak_reserved = mem_event_proto.peak_reserved(); + return new MemTraceEventNode(mem_event); +} + +OperatorSupplementEventNode* +DeserializationReader::RestoreOperatorSupplementEventNode( + const OperatorSupplementEventNodeProto& op_supplement_node_proto) { + const OperatorSupplementEventProto& op_supplement_event_proto = + op_supplement_node_proto.op_supplement_event(); + OperatorSupplementEvent op_supplement_event; + op_supplement_event.timestamp_ns = op_supplement_event_proto.timestamp_ns(); + op_supplement_event.op_type = op_supplement_event_proto.op_type(); + op_supplement_event.callstack = op_supplement_event_proto.callstack(); + op_supplement_event.process_id = op_supplement_event_proto.process_id(); + op_supplement_event.thread_id = op_supplement_event_proto.thread_id(); + std::map>> input_shapes; + std::map> dtypes; + auto input_shape_proto = op_supplement_event_proto.input_shapes(); + for (int i = 0; i < input_shape_proto.key_size(); i++) { + auto input_shape_vec = input_shapes[input_shape_proto.key(i)]; + auto shape_vectors_proto = input_shape_proto.shape_vecs(i); + for (int j = 0; j < shape_vectors_proto.shapes_size(); j++) { + auto shape_vector_proto = shape_vectors_proto.shapes(j); + std::vector shape; + for (int k = 0; k < shape_vector_proto.size_size(); k++) { + shape.push_back(shape_vector_proto.size(k)); + } + input_shape_vec.push_back(shape); + } + } + op_supplement_event.input_shapes = input_shapes; + auto dtype_proto = op_supplement_event_proto.dtypes(); + for (int i = 0; i < dtype_proto.key_size(); i++) { + auto dtype_vec = dtypes[dtype_proto.key(i)]; + auto dtype_vec_proto = dtype_proto.dtype_vecs(i); + for (int j = 0; j < dtype_vec_proto.dtype_size(); j++) { + auto dtype_string = dtype_vec_proto.dtype(j); + dtype_vec.push_back(dtype_string); + } + } + op_supplement_event.dtypes = dtypes; + return new OperatorSupplementEventNode(op_supplement_event); +} + KernelEventInfo DeserializationReader::HandleKernelEventInfoProto( const DeviceTraceEventProto& device_event_proto) { const KernelEventInfoProto& kernel_info_proto = @@ -203,11 +282,14 @@ MemcpyEventInfo DeserializationReader::HandleMemcpyEventInfoProto( device_event_proto.memcpy_info(); MemcpyEventInfo memcpy_info; memcpy_info.num_bytes = memcpy_info_proto.num_bytes(); - std::strncpy(memcpy_info.copy_kind, memcpy_info_proto.copy_kind().c_str(), + std::strncpy(memcpy_info.copy_kind, + memcpy_info_proto.copy_kind().c_str(), kMemKindMaxLen - 1); - std::strncpy(memcpy_info.src_kind, memcpy_info_proto.src_kind().c_str(), + std::strncpy(memcpy_info.src_kind, + memcpy_info_proto.src_kind().c_str(), kMemKindMaxLen - 1); - std::strncpy(memcpy_info.dst_kind, memcpy_info_proto.dst_kind().c_str(), + std::strncpy(memcpy_info.dst_kind, + memcpy_info_proto.dst_kind().c_str(), kMemKindMaxLen - 1); return memcpy_info; } @@ -218,7 +300,8 @@ MemsetEventInfo DeserializationReader::HandleMemsetEventInfoProto( device_event_proto.memset_info(); MemsetEventInfo memset_info; memset_info.num_bytes = memset_info_proto.num_bytes(); - 
std::strncpy(memset_info.memory_kind, memset_info_proto.memory_kind().c_str(), + std::strncpy(memset_info.memory_kind, + memset_info_proto.memory_kind().c_str(), kMemKindMaxLen - 1); memset_info.value = memset_info_proto.value(); return memset_info; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index e6feb4f9489e8..7df93b7703c32 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -36,6 +36,9 @@ class DeserializationReader { KernelEventInfo HandleKernelEventInfoProto(const DeviceTraceEventProto&); MemcpyEventInfo HandleMemcpyEventInfoProto(const DeviceTraceEventProto&); MemsetEventInfo HandleMemsetEventInfoProto(const DeviceTraceEventProto&); + MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); + OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( + const OperatorSupplementEventNodeProto&); std::string filename_; std::ifstream input_file_stream_; NodeTreesProto* node_trees_proto_; diff --git a/paddle/fluid/platform/profiler/dump/nodetree.proto b/paddle/fluid/platform/profiler/dump/nodetree.proto index 7016745059d40..4ebfb6e73b331 100644 --- a/paddle/fluid/platform/profiler/dump/nodetree.proto +++ b/paddle/fluid/platform/profiler/dump/nodetree.proto @@ -46,6 +46,19 @@ enum TracerEventTypeProto { PythonOp = 13; // Used to mark python level userdefined PythonUserDefined = 14; + // Used to mark mlu runtime record returned by cnpapi + MluRuntime = 15; +}; + +enum TracerMemEventTypeProto { + // Used to mark memory allocation which is managed by paddle + Allocate = 0; + // Used to mark memory free which is managed by paddle + Free = 1; + // Used to mark reserved memory allocation which is applied from device. + ReservedAllocate = 2; + // Used to mark reserved memory free which is released to device. 
+ ReservedFree = 3; }; message KernelEventInfoProto { @@ -121,6 +134,62 @@ message HostTraceEventProto { required uint64 thread_id = 6; } +message MemTraceEventProto { + // timestamp of the record + required uint64 timestamp_ns = 1; + // memory manipulation type + required TracerMemEventTypeProto type = 2; + // memory addr of allocation or free + required uint64 addr = 3; + // process id of the record + required uint64 process_id = 4; + // thread id of the record + required uint64 thread_id = 5; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + required int64 increase_bytes = 6; + // place + required string place = 7; + // current total allocated memory + required uint64 current_allocated = 8; + // current total reserved memory + required uint64 current_reserved = 9; + // current peak allocated memory + required uint64 peak_allocated = 10; + // current peak reserved memory + required uint64 peak_reserved = 11; +} + +message OperatorSupplementEventProto { + // timestamp of the record + required uint64 timestamp_ns = 1; + // op type name + required string op_type = 2; + // process id of the record + required uint64 process_id = 3; + // thread id of the record + required uint64 thread_id = 4; + // input shapes + message input_shape_proto { + repeated string key = 1; + message shape_vector { + message shape { repeated uint64 size = 1; } + repeated shape shapes = 1; + } + repeated shape_vector shape_vecs = 2; + } + required input_shape_proto input_shapes = 5; + // dtypes + message dtype_proto { + repeated string key = 1; + message dtype_vector { repeated string dtype = 1; } + repeated dtype_vector dtype_vecs = 2; + } + required dtype_proto dtypes = 6; + // call stack + required string callstack = 7; +} + message CudaRuntimeTraceEventProto { // record name required string name = 1; @@ -166,6 +235,12 @@ message DeviceTraceEventProto { } } +message OperatorSupplementEventNodeProto { + required OperatorSupplementEventProto op_supplement_event = 1; +} + +message MemTraceEventNodeProto { required MemTraceEventProto mem_event = 1; } + message DeviceTraceEventNodeProto { required DeviceTraceEventProto device_event = 1; } @@ -180,6 +255,9 @@ message HostTraceEventNodeProto { required int64 parentid = 2; required HostTraceEventProto host_trace_event = 3; repeated CudaRuntimeTraceEventNodeProto runtime_nodes = 4; + // below is added in version 1.0.1 + repeated MemTraceEventNodeProto mem_nodes = 5; + repeated OperatorSupplementEventNodeProto op_supplement_nodes = 6; } message ThreadNodeTreeProto { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index 73021f4362af5..cbb86e76d3a1e 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -20,19 +20,19 @@ namespace paddle { namespace platform { static const char* kDefaultFilename = "pid_%s_time_%s.paddle_trace.pb"; -static const char* version = "1.0.0"; +static const char* version = "1.0.1"; static uint32_t span_indx = 0; static std::string DefaultFileName() { auto pid = GetProcessId(); - return string_format(std::string(kDefaultFilename), pid, - GetStringFormatLocalTime().c_str()); + return string_format( + std::string(kDefaultFilename), pid, GetStringFormatLocalTime().c_str()); } void SerializationLogger::OpenFile() { - output_file_stream_.open(filename_, std::ofstream::out | - std::ofstream::trunc | - std::ofstream::binary); + 
output_file_stream_.open( + filename_, + std::ofstream::out | std::ofstream::trunc | std::ofstream::binary); if (!output_file_stream_) { LOG(WARNING) << "Unable to open file for writing profiling data." << std::endl; @@ -50,7 +50,8 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { thread2host_event_nodes = node_trees.Traverse(true); for (auto it = thread2host_event_nodes.begin(); - it != thread2host_event_nodes.end(); ++it) { + it != thread2host_event_nodes.end(); + ++it) { // 1. order every node an index, every node a parent std::map node_index_map; std::map node_parent_map; @@ -64,7 +65,8 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { for (auto hostnode = it->second.begin(); hostnode != it->second.end(); ++hostnode) { for (auto childnode = (*hostnode)->GetChildren().begin(); - childnode != (*hostnode)->GetChildren().end(); ++childnode) { + childnode != (*hostnode)->GetChildren().end(); + ++childnode) { node_parent_map[(*childnode)] = node_index_map[(*hostnode)]; // mark each node's parent } @@ -106,10 +108,36 @@ void SerializationLogger::LogNodeTrees(const NodeTrees& node_trees) { (*devicenode)->LogMe(this); // fill detail information } } + for (auto memnode = (*hostnode)->GetMemTraceEventNodes().begin(); + memnode != (*hostnode)->GetMemTraceEventNodes().end(); + ++memnode) { + MemTraceEventNodeProto* mem_node_proto = + current_host_trace_event_node_proto_->add_mem_nodes(); + current_mem_trace_event_node_proto_ = mem_node_proto; + (*memnode)->LogMe(this); + } } } } +void SerializationLogger::LogMemTraceEventNode( + const MemTraceEventNode& mem_node) { + MemTraceEventProto* mem_trace_event = new MemTraceEventProto(); + mem_trace_event->set_timestamp_ns(mem_node.TimeStampNs()); + mem_trace_event->set_type( + static_cast(mem_node.Type())); + mem_trace_event->set_addr(mem_node.Addr()); + mem_trace_event->set_process_id(mem_node.ProcessId()); + mem_trace_event->set_thread_id(mem_node.ThreadId()); + mem_trace_event->set_increase_bytes(mem_node.IncreaseBytes()); + mem_trace_event->set_place(mem_node.Place()); + mem_trace_event->set_current_allocated(mem_node.CurrentAllocated()); + mem_trace_event->set_current_reserved(mem_node.CurrentReserved()); + mem_trace_event->set_peak_allocated(mem_node.PeakAllocated()); + mem_trace_event->set_peak_reserved(mem_node.PeakReserved()); + current_mem_trace_event_node_proto_->set_allocated_mem_event(mem_trace_event); +} + void SerializationLogger::LogHostTraceEventNode( const HostTraceEventNode& host_node) { HostTraceEventProto* host_trace_event = new HostTraceEventProto(); @@ -122,6 +150,63 @@ void SerializationLogger::LogHostTraceEventNode( host_trace_event->set_thread_id(host_node.ThreadId()); current_host_trace_event_node_proto_->set_allocated_host_trace_event( host_trace_event); + OperatorSupplementEventNode* op_supplement_event_node = + host_node.GetOperatorSupplementEventNode(); + if (op_supplement_event_node != nullptr) { + current_op_supplement_event_node_proto_ = + current_host_trace_event_node_proto_->add_op_supplement_nodes(); + OperatorSupplementEventProto* op_supplement_event_proto = + new OperatorSupplementEventProto(); + op_supplement_event_proto->set_op_type(op_supplement_event_node->Name()); + op_supplement_event_proto->set_timestamp_ns( + op_supplement_event_node->TimeStampNs()); + op_supplement_event_proto->set_process_id( + op_supplement_event_node->ProcessId()); + op_supplement_event_proto->set_thread_id( + op_supplement_event_node->ThreadId()); + op_supplement_event_proto->set_callstack( + 
op_supplement_event_node->CallStack()); + + OperatorSupplementEventProto::input_shape_proto* input_shape_proto = + op_supplement_event_proto->mutable_input_shapes(); + for (auto it = op_supplement_event_node->InputShapes().begin(); + it != op_supplement_event_node->InputShapes().end(); + it++) { + input_shape_proto->add_key(it->first); + OperatorSupplementEventProto::input_shape_proto::shape_vector* + shape_vectors_proto = input_shape_proto->add_shape_vecs(); + auto shape_vectors = it->second; + for (auto shape_vecs_it = shape_vectors.begin(); + shape_vecs_it != shape_vectors.end(); + shape_vecs_it++) { + auto shape_vector = *shape_vecs_it; + OperatorSupplementEventProto::input_shape_proto::shape_vector::shape* + shape_proto = shape_vectors_proto->add_shapes(); + for (auto shape_it = shape_vector.begin(); + shape_it != shape_vector.end(); + shape_it++) { + shape_proto->add_size(*shape_it); + } + } + } + + OperatorSupplementEventProto::dtype_proto* dtype_proto = + op_supplement_event_proto->mutable_dtypes(); + for (auto it = op_supplement_event_node->Dtypes().begin(); + it != op_supplement_event_node->Dtypes().end(); + it++) { + dtype_proto->add_key(it->first); + OperatorSupplementEventProto::dtype_proto::dtype_vector* + dtype_vector_proto = dtype_proto->add_dtype_vecs(); + auto dtype_vector = it->second; + for (auto dtype_it = dtype_vector.begin(); dtype_it != dtype_vector.end(); + dtype_it++) { + dtype_vector_proto->add_dtype(*dtype_it); + } + } + current_op_supplement_event_node_proto_->set_allocated_op_supplement_event( + op_supplement_event_proto); + } } void SerializationLogger::LogRuntimeTraceEventNode( diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 378834cff590d..31910cb68c5d7 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -34,6 +34,7 @@ class SerializationLogger : public BaseLogger { void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) override; void LogNodeTrees(const NodeTrees&) override; void LogMetaInfo(const std::unordered_map); + void LogMemTraceEventNode(const MemTraceEventNode&) override; private: void OpenFile(); @@ -48,6 +49,8 @@ class SerializationLogger : public BaseLogger { HostTraceEventNodeProto* current_host_trace_event_node_proto_; CudaRuntimeTraceEventNodeProto* current_runtime_trace_event_node_proto_; DeviceTraceEventNodeProto* current_device_trace_event_node_proto_; + MemTraceEventNodeProto* current_mem_trace_event_node_proto_; + OperatorSupplementEventNodeProto* current_op_supplement_event_node_proto_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc index 9380a26dbc3b4..a49d799c78521 100644 --- a/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/test_serialization_logger.cc @@ -35,6 +35,7 @@ using paddle::platform::ProfilerResult; using paddle::platform::RuntimeTraceEvent; using paddle::platform::SerializationLogger; using paddle::platform::TracerEventType; +using paddle::platform::TracerMemEventType; TEST(SerializationLoggerTest, dump_case0) { std::list host_events; @@ -54,6 +55,36 @@ TEST(SerializationLoggerTest, dump_case0) { std::string("op2"), TracerEventType::Operator, 21000, 30000, 10, 10)); host_events.push_back(HostTraceEvent( std::string("op3"), TracerEventType::Operator, 31000, 
40000, 10, 11)); + mem_events.push_back(MemTraceEvent(11500, + 0x1000, + TracerMemEventType::Allocate, + 10, + 10, + 50, + "GPU:0", + 50, + 50, + 100, + 100)); + mem_events.push_back(MemTraceEvent(11900, + 0x1000, + TracerMemEventType::Free, + 10, + 10, + -50, + "GPU:0", + 0, + 50, + 100, + 100)); + std::map>> input_shapes; + std::map> dtypes; + input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); + input_shapes[std::string("X")].push_back(std::vector{4, 5, 6, 7}); + dtypes[std::string("X")].push_back(std::string("int8")); + dtypes[std::string("X")].push_back(std::string("float32")); + op_supplement_events.push_back(OperatorSupplementEvent( + 11600, "op1", input_shapes, dtypes, "op1()", 10, 10)); runtime_events.push_back(RuntimeTraceEvent( std::string("cudalaunch1"), 15000, 17000, 10, 10, 1, 0)); runtime_events.push_back(RuntimeTraceEvent( @@ -128,6 +159,8 @@ TEST(SerializationLoggerTest, dump_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { @@ -137,6 +170,7 @@ TEST(SerializationLoggerTest, dump_case0) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(SerializationLoggerTest, dump_case1) { @@ -224,6 +258,7 @@ TEST(SerializationLoggerTest, dump_case1) { } } tree.LogMe(&logger); + logger.LogMetaInfo(std::unordered_map()); } TEST(DeserializationReaderTest, restore_case0) { @@ -243,6 +278,8 @@ TEST(DeserializationReaderTest, restore_case0) { if ((*it)->Name() == "op1") { EXPECT_EQ((*it)->GetChildren().size(), 0u); EXPECT_EQ((*it)->GetRuntimeTraceEventNodes().size(), 2u); + EXPECT_EQ((*it)->GetMemTraceEventNodes().size(), 2u); + EXPECT_NE((*it)->GetOperatorSupplementEventNode(), nullptr); } } for (auto it = thread2_nodes.begin(); it != thread2_nodes.end(); it++) { diff --git a/paddle/fluid/platform/profiler/event_node.h b/paddle/fluid/platform/profiler/event_node.h index 13ec115100505..34e6556f7f47a 100644 --- a/paddle/fluid/platform/profiler/event_node.h +++ b/paddle/fluid/platform/profiler/event_node.h @@ -47,6 +47,8 @@ class MemTraceEventNode { std::string Place() const { return mem_event_.place; } uint64_t CurrentAllocated() const { return mem_event_.current_allocated; } uint64_t CurrentReserved() const { return mem_event_.current_reserved; } + uint64_t PeakAllocated() const { return mem_event_.peak_allocated; } + uint64_t PeakReserved() const { return mem_event_.peak_reserved; } // member function void LogMe(BaseLogger* logger) { logger->LogMemTraceEventNode(*this); } diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index 1a6f19d2f93af..028d666f35537 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -31,6 +31,9 @@ HostPythonNode::~HostPythonNode() { for (auto it = device_node_ptrs.begin(); it != device_node_ptrs.end(); ++it) { delete *it; } + for (auto it = mem_node_ptrs.begin(); it != mem_node_ptrs.end(); ++it) { + delete *it; + } } HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { @@ -52,7 +55,8 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { } // copy its CudaRuntimeTraceEventNode for (auto runtimenode = root->GetRuntimeTraceEventNodes().begin(); - runtimenode != 
root->GetRuntimeTraceEventNodes().end(); ++runtimenode) { + runtimenode != root->GetRuntimeTraceEventNodes().end(); + ++runtimenode) { HostPythonNode* runtime_python_node = new HostPythonNode(); runtime_python_node->name = (*runtimenode)->Name(); runtime_python_node->type = (*runtimenode)->Type(); @@ -76,6 +80,32 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { runtime_python_node->device_node_ptrs.push_back(device_python_node); } } + // copy MemTraceEventNode + for (auto memnode = root->GetMemTraceEventNodes().begin(); + memnode != root->GetMemTraceEventNodes().end(); + memnode++) { + MemPythonNode* mem_python_node = new MemPythonNode(); + mem_python_node->timestamp_ns = (*memnode)->TimeStampNs(); + mem_python_node->addr = (*memnode)->Addr(); + mem_python_node->type = (*memnode)->Type(); + mem_python_node->process_id = (*memnode)->ProcessId(); + mem_python_node->thread_id = (*memnode)->ThreadId(); + mem_python_node->increase_bytes = (*memnode)->IncreaseBytes(); + mem_python_node->place = (*memnode)->Place(); + mem_python_node->current_allocated = (*memnode)->CurrentAllocated(); + mem_python_node->current_reserved = (*memnode)->CurrentReserved(); + mem_python_node->peak_allocated = (*memnode)->PeakAllocated(); + mem_python_node->peak_reserved = (*memnode)->PeakReserved(); + host_python_node->mem_node_ptrs.push_back(mem_python_node); + } + // copy OperatorSupplementEventNode's information if exists + OperatorSupplementEventNode* op_supplement_node = + root->GetOperatorSupplementEventNode(); + if (op_supplement_node != nullptr) { + host_python_node->input_shapes = op_supplement_node->InputShapes(); + host_python_node->dtypes = op_supplement_node->Dtypes(); + host_python_node->callstack = op_supplement_node->CallStack(); + } return host_python_node; } @@ -93,7 +123,8 @@ ProfilerResult::ProfilerResult(std::unique_ptr tree, ProfilerResult::~ProfilerResult() { // delete all root nodes for (auto it = thread_event_trees_map_.begin(); - it != thread_event_trees_map_.end(); ++it) { + it != thread_event_trees_map_.end(); + ++it) { delete it->second; } } diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index 12ecb9fde32aa..44f6e61fd3737 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -43,6 +43,35 @@ struct DevicePythonNode { uint64_t stream_id; }; +struct MemPythonNode { + MemPythonNode() = default; + ~MemPythonNode() {} + + // timestamp of the record + uint64_t timestamp_ns; + // memory addr of allocation or free + uint64_t addr; + // memory manipulation type + TracerMemEventType type; + // process id of the record + uint64_t process_id; + // thread id of the record + uint64_t thread_id; + // increase bytes after this manipulation, allocation for sign +, free for + // sign - + int64_t increase_bytes; + // place + std::string place; + // current total allocated memory + uint64_t current_allocated; + // current total reserved memory + uint64_t current_reserved; + // peak allocated memory + uint64_t peak_allocated; + // peak reserved memory + uint64_t peak_reserved; +}; + struct HostPythonNode { HostPythonNode() = default; ~HostPythonNode(); @@ -58,12 +87,19 @@ struct HostPythonNode { uint64_t process_id; // thread id of the record uint64_t thread_id; + // input shapes + std::map>> input_shapes; + std::map> dtypes; + // call stack + std::string callstack; // children node std::vector children_node_ptrs; // runtime node std::vector runtime_node_ptrs; // device 
node std::vector device_node_ptrs; + // mem node + std::vector mem_node_ptrs; }; class ProfilerResult { diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index b7eb53331b793..7923a8fba0051 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -11,8 +11,10 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/platform/profiler/host_tracer.h" + +#include + #include "glog/logging.h" #include "paddle/fluid/platform/flags.h" #include "paddle/fluid/platform/profiler/common_event.h" @@ -20,7 +22,8 @@ // Used to filter events, works like glog VLOG(level). // RecordEvent will works if host_trace_level >= level. -PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1, +PADDLE_DEFINE_EXPORTED_int64(host_trace_level, + 1, "RecordEvent will works " "if host_trace_level >= level."); @@ -49,6 +52,79 @@ void ProcessHostEvents(const HostEventSection& host_events, } } +void ProcessHostMemEvents( + const HostEventSection& host_mem_events, + TraceEventCollector* collector) { + for (const auto& thr_sec : host_mem_events.thr_sections) { + uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } + for (const auto& evt : thr_sec.events) { + MemTraceEvent event; + event.timestamp_ns = evt.timestamp_ns; + event.addr = evt.addr; + event.type = evt.type; + event.increase_bytes = evt.increase_bytes; + event.place = evt.place.DebugString(); + event.current_allocated = evt.current_allocated; + event.current_reserved = evt.current_reserved; + event.peak_allocated = evt.peak_allocated; + event.peak_reserved = evt.peak_reserved; + event.process_id = host_mem_events.process_id; + event.thread_id = tid; + collector->AddMemEvent(std::move(event)); + } + } +} + +void ProcessOperatorSupplementEvents( + const HostEventSection& op_supplement_events, + TraceEventCollector* collector) { + for (const auto& thr_sec : op_supplement_events.thr_sections) { + uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } + for (const auto& evt : thr_sec.events) { + OperatorSupplementEvent event; + event.timestamp_ns = evt.timestamp_ns; + event.op_type = evt.op_type; + std::map>> input_shapes; + std::map> dtypes; + std::string callstack; + for (auto it = evt.input_shapes.begin(); it != evt.input_shapes.end(); + it++) { + for (auto idx = 0lu; idx < it->second.size(); idx++) { + input_shapes[it->first].push_back(std::vector()); + for (auto dim_idx = 0; dim_idx < it->second.at(idx).size(); + dim_idx++) { + input_shapes[it->first][idx].push_back( + it->second.at(idx).at(dim_idx)); + } + } + } + for (auto it = evt.dtypes.begin(); it != evt.dtypes.end(); it++) { + for (auto idx = 0lu; idx < it->second.size(); idx++) { + dtypes[it->first].push_back( + framework::proto::VarType::Type_Name(it->second.at(idx))); + } + } + + std::ostringstream result_string; + for (auto it = evt.callstack.begin(); it != evt.callstack.end(); it++) { + result_string << (*it) << std::endl; + } + event.input_shapes = input_shapes; + event.dtypes = dtypes; + event.callstack = result_string.str(); + event.process_id = op_supplement_events.process_id; + event.thread_id = tid; + collector->AddOperatorSupplementEvent(std::move(event)); + } + } +} + } 
// namespace void HostTracer::PrepareTracing() { @@ -59,16 +135,21 @@ void HostTracer::PrepareTracing() { void HostTracer::StartTracing() { PADDLE_ENFORCE_EQ( - state_ == TracerState::READY || state_ == TracerState::STOPED, true, + state_ == TracerState::READY || state_ == TracerState::STOPED, + true, platform::errors::PreconditionNotMet("TracerState must be READY")); - HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance() + .GatherEvents(); HostTraceLevel::GetInstance().SetLevel(options_.trace_level); state_ = TracerState::STARTED; } void HostTracer::StopTracing() { PADDLE_ENFORCE_EQ( - state_, TracerState::STARTED, + state_, + TracerState::STARTED, platform::errors::PreconditionNotMet("TracerState must be STARTED")); HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled); state_ = TracerState::STOPED; @@ -76,11 +157,19 @@ void HostTracer::StopTracing() { void HostTracer::CollectTraceData(TraceEventCollector* collector) { PADDLE_ENFORCE_EQ( - state_, TracerState::STOPED, + state_, + TracerState::STOPED, platform::errors::PreconditionNotMet("TracerState must be STOPED")); HostEventSection host_events = HostEventRecorder::GetInstance().GatherEvents(); ProcessHostEvents(host_events, collector); + HostEventSection host_mem_events = + HostEventRecorder::GetInstance().GatherEvents(); + ProcessHostMemEvents(host_mem_events, collector); + HostEventSection op_supplement_events = + HostEventRecorder::GetInstance() + .GatherEvents(); + ProcessOperatorSupplementEvents(op_supplement_events, collector); } } // namespace platform diff --git a/paddle/fluid/platform/profiler/mem_tracing.h b/paddle/fluid/platform/profiler/mem_tracing.h new file mode 100644 index 0000000000000..3d3508c7bd570 --- /dev/null +++ b/paddle/fluid/platform/profiler/mem_tracing.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { +namespace platform { +// Memory event tracing. A trace marks memory manipulation such as allocation +// and free. +// The events can be used to draw memory variation curve. +class RecordMemEvent { + public: + /** + * @param ptr: Pointer address allocated or free. + * @param place: Device for this memory event. + * @param size: Memory size allocated or free. + * @param type: Denote manipulation type for this memory event. 
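+   *
+   * Illustrative call pattern only (a sketch; `buf`, `place` and `nbytes`
+   * are placeholder names, and real call sites would be allocator hooks
+   * that already know them -- see profiler_test.cc in this change):
+   *
+   *   platform::RecordMemEvent(buf, place, nbytes,
+   *                            platform::TracerMemEventType::Allocate);
+   *   // ... use the buffer ...
+   *   platform::RecordMemEvent(buf, place, nbytes,
+   *                            platform::TracerMemEventType::Free);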
+ */ + explicit RecordMemEvent( + const void* ptr, + const Place& place, + size_t size, + const TracerMemEventType type = TracerMemEventType::Allocate); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 32310b9e86228..ab9da63c1165c 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -22,16 +22,18 @@ #ifdef PADDLE_WITH_HIP #include #endif +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_python.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/profiler.h" TEST(ProfilerTest, TestHostTracer) { - using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; + using paddle::platform::ProfilerResult; using paddle::platform::RecordInstantEvent; using paddle::platform::TracerEventType; - using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 2; options.trace_switch = 3; @@ -40,10 +42,10 @@ TEST(ProfilerTest, TestHostTracer) { profiler->Prepare(); profiler->Start(); { - RecordInstantEvent("TestTraceLevel_record1", TracerEventType::UserDefined, - 2); - RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined, - 3); + RecordInstantEvent( + "TestTraceLevel_record1", TracerEventType::UserDefined, 2); + RecordInstantEvent( + "TestTraceLevel_record2", TracerEventType::UserDefined, 3); } auto profiler_result = profiler->Stop(); auto& nodetree = profiler_result->GetNodeTrees(); @@ -58,8 +60,8 @@ TEST(ProfilerTest, TestHostTracer) { } TEST(ProfilerTest, TestCudaTracer) { - using paddle::platform::ProfilerOptions; using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; using paddle::platform::ProfilerResult; ProfilerOptions options; options.trace_level = 0; @@ -92,3 +94,49 @@ TEST(ProfilerTest, TestCudaTracer) { EXPECT_GT(runtime_events.size(), 0u); #endif } + +TEST(ProfilerTest, TestHostTracerForMem) { + using paddle::platform::CPUPlace; + using paddle::platform::EnableHostEventRecorder; + using paddle::platform::MemTraceEventNode; + using paddle::platform::Profiler; + using paddle::platform::ProfilerOptions; + using paddle::platform::ProfilerResult; + using paddle::platform::RecordEvent; + using paddle::platform::RecordInstantEvent; + using paddle::platform::RecordMemEvent; + using paddle::platform::TracerEventType; + using paddle::platform::TracerMemEventType; + ProfilerOptions options; + options.trace_level = 1; + options.trace_switch = 3; + auto profiler = Profiler::Create(options); + EXPECT_TRUE(profiler); + EnableHostEventRecorder(); + profiler->Prepare(); + profiler->Start(); + { + RecordEvent event1( + "TestTracerForMem_phase1", TracerEventType::UserDefined, 1); + RecordMemEvent(reinterpret_cast(0), + CPUPlace(), + 1024, + TracerMemEventType::Allocate); + RecordMemEvent( + reinterpret_cast(0), CPUPlace(), 1024, TracerMemEventType::Free); + } + { + RecordEvent event2( + "TestTracerForMem_phase2", TracerEventType::UserDefined, 1); + RecordMemEvent(reinterpret_cast(1024), + CPUPlace(), + 1024, + TracerMemEventType::Allocate); + RecordMemEvent(reinterpret_cast(1024), + CPUPlace(), + 1024, + TracerMemEventType::Free); + } + auto profiler_result = profiler->Stop(); + auto nodetree = profiler_result->GetNodeTrees(); +} diff --git 
a/paddle/fluid/platform/profiler/supplement_tracing.h b/paddle/fluid/platform/profiler/supplement_tracing.h new file mode 100644 index 0000000000000..46b1616d71cc3 --- /dev/null +++ b/paddle/fluid/platform/profiler/supplement_tracing.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/profiler/trace_event.h" + +namespace paddle { + +namespace framework { +class RuntimeContext; +} +namespace platform { + +class RecordOpInfoSupplement { + public: + /** + * @param type: Operator type name. + * @param attrs: Attribute map of op. + * @param shape_ctx: Infershape context object. + * @param ctx: Runtime context object. + */ + explicit RecordOpInfoSupplement(const std::string& type, + const framework::AttributeMap& attrs, + const framework::InferShapeContext& shape_ctx, + const framework::RuntimeContext& ctx); +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/test_event_node.cc b/paddle/fluid/platform/profiler/test_event_node.cc index 41a5ebce023a0..dcf6dd56d74af 100644 --- a/paddle/fluid/platform/profiler/test_event_node.cc +++ b/paddle/fluid/platform/profiler/test_event_node.cc @@ -60,9 +60,20 @@ TEST(NodeTreesTest, LogMe_case0) { 50, "GPU:0", 50, - 50)); - mem_events.push_back(MemTraceEvent( - 11900, 0x1000, TracerMemEventType::Free, 10, 10, -50, "GPU:0", 0, 50)); + 50, + 100, + 100)); + mem_events.push_back(MemTraceEvent(11900, + 0x1000, + TracerMemEventType::Free, + 10, + 10, + -50, + "GPU:0", + 0, + 50, + 100, + 100)); std::map>> input_shapes; std::map> dtypes; input_shapes[std::string("X")].push_back(std::vector{1, 2, 3}); @@ -267,9 +278,20 @@ TEST(NodeTreesTest, HandleTrees_case0) { 50, "GPU:0", 50, - 50)); - mem_events.push_back(MemTraceEvent( - 11900, 0x1000, TracerMemEventType::Free, 10, 10, -50, "GPU:0", 0, 50)); + 50, + 100, + 100)); + mem_events.push_back(MemTraceEvent(11900, + 0x1000, + TracerMemEventType::Free, + 10, + 10, + -50, + "GPU:0", + 0, + 50, + 100, + 100)); op_supplement_events.push_back(OperatorSupplementEvent( 11600, "op1", diff --git a/paddle/fluid/platform/profiler/trace_event.h b/paddle/fluid/platform/profiler/trace_event.h index d50c5584f5c4b..62d82c19d1796 100644 --- a/paddle/fluid/platform/profiler/trace_event.h +++ b/paddle/fluid/platform/profiler/trace_event.h @@ -59,10 +59,14 @@ enum class TracerEventType { }; enum class TracerMemEventType { - // Used to mark memory allocation + // Used to mark memory allocation which is managed by paddle Allocate = 0, - // Used to mark memory free + // Used to mark memory free which is managed by paddle Free = 1, + // Used to mark reserved memory allocation which is applied from device. + ReservedAllocate = 2, + // Used to mark reserved memory free which is released to device. 
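+  // (Illustrative note: expected to correspond to an allocator pool
+  // shrinking, i.e. handing a previously reserved chunk back to the
+  // device, the counterpart of ReservedAllocate above.)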
+ ReservedFree = 3, // A flag to denote the number of current types NumTypes }; @@ -318,7 +322,9 @@ struct MemTraceEvent { int64_t increase_bytes, const std::string& place, uint64_t current_allocated, - uint64_t current_reserved) + uint64_t current_reserved, + uint64_t peak_allocated, + uint64_t peak_reserved) : timestamp_ns(timestamp_ns), addr(addr), type(type), @@ -327,7 +333,9 @@ struct MemTraceEvent { increase_bytes(increase_bytes), place(place), current_allocated(current_allocated), - current_reserved(current_reserved) {} + current_reserved(current_reserved), + peak_allocated(peak_allocated), + peak_reserved(peak_reserved) {} // timestamp of the record uint64_t timestamp_ns; @@ -348,6 +356,10 @@ struct MemTraceEvent { uint64_t current_allocated; // current total reserved memory uint64_t current_reserved; + // current peak allocated memory + uint64_t peak_allocated; + // current peak reserved memory + uint64_t peak_reserved; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc index bbfc687738dd9..11035867416b8 100644 --- a/paddle/fluid/platform/profiler/utils.cc +++ b/paddle/fluid/platform/profiler/utils.cc @@ -91,7 +91,8 @@ float CalculateEstOccupancy(uint32_t DeviceId, #endif const char* StringTracerMemEventType(TracerMemEventType type) { - static const char* categary_name_[] = {"Allocate", "Free"}; + static const char* categary_name_[] = { + "Allocate", "Free", "ReservedAllocate", "ReservedFree"}; return categary_name_[static_cast(type)]; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a003de812a3ac..e3dffc6442a48 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -922,6 +922,13 @@ PYBIND11_MODULE(core_noavx, m) { return reinterpret_cast( self.mutable_data(place, framework::TransToPhiDataType(type))); }) + .def("_mutable_data", + [](framework::Tensor &self, + paddle::platform::CustomPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); + }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::XPUPlace &place, @@ -963,6 +970,11 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) + .def("_copy_from", + &TensorCopyFrom, + py::arg("tensor"), + py::arg("place"), + py::arg("batch_size") = -1) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), @@ -998,6 +1010,11 @@ PYBIND11_MODULE(core_noavx, m) { py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", + SetTensorFromPyArray, + py::arg("array"), + py::arg("place"), + py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), @@ -2200,9 +2217,9 @@ All parameter, weight, gradient are variables in Paddle. #endif return devices; }); - py::class_(m, - "CustomPlace", - R"DOC( + py::class_ customplace(m, + "CustomPlace", + R"DOC( CustomPlace is a descriptor of a device. It represents a custom device on which a tensor will be allocated and a model will run. @@ -2852,6 +2869,13 @@ All parameter, weight, gradient are variables in Paddle. 
pybind11::gil_scoped_release release; self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, + const Scope &scope, + const platform::CustomPlace &place) { + pybind11::gil_scoped_release release; + self.Run(scope, place); + }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", @@ -3493,6 +3517,26 @@ All parameter, weight, gradient are variables in Paddle. .def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo); + py::class_(m, "MemPythonNode") + .def(py::init<>()) + .def_readwrite("timestamp_ns", + &paddle::platform::MemPythonNode::timestamp_ns) + .def_readwrite("addr", &paddle::platform::MemPythonNode::addr) + .def_readwrite("type", &paddle::platform::MemPythonNode::type) + .def_readwrite("process_id", &paddle::platform::MemPythonNode::process_id) + .def_readwrite("thread_id", &paddle::platform::MemPythonNode::thread_id) + .def_readwrite("increase_bytes", + &paddle::platform::MemPythonNode::increase_bytes) + .def_readwrite("place", &paddle::platform::MemPythonNode::place) + .def_readwrite("current_allocated", + &paddle::platform::MemPythonNode::current_allocated) + .def_readwrite("current_reserved", + &paddle::platform::MemPythonNode::current_reserved) + .def_readwrite("peak_allocated", + &paddle::platform::MemPythonNode::peak_allocated) + .def_readwrite("peak_reserved", + &paddle::platform::MemPythonNode::peak_reserved); + py::class_(m, "DevicePythonNode") .def(py::init<>()) .def_readwrite("name", &paddle::platform::DevicePythonNode::name) @@ -3515,12 +3559,18 @@ All parameter, weight, gradient are variables in Paddle. .def_readwrite("process_id", &paddle::platform::HostPythonNode::process_id) .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id) + .def_readwrite("input_shapes", + &paddle::platform::HostPythonNode::input_shapes) + .def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes) + .def_readwrite("callstack", &paddle::platform::HostPythonNode::callstack) .def_readwrite("children_node", &paddle::platform::HostPythonNode::children_node_ptrs) .def_readwrite("runtime_node", &paddle::platform::HostPythonNode::runtime_node_ptrs) .def_readwrite("device_node", - &paddle::platform::HostPythonNode::device_node_ptrs); + &paddle::platform::HostPythonNode::device_node_ptrs) + .def_readwrite("mem_node", + &paddle::platform::HostPythonNode::mem_node_ptrs); py::class_(m, "_Profiler") .def("create", @@ -3555,6 +3605,14 @@ All parameter, weight, gradient are variables in Paddle. })) .def("end", [](platform::RecordEvent *event) { event->End(); }); + py::enum_(m, "TracerMemEventType") + .value("Allocate", paddle::platform::TracerMemEventType::Allocate) + .value("Free", paddle::platform::TracerMemEventType::Free) + .value("ReservedAllocate", + paddle::platform::TracerMemEventType::ReservedAllocate) + .value("ReservedFree", + paddle::platform::TracerMemEventType::ReservedFree); + py::enum_(m, "TracerEventType") .value("Operator", paddle::platform::TracerEventType::Operator) .value("Dataloader", paddle::platform::TracerEventType::Dataloader) @@ -4566,6 +4624,12 @@ All parameter, weight, gradient are variables in Paddle. 
                 option.first.cast(),
                 option.second.cast());
           }
+      } else if (option_name == "replicated_collectives_settings") {
+        for (auto option : element.second.cast()) {
+          self.SetReplicatedCollectivesSettings(
+              option.first.cast(),
+              option.second.cast());
+          }
       } else if (option_name == "accumulate_outer_fragment") {
         for (auto option : element.second.cast()) {
           std::vector values;