PaddlePaddle · AnnaTrainingG · Apr 18, 2023 · Apr 7, 2023 · Apr 10, 2023 · Apr 10, 2023
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc
@@ -55,9 +55,12 @@ paddle::Tensor add_n_ad_func(const std::vector<paddle::Tensor>& x) {
   VLOG(3) << "Final State Running: "
           << "add_n_ad_func";
   auto api_result = paddle::experimental::add_n(x);
+
+  std::string forward_trace = "";
   // Check NaN and Inf if needed
   if (FLAGS_check_nan_inf) {
     egr::CheckTensorHasNanOrInf("add_n", api_result);
+    forward_trace = egr::Controller::Instance().GetPythonStack();
   }
 
   // Get Outputs
@@ -83,6 +86,12 @@ paddle::Tensor add_n_ad_func(const std::vector<paddle::Tensor>& x) {
     // Node Construction
     auto grad_node =
         std::shared_ptr<AddNGradNodeFinal>(new AddNGradNodeFinal(1, 1));
+
+    // Set forward's stack
+    if (FLAGS_check_nan_inf) {
+      grad_node->SetForwardTrace(forward_trace);
+    }
+
     // SetAttributes if needed
 
     // Set TensorWrappers for Forward Inputs if needed

diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc
@@ -110,9 +110,11 @@ paddle::Tensor conv2d_ad_func(const paddle::Tensor& input,
                                                  dilations,
                                                  groups,
                                                  data_format);
+  std::string forward_trace = "";
   // Check NaN and Inf if needed
   if (FLAGS_check_nan_inf) {
     egr::CheckTensorHasNanOrInf("conv2d", api_result);
+    forward_trace = egr::Controller::Instance().GetPythonStack();
   }
 
   // Get Outputs
@@ -138,6 +140,12 @@ paddle::Tensor conv2d_ad_func(const paddle::Tensor& input,
     // Node Construction
     auto grad_node =
         std::shared_ptr<Conv2dGradNodeFinal>(new Conv2dGradNodeFinal(1, 2));
+
+    // Set forward's stack
+    if (FLAGS_check_nan_inf) {
+      grad_node->SetForwardTrace(forward_trace);
+    }
+
     // SetAttributes if needed
     grad_node->SetAttributestrides(strides);
     grad_node->SetAttributepaddings(paddings);

diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/sync_batch_norm_fwd_func.cc
@@ -172,9 +172,11 @@ sync_batch_norm__ad_func(const paddle::Tensor& x,
                                              data_layout,
                                              use_global_stats,
                                              trainable_statistics);
+  std::string forward_trace = "";
   // Check NaN and Inf if needed
   if (FLAGS_check_nan_inf) {
     egr::CheckTensorHasNanOrInf("sync_batch_norm_", api_result);
+    forward_trace = egr::Controller::Instance().GetPythonStack();
   }
 
   // Get Outputs
@@ -226,6 +228,12 @@ sync_batch_norm__ad_func(const paddle::Tensor& x,
     // Node Construction
     auto grad_node =
         std::shared_ptr<SyncBatchNormGradNode>(new SyncBatchNormGradNode(6, 5));
+
+    // Set forward's stack
+    if (FLAGS_check_nan_inf) {
+      grad_node->SetForwardTrace(forward_trace);
+    }
+
     egr::Controller::Instance().PushBackForceSequentialNodes(grad_node.get());
     // SetAttributes if needed
     grad_node->SetAttributemomentum(momentum);

diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h
@@ -74,6 +74,12 @@ class Controller {
 
   void EnableLayoutAutoTune() { tracer_->EnableLayoutAutoTune(); }
 
+  void SetPythonStack(std::string stack_str) {
+    tracer_->SetPythonStack(stack_str);
+  }
+
+  std::string GetPythonStack() { return tracer_->GetPythonStack(); }
+
   bool HasGrad() const { return tracer_->HasGrad(); }
   void SetHasGrad(bool has_grad) { tracer_->SetHasGrad(has_grad); }
   std::string GenerateUniqueName(std::string key = "eager_in_tmp") {

diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py
@@ -296,6 +296,10 @@ class {} : public egr::GradNodeBase {{
 
     // Node Construction
 {}
+    // Set for forward trace
+  if (FLAGS_check_nan_inf) {{
+  {}
+  }}
     // SetAttributes if needed
 {}
     // Set TensorWrappers for Forward Inputs if needed
@@ -484,7 +488,25 @@ class {} : public egr::GradNodeBase {{
     }}
   }}"""
 
-CHECK_NAN_AND_INF_TEMPLATE = """  if (FLAGS_check_nan_inf) {{ egr::CheckTensorHasNanOrInf("{}", {}); }}
+CHECK_NAN_AND_INF_TEMPLATE_FORWARD = """
+  std::string forward_trace ="";
+  if (FLAGS_check_nan_inf) {{
+      egr::CheckTensorHasNanOrInf("{}", {});
+      forward_trace = egr::Controller::Instance().GetPythonStack();
+  }}
+"""
+
+CHECK_NAN_AND_INF_TEMPLATE_BACKWARD = """
+  if (FLAGS_check_nan_inf) {{
+     try{{
+       egr::CheckTensorHasNanOrInf("{}", {});
+     }} catch(...) {{
+       LOG(WARNING) << "There are nan/inf in ({})";
+       auto forward_trace = GetForwardTrace();
+       std::cout<<forward_trace<<std::endl;
+       std::rethrow_exception(std::current_exception());
+     }}
+  }}
 """
 
 inplace_optional_out_type_map = {
@@ -1047,11 +1069,15 @@ def GenerateNodeCreationCodes(self, for_backward=False):
 
         node_event_name = forward_api_name + " node_creation"
         node_creation_event_str = f"{indent}paddle::platform::RecordEvent node_creation_record_event(\"{node_event_name}\", paddle::platform::TracerEventType::OperatorInner, 1);\n"
+        set_forward_trace = (
+            f"{indent} grad_node->SetForwardTrace(forward_trace);"
+        )
         if not for_backward:
             self.node_creation_str = FORWARD_BODY_TEMPLATE.format(
                 node_creation_event_str,
                 pass_stop_gradient_args_str,
                 node_construction_str,
+                set_forward_trace,
                 set_attributes_str,
                 set_input_tensor_wrappers_str,
                 set_grad_out_meta_str,
@@ -1426,7 +1452,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced):
         )
 
         # Check Nan and Inf
-        check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format(
+        check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE_FORWARD.format(
             function_name, "api_result"
         )
 
@@ -2320,8 +2346,8 @@ def GenerateNodeDefinition(
 {indent}{grad_api_namespace}{backward_api_name}({grad_api_args_str});"""
 
         # Check Nan and Inf
-        check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE.format(
-            backward_api_name, "returns"
+        check_nan_inf_str = CHECK_NAN_AND_INF_TEMPLATE_BACKWARD.format(
+            backward_api_name, "returns", backward_api_name
         )
 
         # Prepare for Node Creation if Necessary

diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py
@@ -121,7 +121,10 @@ def FindParsingFunctionFromAttributeType(atype):
 NOAMP_DYGRAPH_FUNCTION_TEMPLATE = "decltype({}({})) out = {}({});"
 
 
-FUNCTION_SET_DEVICE_TEMPLATE = """{}    if (paddle::platform::is_gpu_place(place)) {{
+FUNCTION_SET_DEVICE_TEMPLATE = """{}
+    LOG(INFO)<<"this is SetPythonStack";
+    SetPythonStack();
+    if (paddle::platform::is_gpu_place(place)) {{
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       phi::backends::gpu::SetDeviceId(place.device);
       VLOG(4) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device;
@@ -170,7 +173,6 @@ def FindParsingFunctionFromAttributeType(atype):
 #include "paddle/fluid/pybind/eager.h"
 #include "paddle/fluid/eager/amp_utils.h"
 #include "paddle/fluid/eager/eager_amp_auto_cast.h"
-
 namespace paddle {{
 namespace pybind {{
 

@@ -292,6 +292,10 @@ class GradNodeBase {
     is_tensor_wrappers_cleared_ = is_tensor_wrappers_cleared;
   }
 
+  void SetForwardTrace(std::string trace) { forward_trace_ = trace; }
+
+  std::string GetForwardTrace() { return forward_trace_; }
+
  private:
   // bwd_out_meta_ is used to record Grad output info for backward
   paddle::small_vector<std::vector<GradSlotMeta>, kSlotSmallVectorSize>
@@ -317,6 +321,8 @@ class GradNodeBase {
   bool need_complex_to_real_ = false;
 
   bool is_tensor_wrappers_cleared_ = false;
+  // The trace of forward function
+  std::string forward_trace_ = "";
 };
 
 }  // namespace egr
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
@@ -37,6 +37,7 @@ DECLARE_string(tracer_mkldnn_ops_off);
 
 namespace paddle {
 namespace imperative {
+thread_local std::string Tracer::python_stack_ = "";
 
 thread_local bool Tracer::enable_program_desc_tracing_ = false;
 

diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h
@@ -199,7 +199,8 @@ class Tracer {
     use_layout_autotune_ = false;
     return false;
   }
-
+  void SetPythonStack(std::string stack_str) { python_stack_ = stack_str; }
+  std::string GetPythonStack() { return python_stack_; }
   phi::KernelSignature GetExpectedKernelSignature(
       const std::string& type,
       const NameTensorMap& ins,
@@ -215,6 +216,7 @@ class Tracer {
   std::unique_ptr<UniqueNameGenerator> generator_;
   platform::Place expected_place_;
   GarbageCollectorMap gcs_;
+  static thread_local std::string python_stack_;
   static thread_local bool enable_program_desc_tracing_;
   static thread_local bool use_layout_autotune_;
   static thread_local bool has_grad_;

diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
@@ -38,6 +38,7 @@ limitations under the License. */
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
+DECLARE_bool(check_nan_inf);
 
 namespace paddle {
 namespace pybind {
@@ -216,6 +217,21 @@ std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
   return py::cast<std::shared_ptr<imperative::VarBase>>(obj);
 }
 
+void SetPythonStack() {
+  if (FLAGS_check_nan_inf) {
+    pybind11::gil_scoped_acquire gil;
+    PyObject* mod = PyImport_ImportModule("traceback");
+    PyObject* traceback_list = PyObject_CallMethod(mod, "format_stack", "");
+    std::string str = "";
+    for (Py_ssize_t i = 0; i < PyList_Size(traceback_list); i++) {
+      PyObject* line = PyList_GetItem(traceback_list, i);
+      str += py::str(PyUnicode_AsUTF8(line));
+    }
+    std::string last = str + egr::Controller::Instance().GetPythonStack();
+    egr::Controller::Instance().SetPythonStack(last);
+  }
+}
+
 std::shared_ptr<jit::Function> CastPyArg2JitFunction(PyObject* obj,
                                                      ssize_t arg_pos) {
   if (PyObject_IsInstance(obj,

diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
@@ -79,6 +79,7 @@ std::vector<std::string> CastPyArg2VectorOfString(PyObject* obj,
                                                   ssize_t arg_pos);
 std::shared_ptr<jit::Function> CastPyArg2JitFunction(PyObject* obj,
                                                      ssize_t arg_pos);
+void SetPythonStack();
 
 PyObject* ToPyObject(int value);
 PyObject* ToPyObject(uint32_t value);

diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_backward_stack.py b/python/paddle/fluid/tests/unittests/check_nan_inf_backward_stack.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import paddle
+
+
+def main():
+    paddle.set_flags({"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 0})
+    cpu_place = paddle.CPUPlace()
+    x = paddle.to_tensor([1, 0.0, 3], stop_gradient=False, place=cpu_place)
+    y = paddle.to_tensor([0.2, 0.0, 0.5], place=cpu_place)
+    z = paddle.pow(x, y)
+    paddle.autograd.backward([z])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py
@@ -78,6 +78,13 @@ def setUp(self):
 
 
 class TestNanInfCheckResult(unittest.TestCase):
+    def setUp(self):
+        self._python_interp = sys.executable
+        if os.getenv('WITH_COVERAGE', 'OFF') == 'ON':
+            self._python_interp += " -m coverage run --branch -p"
+
+        self.env = os.environ.copy()
+
     def generate_inputs(self, shape, dtype="float32"):
         data = np.random.random(size=shape).astype(dtype)
         # [-10, 10)
@@ -141,6 +148,25 @@ def _check_num_nan_inf(use_cuda):
         if paddle.fluid.core.is_compiled_with_cuda():
             _check_num_nan_inf(use_cuda=True)
 
+    def test_check_stack(self):
+        self._python_interp += " check_nan_inf_backward_stack.py"
+        cmd = self._python_interp
+        proc = subprocess.Popen(
+            cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=self.env,
+        )
+
+        out, err = proc.communicate()
+        returncode = proc.returncode
+
+        print(out)
+        print(err)
+
+        # in python3, type(out+err) is 'bytes', need use encode
+        assert (out + err).find(b' z = paddle.pow(x, y)') != -1
+
     def check_nan_inf_level(self, use_cuda, dtype):
         shape = [8, 8]
         x_np, y_np = self.generate_inputs(shape, dtype)