From 4b236136bcfca04bb560a7b7710cd6582a5cc3be Mon Sep 17 00:00:00 2001 From: KAClough Date: Mon, 10 Jul 2023 11:10:21 +0100 Subject: [PATCH 1/6] Update nancheck to ensure exit using openmp --- Source/BoxUtils/NanCheck.hpp | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/Source/BoxUtils/NanCheck.hpp b/Source/BoxUtils/NanCheck.hpp index 2e814c6fe..c70b892cc 100644 --- a/Source/BoxUtils/NanCheck.hpp +++ b/Source/BoxUtils/NanCheck.hpp @@ -16,32 +16,46 @@ class NanCheck protected: const std::string m_error_info = "NanCheck"; const double m_max_abs = 1e20; + const double m_dx; public: - NanCheck() {} + NanCheck(const double a_dx = 1.0) : m_dx(a_dx) {} /// This constructor takes a string which will be displayed when nans happen - NanCheck(const std::string a_error_info) : m_error_info(a_error_info) {} + NanCheck(const std::string a_error_info, const double a_dx = 1.0) + : m_error_info(a_error_info), m_dx(a_dx) + { + } - NanCheck(const std::string a_error_info, const double a_max_abs) - : m_error_info(a_error_info), m_max_abs(a_max_abs) + NanCheck(const std::string a_error_info, const double a_max_abs, + const double a_dx = 1.0) + : m_error_info(a_error_info), m_max_abs(a_max_abs), m_dx(a_dx) { } void compute(Cell current_cell) const { - bool stop = false; + // stop is shared between all threads + bool stop; +// guard update to prevent a race +#pragma omp atomic write + stop = false; + int num_vars = current_cell.get_num_in_vars(); for (int ivar = 0; ivar < num_vars; ++ivar) { double val; current_cell.load_vars(val, ivar); if (std::isnan(val) || abs(val) > m_max_abs) +// we want to exit if any of the threads find a nan +#pragma omp atomic write stop = true; } if (stop) { -#pragma omp single +// This needs to be the master thread, otherwise some schedulers have trouble +// exiting +#pragma omp master { pout() << m_error_info << "::Values have become nan. The current state is: \n"; @@ -52,6 +66,7 @@ class NanCheck } pout() << "Integer coordinates: " << current_cell.get_int_vect() << std::endl; + pout() << "m_dx: " << m_dx << std::endl; } MayDay::Error("Values have become nan."); } From f9feb451bd96e3f524f309ef276896528b5cbfbb Mon Sep 17 00:00:00 2001 From: KAClough Date: Mon, 10 Jul 2023 12:07:00 +0100 Subject: [PATCH 2/6] Improve output of Nancheck --- Examples/BinaryBH/BinaryBHLevel.cpp | 5 +++-- Source/BoxUtils/NanCheck.hpp | 34 +++++++++++++++++++---------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/Examples/BinaryBH/BinaryBHLevel.cpp b/Examples/BinaryBH/BinaryBHLevel.cpp index cf17984d7..56294bec5 100644 --- a/Examples/BinaryBH/BinaryBHLevel.cpp +++ b/Examples/BinaryBH/BinaryBHLevel.cpp @@ -32,8 +32,9 @@ void BinaryBHLevel::specificAdvance() // Check for nan's if (m_p.nan_check) - BoxLoops::loop(NanCheck("NaNCheck in specific Advance: "), m_state_new, - m_state_new, EXCLUDE_GHOST_CELLS, disable_simd()); + BoxLoops::loop(NanCheck(m_dx, "NaNCheck in specific Advance: "), + m_state_new, m_state_new, EXCLUDE_GHOST_CELLS, + disable_simd()); } // This initial data uses an approximation for the metric which diff --git a/Source/BoxUtils/NanCheck.hpp b/Source/BoxUtils/NanCheck.hpp index c70b892cc..74ef139a0 100644 --- a/Source/BoxUtils/NanCheck.hpp +++ b/Source/BoxUtils/NanCheck.hpp @@ -16,19 +16,23 @@ class NanCheck protected: const std::string m_error_info = "NanCheck"; const double m_max_abs = 1e20; - const double m_dx; + const double m_dx = 0.0; public: - NanCheck(const double a_dx = 1.0) : m_dx(a_dx) {} + // This allows us to output the physical coords if the dx value is passed + NanCheck(const double a_dx) : m_dx(a_dx) {} /// This constructor takes a string which will be displayed when nans happen - NanCheck(const std::string a_error_info, const double a_dx = 1.0) + /// as well as the grid spacing + NanCheck(const double a_dx, const std::string a_error_info) : m_error_info(a_error_info), m_dx(a_dx) { } - NanCheck(const std::string a_error_info, const double a_max_abs, - const double a_dx = 1.0) + // This constructor takes all arguments, note ordering to reduce potential + // to confuse dx and max_abs + NanCheck(const double a_dx, const std::string a_error_info, + const double a_max_abs) : m_error_info(a_error_info), m_max_abs(a_max_abs), m_dx(a_dx) { } @@ -37,7 +41,7 @@ class NanCheck { // stop is shared between all threads bool stop; -// guard update to prevent a race +// guard assignment to prevent a race #pragma omp atomic write stop = false; @@ -51,11 +55,11 @@ class NanCheck #pragma omp atomic write stop = true; } - if (stop) - { // This needs to be the master thread, otherwise some schedulers have trouble // exiting #pragma omp master + if (stop) + { { pout() << m_error_info << "::Values have become nan. The current state is: \n"; @@ -64,9 +68,17 @@ class NanCheck pout() << UserVariables::variable_names[ivar] << ": " << current_cell.load_vars(ivar) << std::endl; } - pout() << "Integer coordinates: " << current_cell.get_int_vect() - << std::endl; - pout() << "m_dx: " << m_dx << std::endl; + IntVect iv = current_cell.get_int_vect(); + pout() << "Integer coordinates: " << iv << std::endl; + if (m_dx != 0.0) + { + pout() << "with m_dx: " << m_dx << std::endl; + RealVect position(iv + 0.5 * RealVect::Unit); + position *= m_dx; + pout() << "Physical coords relative to bottom left corner " + "of domain: " + << position << std::endl; + } } MayDay::Error("Values have become nan."); } From 90aa74efe23e9cd431aea6ece39f9f45b177a8c8 Mon Sep 17 00:00:00 2001 From: KAClough Date: Mon, 10 Jul 2023 12:09:52 +0100 Subject: [PATCH 3/6] Add back default contrictor so as not to break old code --- Source/BoxUtils/NanCheck.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Source/BoxUtils/NanCheck.hpp b/Source/BoxUtils/NanCheck.hpp index 74ef139a0..a55790860 100644 --- a/Source/BoxUtils/NanCheck.hpp +++ b/Source/BoxUtils/NanCheck.hpp @@ -19,6 +19,8 @@ class NanCheck const double m_dx = 0.0; public: + NanCheck() {} + // This allows us to output the physical coords if the dx value is passed NanCheck(const double a_dx) : m_dx(a_dx) {} From ab6068c1673614dd7b4f8ccf0852f2f3cf68e117 Mon Sep 17 00:00:00 2001 From: KAClough Date: Mon, 10 Jul 2023 12:12:18 +0100 Subject: [PATCH 4/6] Oops should have kept this too --- Source/BoxUtils/NanCheck.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Source/BoxUtils/NanCheck.hpp b/Source/BoxUtils/NanCheck.hpp index a55790860..0680d209d 100644 --- a/Source/BoxUtils/NanCheck.hpp +++ b/Source/BoxUtils/NanCheck.hpp @@ -21,7 +21,10 @@ class NanCheck public: NanCheck() {} - // This allows us to output the physical coords if the dx value is passed + /// This constructor takes a string which will be displayed when nans happen + NanCheck(const std::string a_error_info) : m_error_info(a_error_info) {} + + /// This allows us to output the physical coords if the dx value is passed NanCheck(const double a_dx) : m_dx(a_dx) {} /// This constructor takes a string which will be displayed when nans happen From d64f11962d2584df288405943f2e5fbfc9059bb3 Mon Sep 17 00:00:00 2001 From: the-florist Date: Wed, 25 Sep 2024 17:31:02 +0100 Subject: [PATCH 5/6] Bug fix for NanCheck exit inconsistency This commit addresses Issue 238 primarily in the NanCheck.cpp file, where the switch to the master thread (#pragma omp master) is placed AFTER the start of the "if" statement involving the "stop" variable, rather than before. I suspect that since the "stop" variable was altered under the "atomic write" OpenMP command, if any thread other than the master thread caught the Nan then the memory storing the "stop" variable would not be shared with master, and thus the if statement leading to the error trap would not be tripped if evaluated on the master thread. This change effectively moves the evaluation of that logic statement to each OpenMP thread individually. This commit also has an error-trip built into the ScalarField example, where in InitialScalarData I set the value of the extrinsic curvature K somewhere in the box to Nan. This version has a commented-out NanCheck right after the initial data as well, which can be used to debug the current issue (see end). I also added the specificPostTimeStep function to the ScalarField example, and moved the NanCheck that was called in specificAdvance to this function. PS. Another way to solve this problem could be to transfer the "stop" variable to the master thread in some way before the logic statement. This may address certain error output issues that are present in the current commit, meaning that if the error is caught on a non-master thread it is also printed by that same thread. We may need to look into this in the future. --- Examples/ScalarField/InitialScalarData.hpp | 9 +++++++- Examples/ScalarField/ScalarFieldLevel.cpp | 24 ++++++++++++++++------ Examples/ScalarField/ScalarFieldLevel.hpp | 2 ++ Examples/ScalarField/params.txt | 4 ++-- Source/BoxUtils/NanCheck.hpp | 13 ++++++++---- 5 files changed, 39 insertions(+), 13 deletions(-) diff --git a/Examples/ScalarField/InitialScalarData.hpp b/Examples/ScalarField/InitialScalarData.hpp index 037fc4cda..a98a984db 100644 --- a/Examples/ScalarField/InitialScalarData.hpp +++ b/Examples/ScalarField/InitialScalarData.hpp @@ -42,10 +42,17 @@ class InitialScalarData Coordinates coords(current_cell, m_dx, m_params.center); data_t rr = coords.get_radius(); data_t rr2 = rr * rr; + IntVect iv = current_cell.get_int_vect(); + + if(iv[0] == 4 && iv[1] == 4 && iv[2] == 4) + { + data_t K = NAN; + current_cell.store_vars(K, c_K); + } // calculate the field value data_t phi = m_params.amplitude * - (1.0 + 0.01 * rr2 * exp(-pow(rr / m_params.width, 2.0))); + (1.0 + 0.01 * rr2 * exp(-pow(rr / m_params.width, 2.0))); // store the vars current_cell.store_vars(phi, c_phi); diff --git a/Examples/ScalarField/ScalarFieldLevel.cpp b/Examples/ScalarField/ScalarFieldLevel.cpp index 9fc0b0d77..ec7ab4840 100644 --- a/Examples/ScalarField/ScalarFieldLevel.cpp +++ b/Examples/ScalarField/ScalarFieldLevel.cpp @@ -37,11 +37,6 @@ void ScalarFieldLevel::specificAdvance() make_compute_pack(TraceARemoval(), PositiveChiAndAlpha(m_p.min_chi, m_p.min_lapse)), m_state_new, m_state_new, INCLUDE_GHOST_CELLS); - - // Check for nan's - if (m_p.nan_check) - BoxLoops::loop(NanCheck(), m_state_new, m_state_new, - EXCLUDE_GHOST_CELLS, disable_simd()); } // Initial data for field and metric variables @@ -56,11 +51,18 @@ void ScalarFieldLevel::initialData() BoxLoops::loop( make_compute_pack(SetValue(0.), KerrBH(m_p.kerr_params, m_dx), InitialScalarData(m_p.initial_params, m_dx)), - m_state_new, m_state_new, INCLUDE_GHOST_CELLS); + m_state_new, m_state_new, INCLUDE_GHOST_CELLS, disable_simd()); fillAllGhosts(); BoxLoops::loop(GammaCalculator(m_dx), m_state_new, m_state_new, EXCLUDE_GHOST_CELLS); + + // Check for nan's + /*if (m_p.nan_check) + BoxLoops::loop(NanCheck(), m_state_new, m_state_new, + EXCLUDE_GHOST_CELLS, disable_simd());*/ + + MayDay::Error("Nan check didn't work."); } #ifdef CH_USE_HDF5 @@ -130,3 +132,13 @@ void ScalarFieldLevel::computeTaggingCriterion( FixedGridsTaggingCriterion(m_dx, m_level, 2.0 * m_p.L, m_p.center), current_state, tagging_criterion); } + +void ScalarFieldLevel::specificPostTimeStep() +{ + CH_TIME("ScalarFieldLevel::specificPostTimeStep"); + + // Check for nan's + if (m_p.nan_check) + BoxLoops::loop(NanCheck(), m_state_new, m_state_new, + EXCLUDE_GHOST_CELLS, disable_simd()); +} diff --git a/Examples/ScalarField/ScalarFieldLevel.hpp b/Examples/ScalarField/ScalarFieldLevel.hpp index 8eff44612..d177c0c90 100644 --- a/Examples/ScalarField/ScalarFieldLevel.hpp +++ b/Examples/ScalarField/ScalarFieldLevel.hpp @@ -56,6 +56,8 @@ class ScalarFieldLevel : public GRAMRLevel virtual void computeTaggingCriterion( FArrayBox &tagging_criterion, const FArrayBox ¤t_state, const FArrayBox ¤t_state_diagnostics) override; + + virtual void specificPostTimeStep(); }; #endif /* SCALARFIELDLEVEL_HPP_ */ diff --git a/Examples/ScalarField/params.txt b/Examples/ScalarField/params.txt index 450301bca..cd9c81233 100644 --- a/Examples/ScalarField/params.txt +++ b/Examples/ScalarField/params.txt @@ -7,7 +7,7 @@ verbosity = 0 # location / naming of output files -# output_path = "" # Main path for all files. Must exist! +output_path = "/nfs/st01/hpc-gr-epss/eaf49/PR-dump" # Main path for all files. Must exist! chk_prefix = ScalarField_ plot_prefix = ScalarFieldp_ # restart_file = ScalarField_000000.3d.hdf5 @@ -131,7 +131,7 @@ extrapolating_vars = phi Pi # dt will be dx*dt_multiplier on each grid level dt_multiplier = 0.25 stop_time = 100.0 -# max_steps = 4 +max_steps = 4 # Spatial derivative order (only affects CCZ4 RHS) max_spatial_derivative_order = 4 # can be 4 or 6 diff --git a/Source/BoxUtils/NanCheck.hpp b/Source/BoxUtils/NanCheck.hpp index 0680d209d..3409f091f 100644 --- a/Source/BoxUtils/NanCheck.hpp +++ b/Source/BoxUtils/NanCheck.hpp @@ -46,6 +46,7 @@ class NanCheck { // stop is shared between all threads bool stop; + int thrd = 0; // guard assignment to prevent a race #pragma omp atomic write stop = false; @@ -55,17 +56,19 @@ class NanCheck { double val; current_cell.load_vars(val, ivar); - if (std::isnan(val) || abs(val) > m_max_abs) + if (std::isnan(val) || abs(val) > m_max_abs) // we want to exit if any of the threads find a nan #pragma omp atomic write stop = true; + thrd = omp_get_thread_num(); } -// This needs to be the master thread, otherwise some schedulers have trouble -// exiting -#pragma omp master + if (stop) { { +// This needs to be the master thread, otherwise some schedulers have trouble +// exiting +#pragma omp master pout() << m_error_info << "::Values have become nan. The current state is: \n"; for (int ivar = 0; ivar < num_vars; ++ivar) @@ -73,6 +76,8 @@ class NanCheck pout() << UserVariables::variable_names[ivar] << ": " << current_cell.load_vars(ivar) << std::endl; } + pout() << "Nan was caught on thread: " << thrd << "\n"; + pout() << "Error message printed on thread: " << omp_get_thread_num() << "\n"; IntVect iv = current_cell.get_int_vect(); pout() << "Integer coordinates: " << iv << std::endl; if (m_dx != 0.0) From 098ee78ed66140f1b75f7feea28944cea1330700 Mon Sep 17 00:00:00 2001 From: the-florist Date: Mon, 14 Oct 2024 18:28:55 +0100 Subject: [PATCH 6/6] Clang tidy fixes --- Examples/ScalarField/InitialScalarData.hpp | 4 ++-- Examples/ScalarField/ScalarFieldLevel.cpp | 2 +- Source/BoxUtils/NanCheck.hpp | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Examples/ScalarField/InitialScalarData.hpp b/Examples/ScalarField/InitialScalarData.hpp index a98a984db..d882a76da 100644 --- a/Examples/ScalarField/InitialScalarData.hpp +++ b/Examples/ScalarField/InitialScalarData.hpp @@ -44,7 +44,7 @@ class InitialScalarData data_t rr2 = rr * rr; IntVect iv = current_cell.get_int_vect(); - if(iv[0] == 4 && iv[1] == 4 && iv[2] == 4) + if (iv[0] == 4 && iv[1] == 4 && iv[2] == 4) { data_t K = NAN; current_cell.store_vars(K, c_K); @@ -52,7 +52,7 @@ class InitialScalarData // calculate the field value data_t phi = m_params.amplitude * - (1.0 + 0.01 * rr2 * exp(-pow(rr / m_params.width, 2.0))); + (1.0 + 0.01 * rr2 * exp(-pow(rr / m_params.width, 2.0))); // store the vars current_cell.store_vars(phi, c_phi); diff --git a/Examples/ScalarField/ScalarFieldLevel.cpp b/Examples/ScalarField/ScalarFieldLevel.cpp index ec7ab4840..9bcaa470e 100644 --- a/Examples/ScalarField/ScalarFieldLevel.cpp +++ b/Examples/ScalarField/ScalarFieldLevel.cpp @@ -61,7 +61,7 @@ void ScalarFieldLevel::initialData() /*if (m_p.nan_check) BoxLoops::loop(NanCheck(), m_state_new, m_state_new, EXCLUDE_GHOST_CELLS, disable_simd());*/ - + MayDay::Error("Nan check didn't work."); } diff --git a/Source/BoxUtils/NanCheck.hpp b/Source/BoxUtils/NanCheck.hpp index 3409f091f..da0ea76d3 100644 --- a/Source/BoxUtils/NanCheck.hpp +++ b/Source/BoxUtils/NanCheck.hpp @@ -56,11 +56,11 @@ class NanCheck { double val; current_cell.load_vars(val, ivar); - if (std::isnan(val) || abs(val) > m_max_abs) + if (std::isnan(val) || abs(val) > m_max_abs) // we want to exit if any of the threads find a nan #pragma omp atomic write stop = true; - thrd = omp_get_thread_num(); + thrd = omp_get_thread_num(); } if (stop) @@ -77,7 +77,8 @@ class NanCheck << current_cell.load_vars(ivar) << std::endl; } pout() << "Nan was caught on thread: " << thrd << "\n"; - pout() << "Error message printed on thread: " << omp_get_thread_num() << "\n"; + pout() << "Error message printed on thread: " + << omp_get_thread_num() << "\n"; IntVect iv = current_cell.get_int_vect(); pout() << "Integer coordinates: " << iv << std::endl; if (m_dx != 0.0)