From ef18156a2281078762ab6a861ab742a7ed14961b Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Wed, 28 Jul 2021 17:08:07 -0700 Subject: [PATCH] flush subnormals on x86 (#1709) * flush subnormals on x86 * less aggressive error threshold in single precision * increase single-precision tolerance * update tols in test_array_metadata * update for assertVectorsClose rename * lower tolerance in single precision --- configure.ac | 2 +- python/simulation.py | 1 + python/tests/test_array_metadata.py | 10 +++---- python/tests/test_pw_source.py | 8 +++--- python/tests/test_source.py | 2 +- src/meep.hpp | 3 +++ src/mympi.cpp | 42 +++++++++++++++++++++++++++++ tests/harmonics.cpp | 2 +- 8 files changed, 57 insertions(+), 13 deletions(-) diff --git a/configure.ac b/configure.ac index 90d2aab37..9c8040e4c 100644 --- a/configure.ac +++ b/configure.ac @@ -514,7 +514,7 @@ fi ############################################################################## # Miscellaneous function and header checks -AC_CHECK_HEADERS([sys/time.h]) +AC_CHECK_HEADERS([sys/time.h immintrin.h]) AC_CHECK_FUNCS([BSDgettimeofday gettimeofday cblas_ddot cblas_daxpy jn]) ############################################################################## diff --git a/python/simulation.py b/python/simulation.py index e8ef07d83..0a880fa5f 100644 --- a/python/simulation.py +++ b/python/simulation.py @@ -40,6 +40,7 @@ verbosity = Verbosity(mp.cvar, 'meep', 1) +mp.set_zero_subnormals(True) # Send output from Meep, ctlgeom, and MPB to Python's stdout mp.set_meep_printf_callback(mp.py_master_printf_wrap) diff --git a/python/tests/test_array_metadata.py b/python/tests/test_array_metadata.py index 4b433e961..9838612f9 100644 --- a/python/tests/test_array_metadata.py +++ b/python/tests/test_array_metadata.py @@ -1,8 +1,9 @@ import meep as mp import unittest import numpy as np +from utils import ApproxComparisonTestCase -class TestArrayMetadata(unittest.TestCase): +class TestArrayMetadata(ApproxComparisonTestCase): def test_array_metadata(self): resolution = 25 @@ -80,11 +81,8 @@ def vec_func(r): vec_func_sum = np.sum(W*(xm**2 + 2*ym**2)) pulse_modal_volume = np.sum(W*EpsE2)/np.max(EpsE2) * vec_func_sum - if ((mp.count_processors() % 2 == 0) and mp.is_single_precision()): - ref_val = 0.94 - else: - ref_val = 1.00 - self.assertAlmostEqual(cw_modal_volume/pulse_modal_volume, ref_val, places=2) + tol = 5e-2 if mp.is_single_precision() else 1e-2 + self.assertClose(cw_modal_volume/pulse_modal_volume, 1.0, epsilon=tol) if __name__ == '__main__': unittest.main() diff --git a/python/tests/test_pw_source.py b/python/tests/test_pw_source.py index ddd1f4b15..797e3c894 100644 --- a/python/tests/test_pw_source.py +++ b/python/tests/test_pw_source.py @@ -2,9 +2,9 @@ import cmath import math import unittest +from utils import ApproxComparisonTestCase - -class TestPwSource(unittest.TestCase): +class TestPwSource(ApproxComparisonTestCase): def setUp(self): s = 11 @@ -69,8 +69,8 @@ def test_pw_source(self): pt1 = self.sim.get_field_point(mp.Ez, v1) pt2 = self.sim.get_field_point(mp.Ez, v2) - places = 3 if mp.is_single_precision() else 7 - self.assertAlmostEqual(pt1 / pt2, 27.557668029008262, places=places) + tol = 1e-4 if mp.is_single_precision() else 1e-9 + self.assertClose(pt1 / pt2, 27.557668029008262, epsilon=tol) self.assertAlmostEqual(cmath.exp(1j * self.k.dot(v1 - v2)), 0.7654030066070924 - 0.6435512702783076j) diff --git a/python/tests/test_source.py b/python/tests/test_source.py index 314e0e52a..abab7a564 100644 --- a/python/tests/test_source.py +++ b/python/tests/test_source.py @@ -120,7 +120,7 @@ def my_src_func(t): sim.run(mp.after_sources(h), until_after_sources=200) fp = sim.get_field_point(mp.Ez, mp.Vector3(1)) - self.assertAlmostEqual(fp, -0.021997617628500023 + 0j) + self.assertAlmostEqual(fp, -0.021997617628500023 + 0j, 5 if mp.is_single_precision() else 7) def amp_fun(p): diff --git a/src/meep.hpp b/src/meep.hpp index 5d8485023..2b08b8e7c 100644 --- a/src/meep.hpp +++ b/src/meep.hpp @@ -2332,6 +2332,9 @@ class binary_partition { std::unique_ptr right; }; +// control whether CPU flushes subnormal values; see mympi.cpp +void set_zero_subnormals(bool iszero); + } /* namespace meep */ #endif /* MEEP_H */ diff --git a/src/mympi.cpp b/src/mympi.cpp index b1c351831..4193e5c69 100644 --- a/src/mympi.cpp +++ b/src/mympi.cpp @@ -32,6 +32,12 @@ #include #endif +#ifdef _OPENMP +#include "omp.h" +#else +#define omp_get_num_threads() (1) +#endif + #ifdef IGNORE_SIGFPE #include #endif @@ -59,6 +65,10 @@ extern "C" int feenableexcept(int EXCEPTS); #endif #endif +#if HAVE_IMMINTRIN_H +#include +#endif + #define UNUSED(x) (void)x // silence compiler warnings #define MPI_REALNUM (sizeof(realnum) == sizeof(double) ? MPI_DOUBLE : MPI_FLOAT) @@ -127,6 +137,37 @@ std::unique_ptr create_comms_manager() { int verbosity = 1; // defined in meep.h +/* Set CPU to flush subnormal values to zero (if iszero == true). This slightly + reduces the range of floating-point numbers, but can greatly increase the speed + in cases where subnormal values might arise (e.g. deep in the tails of + exponentially decaying sources). + + See also meep#1708. + + code based on github.com/JuliaLang/julia/blob/master/src/processor_x86.cpp#L1087-L1104, + which is free software under the GPL-compatible "MIT license" */ +static void _set_zero_subnormals(bool iszero) +{ +#if HAVE_IMMINTRIN_H + unsigned int flags = 0x00008040; // assume a non-ancient processor with SSE2, supporting both FTZ and DAZ flags + unsigned int state = _mm_getcsr(); + if (iszero) + state |= flags; + else + state &= ~flags; + _mm_setcsr(state); +#endif +} +void set_zero_subnormals(bool iszero) +{ + int n = omp_get_num_threads(); +#ifdef _OPENMP +#pragma omp parallel for schedule(static,1) +#endif + for (int i = 0; i < n; ++i) + _set_zero_subnormals(iszero); // This has to be done in every thread for OpenMP. +} + initialize::initialize(int &argc, char **&argv) { #ifdef HAVE_MPI MPI_Init(&argc, &argv); @@ -144,6 +185,7 @@ initialize::initialize(int &argc, char **&argv) { #ifdef IGNORE_SIGFPE signal(SIGFPE, SIG_IGN); #endif + set_zero_subnormals(true); t_start = wall_time(); } diff --git a/tests/harmonics.cpp b/tests/harmonics.cpp index 66dc48eef..81a8eb9ec 100644 --- a/tests/harmonics.cpp +++ b/tests/harmonics.cpp @@ -98,7 +98,7 @@ int main(int argc, char **argv) { double a2, a3, a2_2, a3_2; - double thresh = sizeof(realnum) == sizeof(float) ? 1e-4 : 1e-5; + double thresh = sizeof(realnum) == sizeof(float) ? 1e-3 : 1e-5; harmonics(freq, 0.27e-4, 1e-4, 1.0, a2, a3); if (different(a2, 9.80330e-07, thresh, "2nd harmonic mismatches known val")) return 1; if (sizeof(realnum) == sizeof(float)) {