From ef18156a2281078762ab6a861ab742a7ed14961b Mon Sep 17 00:00:00 2001
From: "Steven G. Johnson" <stevenj@mit.edu>
Date: Wed, 28 Jul 2021 17:08:07 -0700
Subject: [PATCH] flush subnormals on x86 (#1709)

* flush subnormals on x86

* less aggressive error threshold in single precision

* increase single-precision tolerance

* update tols in test_array_metadata

* update for assertVectorsClose rename

* lower tolerance in single precision
---
 configure.ac                        |  2 +-
 python/simulation.py                |  1 +
 python/tests/test_array_metadata.py | 10 +++----
 python/tests/test_pw_source.py      |  8 +++---
 python/tests/test_source.py         |  2 +-
 src/meep.hpp                        |  3 +++
 src/mympi.cpp                       | 42 +++++++++++++++++++++++++++++
 tests/harmonics.cpp                 |  2 +-
 8 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/configure.ac b/configure.ac
index 90d2aab37..9c8040e4c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -514,7 +514,7 @@ fi
 
 ##############################################################################
 # Miscellaneous function and header checks
-AC_CHECK_HEADERS([sys/time.h])
+AC_CHECK_HEADERS([sys/time.h immintrin.h])
 AC_CHECK_FUNCS([BSDgettimeofday gettimeofday cblas_ddot cblas_daxpy jn])
 
 ##############################################################################
diff --git a/python/simulation.py b/python/simulation.py
index e8ef07d83..0a880fa5f 100644
--- a/python/simulation.py
+++ b/python/simulation.py
@@ -40,6 +40,7 @@
 
 verbosity = Verbosity(mp.cvar, 'meep', 1)
 
+mp.set_zero_subnormals(True)
 
 # Send output from Meep, ctlgeom, and MPB to Python's stdout
 mp.set_meep_printf_callback(mp.py_master_printf_wrap)
diff --git a/python/tests/test_array_metadata.py b/python/tests/test_array_metadata.py
index 4b433e961..9838612f9 100644
--- a/python/tests/test_array_metadata.py
+++ b/python/tests/test_array_metadata.py
@@ -1,8 +1,9 @@
 import meep as mp
 import unittest
 import numpy as np
+from utils import ApproxComparisonTestCase
 
-class TestArrayMetadata(unittest.TestCase):
+class TestArrayMetadata(ApproxComparisonTestCase):
 
     def test_array_metadata(self):
         resolution = 25
@@ -80,11 +81,8 @@ def vec_func(r):
         vec_func_sum = np.sum(W*(xm**2 + 2*ym**2))
         pulse_modal_volume = np.sum(W*EpsE2)/np.max(EpsE2) * vec_func_sum
 
-        if ((mp.count_processors() % 2 == 0) and mp.is_single_precision()):
-            ref_val = 0.94
-        else:
-            ref_val = 1.00
-        self.assertAlmostEqual(cw_modal_volume/pulse_modal_volume, ref_val, places=2)
+        tol = 5e-2 if mp.is_single_precision() else 1e-2
+        self.assertClose(cw_modal_volume/pulse_modal_volume, 1.0, epsilon=tol)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/tests/test_pw_source.py b/python/tests/test_pw_source.py
index ddd1f4b15..797e3c894 100644
--- a/python/tests/test_pw_source.py
+++ b/python/tests/test_pw_source.py
@@ -2,9 +2,9 @@
 import cmath
 import math
 import unittest
+from utils import ApproxComparisonTestCase
 
-
-class TestPwSource(unittest.TestCase):
+class TestPwSource(ApproxComparisonTestCase):
 
     def setUp(self):
         s = 11
@@ -69,8 +69,8 @@ def test_pw_source(self):
         pt1 = self.sim.get_field_point(mp.Ez, v1)
         pt2 = self.sim.get_field_point(mp.Ez, v2)
 
-        places = 3 if mp.is_single_precision() else 7
-        self.assertAlmostEqual(pt1 / pt2, 27.557668029008262, places=places)
+        tol = 1e-4 if mp.is_single_precision() else 1e-9
+        self.assertClose(pt1 / pt2, 27.557668029008262, epsilon=tol)
 
         self.assertAlmostEqual(cmath.exp(1j * self.k.dot(v1 - v2)), 0.7654030066070924 - 0.6435512702783076j)
 
diff --git a/python/tests/test_source.py b/python/tests/test_source.py
index 314e0e52a..abab7a564 100644
--- a/python/tests/test_source.py
+++ b/python/tests/test_source.py
@@ -120,7 +120,7 @@ def my_src_func(t):
         sim.run(mp.after_sources(h), until_after_sources=200)
         fp = sim.get_field_point(mp.Ez, mp.Vector3(1))
 
-        self.assertAlmostEqual(fp, -0.021997617628500023 + 0j)
+        self.assertAlmostEqual(fp, -0.021997617628500023 + 0j, 5 if mp.is_single_precision() else 7)
 
 
 def amp_fun(p):
diff --git a/src/meep.hpp b/src/meep.hpp
index 5d8485023..2b08b8e7c 100644
--- a/src/meep.hpp
+++ b/src/meep.hpp
@@ -2332,6 +2332,9 @@ class binary_partition {
   std::unique_ptr<binary_partition> right;
 };
 
+// control whether CPU flushes subnormal values; see mympi.cpp
+void set_zero_subnormals(bool iszero);
+
 } /* namespace meep */
 
 #endif /* MEEP_H */
diff --git a/src/mympi.cpp b/src/mympi.cpp
index b1c351831..4193e5c69 100644
--- a/src/mympi.cpp
+++ b/src/mympi.cpp
@@ -32,6 +32,12 @@
 #include <mpi.h>
 #endif
 
+#ifdef _OPENMP
+#include "omp.h"
+#else
+#define omp_get_num_threads() (1)
+#endif
+
 #ifdef IGNORE_SIGFPE
 #include <signal.h>
 #endif
@@ -59,6 +65,10 @@ extern "C" int feenableexcept(int EXCEPTS);
 #endif
 #endif
 
+#if HAVE_IMMINTRIN_H
+#include <immintrin.h>
+#endif
+
 #define UNUSED(x) (void)x // silence compiler warnings
 
 #define MPI_REALNUM (sizeof(realnum) == sizeof(double) ? MPI_DOUBLE : MPI_FLOAT)
@@ -127,6 +137,37 @@ std::unique_ptr<comms_manager> create_comms_manager() {
 
 int verbosity = 1; // defined in meep.h
 
+/* Set CPU to flush subnormal values to zero (if iszero == true).  This slightly
+   reduces the range of floating-point numbers, but can greatly increase the speed
+   in cases where subnormal values might arise (e.g. deep in the tails of
+   exponentially decaying sources).
+
+   See also meep#1708.
+
+   code based on github.com/JuliaLang/julia/blob/master/src/processor_x86.cpp#L1087-L1104,
+   which is free software under the GPL-compatible "MIT license" */
+static void _set_zero_subnormals(bool iszero)
+{
+#if HAVE_IMMINTRIN_H
+    unsigned int flags = 0x00008040; // assume a non-ancient processor with SSE2, supporting both FTZ and DAZ flags
+    unsigned int state = _mm_getcsr();
+    if (iszero)
+      state |= flags;
+    else
+      state &= ~flags;
+    _mm_setcsr(state);
+#endif
+}
+void set_zero_subnormals(bool iszero)
+{
+  int n = omp_get_num_threads();
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static,1)
+#endif
+  for (int i = 0; i < n; ++i)
+    _set_zero_subnormals(iszero); // This has to be done in every thread for OpenMP.
+}
+
 initialize::initialize(int &argc, char **&argv) {
 #ifdef HAVE_MPI
   MPI_Init(&argc, &argv);
@@ -144,6 +185,7 @@ initialize::initialize(int &argc, char **&argv) {
 #ifdef IGNORE_SIGFPE
   signal(SIGFPE, SIG_IGN);
 #endif
+  set_zero_subnormals(true);
   t_start = wall_time();
 }
 
diff --git a/tests/harmonics.cpp b/tests/harmonics.cpp
index 66dc48eef..81a8eb9ec 100644
--- a/tests/harmonics.cpp
+++ b/tests/harmonics.cpp
@@ -98,7 +98,7 @@ int main(int argc, char **argv) {
 
   double a2, a3, a2_2, a3_2;
 
-  double thresh = sizeof(realnum) == sizeof(float) ? 1e-4 : 1e-5;
+  double thresh = sizeof(realnum) == sizeof(float) ? 1e-3 : 1e-5;
   harmonics(freq, 0.27e-4, 1e-4, 1.0, a2, a3);
   if (different(a2, 9.80330e-07, thresh, "2nd harmonic mismatches known val")) return 1;
   if (sizeof(realnum) == sizeof(float)) {