GitLab CI: hack to deal with GHC heisenbug

Every now and then, GHC will exit with the error ``` out: mmap 131072 bytes at (nil): Cannot allocate memory out: Try specifying an address with +RTS -xm<addr> -RTS out: internal error: m32_allocator_init: Failed to map (GHC version 9.0.2 for x86_64_unknown_linux) Please report this as a GHC bug: https://www.haskell.org/ghc/reportabug ``` (when the binary is named `out`). For some reason this problem has become more pronounced for us. Since we invoke GHC/Clash an awful amount of times in some of our CI tests, the chances of hitting it in one of those invocations are really high. This commit wraps the `ghc`, `ghci`, `clash` and `clashi` binaries in a Bash script that will retry for a total of three times when this error message is observed. It seems reasonable to think that the odds of it failing three times in a row are very small. However, the test suite also compiles Haskell code to a binary and then runs that binary. These binaries have the same issues, but they don't come from the PATH, so we can't intercept them like we can for things that are on the PATH. For this, we introduce a new Tasty test provider that also tries up to three times when the heisenbug's error message is observed. We need both solutions because we are also seeing the problem on `doctests` wich don't involve our Tasty test providers, so these need to be covered by the script approach. Any `clash` invocations from Tasty are not retried since the Bash script already does that. We think this problem occurs on every combination of GHC version and Linux kernel version, but we are seeing it (almost?) exclusively on GHC 9.0.2.
clash-lang · Mar 25, 2023 · d4338a9 · d4338a9
1 parent 4791875
commit d4338a9
Showing 8 changed files with 168 additions and 9 deletions.
diff --git a/.ci/retry-ghc-heisenbug/clash b/.ci/retry-ghc-heisenbug/clash
@@ -0,0 +1 @@
+script
diff --git a/.ci/retry-ghc-heisenbug/clashi b/.ci/retry-ghc-heisenbug/clashi
@@ -0,0 +1 @@
+script
diff --git a/.ci/retry-ghc-heisenbug/ghc b/.ci/retry-ghc-heisenbug/ghc
@@ -0,0 +1 @@
+script
diff --git a/.ci/retry-ghc-heisenbug/ghci b/.ci/retry-ghc-heisenbug/ghci
@@ -0,0 +1 @@
+script
diff --git a/.ci/retry-ghc-heisenbug/script b/.ci/retry-ghc-heisenbug/script
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# Every now and then, GHC will exit with the error
+#
+# out: mmap 131072 bytes at (nil): Cannot allocate memory
+# out: Try specifying an address with +RTS -xm<addr> -RTS
+# out: internal error: m32_allocator_init: Failed to map
+#     (GHC version 9.0.2 for x86_64_unknown_linux)
+#     Please report this as a GHC bug:  https://www.haskell.org/ghc/reportabug
+#
+# (when the binary is named `out`).
+#
+# Since we invoke GHC/Clash an awful amount of times in some of our CI tests,
+# the chances of hitting it in one of those invocations are really high. This
+# script wraps the relevant binaries and will retry for a total of three times
+# when this error message is observed. It seems reasonable to think that the
+# odds of it failing three times in a row are very small.
+#
+# The mmap error message itself will not be observed in stdout and stderr of
+# this script, but anything printed before that by the wrapped binary will
+# occur multiple times as the binary is run again. Given that we usually test
+# either that stdout or stderr is empty or that it contains or does not
+# contain a certain string, duplication does not affect the outcome of the
+# test.
+#
+# We think this problem occurs on every combination of GHC version and
+# Linux kernel version, but we are seeing it on (almost?) exclusively on
+# GHC 9.0.2.
+
+PROG="$(basename $0)"
+
+if [[ -n "$RETRY_GHC_HEISENBUG_LOOP_DETECT" ]]; then
+  echo "retry-ghc-heisenbug: Loop detected, aborting" >&2
+  exit 1
+fi
+export RETRY_GHC_HEISENBUG_LOOP_DETECT=1
+
+REALPROG="$(which -a $PROG | tail -n+2 | head -n1)"
+
+if [[ -z "$REALPROG" ]]; then
+  echo "retry-ghc-heisenbug: Real $PROG not found, aborting" >&2
+  exit 1
+fi
+
+exec 3>&1
+
+TRIES=0
+while [[ $TRIES -lt 2 ]]; do
+  "$REALPROG" "$@" 2>&1 >&3 | \
+    awk '
+      /mmap 131072 bytes at \(nil\): Cannot allocate memory/ \
+        { exit 1 }
+      { print }' >&2
+  STATUS=("${PIPESTATUS[@]}")
+  [[ ${STATUS[1]} -eq 0 ]] && exit ${STATUS[0]}
+  ((TRIES++))
+done
+
+exec 3>&-
+
+"$REALPROG" "$@"
diff --git a/.ci/setup.sh b/.ci/setup.sh
@@ -1,6 +1,9 @@
 #!/bin/bash
 set -xou pipefail
 
+# Temporary hack to deal with GHC heisenbug
+PATH="$(readlink -f .ci/retry-ghc-heisenbug):$PATH"
+
 grep -E ' $' -n -r . --include=*.{hs,hs-boot,sh} --exclude-dir=dist-newstyle
 if [[ $? == 0 ]]; then
     echo "EOL whitespace detected. See ^"

diff --git a/tests/src/Test/Tasty/Clash.hs b/tests/src/Test/Tasty/Clash.hs
@@ -307,7 +307,7 @@ instance IsTest ClashBinaryTest where
       ]
 
     execProgram oDir =
-      TestProgram (oDir </> "out") (oDir:cbExtraExecArgs) NoGlob PrintStdErr False Nothing []
+      TestHeisenbugProgram (oDir </> "out") (oDir:cbExtraExecArgs) NoGlob PrintStdErr False Nothing []
 
   testOptions = coerce (testOptions @TestProgram)
 

diff --git a/tests/src/Test/Tasty/Program.hs b/tests/src/Test/Tasty/Program.hs
@@ -71,6 +71,8 @@ module Test.Tasty.Program (
  , ExpectOutput(..)
  , TestProgram(..)
  , TestFailingProgram(..)
+ , TestHeisenbugProgram(..)
+ , TestFailingHeisenbugProgram(..)
  ) where
 
 import qualified Clash.Util.Interpolate  as I
@@ -87,7 +89,7 @@ import System.Directory        ( findExecutable,
 import System.Environment      ( getEnvironment,                          )
 import System.Exit             ( ExitCode(..)                             )
 import System.Process          ( cwd, env, readCreateProcessWithExitCode,
-                                 proc                                     )
+                                 proc, CreateProcess                      )
 import Test.Tasty.Providers    ( IsTest (..), Result, TestName, TestTree,
                                  singleTest, testPassed, testFailed       )
 
@@ -138,6 +140,24 @@ data TestProgram =
     -- ^ Additional environment variables
       deriving (Typeable)
 
+data TestHeisenbugProgram =
+  TestHeisenbugProgram
+    String
+    -- ^ Executable
+    [String]
+    -- ^ Executable args
+    GlobArgs
+    -- ^ Whether to interpret glob patterns in arguments
+    PrintOutput
+    -- ^ What output to print on test success
+    Bool
+    -- ^ Whether a non-empty stdout means failure
+    (Maybe FilePath)
+    -- ^ Work directory
+    [(String, String)]
+    -- ^ Additional environment variables
+      deriving (Typeable)
+
 data TestFailingProgram =
   TestFailingProgram
     Bool
@@ -162,6 +182,30 @@ data TestFailingProgram =
     -- ^ Additional environment variables
       deriving (Typeable)
 
+data TestFailingHeisenbugProgram =
+  TestFailingHeisenbugProgram
+    Bool
+    -- ^ Test exit code
+    String
+    -- ^ Executable
+    [String]
+    -- ^ Executable args
+    GlobArgs
+    -- ^ Whether to interpret glob patterns in arguments
+    PrintOutput
+    -- ^ What output to print on test success
+    Bool
+    -- ^ Whether an empty stderr means test failure
+    (Maybe Int)
+    -- ^ Expected return code
+    (ExpectOutput T.Text)
+    -- ^ Expected string in stderr
+    (Maybe FilePath)
+    -- ^ Work directory
+    [(String, String)]
+    -- ^ Additional environment variables
+      deriving (Typeable)
+
 testOutput
   :: PrintOutput
   -- ^ What output to return
@@ -275,7 +319,20 @@ instance IsTest TestProgram where
     -- Execute program
     case execFound of
       Nothing       -> return $ execNotFoundFailure program
-      Just progPath -> runProgram progPath args' stdO stdF workDir addEnv
+      Just progPath -> runProgram False progPath args' stdO stdF workDir addEnv
+
+  testOptions = return []
+
+instance IsTest TestHeisenbugProgram where
+  run opts (TestHeisenbugProgram program args glob stdO stdF workDir addEnv) _ = do
+    execFound <- findExecutableAlt program
+
+    args' <- globArgs glob workDir args
+
+    -- Execute program
+    case execFound of
+      Nothing       -> return $ execNotFoundFailure program
+      Just progPath -> runProgram True progPath args' stdO stdF workDir addEnv
 
   testOptions = return []
 
@@ -288,14 +345,29 @@ instance IsTest TestFailingProgram where
     -- Execute program
     case execFound of
       Nothing       -> return $ execNotFoundFailure program
-      Just progPath -> runFailingProgram testExitCode progPath args stdO stdF errCode expectedOutput workDir addEnv
+      Just progPath -> runFailingProgram False testExitCode progPath args stdO stdF errCode expectedOutput workDir addEnv
+
+  testOptions = return []
+
+instance IsTest TestFailingHeisenbugProgram where
+  run _opts (TestFailingHeisenbugProgram testExitCode program args glob stdO stdF errCode expectedOutput workDir addEnv) _ = do
+    execFound <- findExecutableAlt program
+
+    args' <- globArgs glob workDir args
+
+    -- Execute program
+    case execFound of
+      Nothing       -> return $ execNotFoundFailure program
+      Just progPath -> runFailingProgram True testExitCode progPath args stdO stdF errCode expectedOutput workDir addEnv
 
   testOptions = return []
 
 -- | Run a program with given options and optional working directory.
 -- Return success if program exits with success code.
 runProgram
-  :: String
+  :: Bool
+  -- ^ Should we retry when we see the Heisenbug error?
+  -> String
   -- ^ Program name
   -> [String]
   -- ^ Program options
@@ -308,10 +380,10 @@ runProgram
   -> [(String, String)]
   -- ^ Additional environment variables
   -> IO Result
-runProgram program args stdO stdF workDir addEnv = do
+runProgram heisen program args stdO stdF workDir addEnv = do
   e <- getEnvironment
   let cp = (proc program args) { cwd = workDir, env = Just (addEnv ++ e) }
-  (exitCode, stdout, stderr) <- readCreateProcessWithExitCode cp ""
+  (exitCode, stdout, stderr) <- filterHeisenbugProcess heisen cp ""
 
   -- For debugging: Uncomment this to print executable and and its arguments
   --putStrLn $ show program ++ " " ++ concatMap (++ " ") args
@@ -333,6 +405,8 @@ runProgram program args stdO stdF workDir addEnv = do
 -- all.
 runFailingProgram
   :: Bool
+  -- ^ Should we retry when we see the heisenbug error?
+  -> Bool
   -- ^ Test exit code?
   -> String
   -- ^ Program name
@@ -352,10 +426,10 @@ runFailingProgram
   -> [(String, String)]
   -- ^ Additional environment variables
   -> IO Result
-runFailingProgram testExitCode program args stdO errOnEmptyStderr expectedCode expectedStderr workDir addEnv = do
+runFailingProgram heisen testExitCode program args stdO errOnEmptyStderr expectedCode expectedStderr workDir addEnv = do
   e <- getEnvironment
   let cp = (proc program args) { cwd = workDir, env = Just (addEnv ++ e) }
-  (exitCode0, stdout, stderr) <- readCreateProcessWithExitCode cp ""
+  (exitCode0, stdout, stderr) <- filterHeisenbugProcess heisen cp ""
 
   -- For debugging: Uncomment this to print executable and and its arguments
   --putStrLn $ show program ++ " " ++ concatMap (++ " ") args
@@ -406,6 +480,23 @@ runFailingProgram testExitCode program args stdO errOnEmptyStderr expectedCode e
                 else
                   passed
 
+filterHeisenbugProcess
+  :: Bool
+  -- ^ Should we retry when we see the heisenbug error?
+  -> CreateProcess
+  -> String
+  -> IO (ExitCode, String, String)
+filterHeisenbugProcess heisen cp stdin = do
+  let numTries = if heisen then 3 else 1
+      tries = replicate numTries $ readCreateProcessWithExitCode cp stdin
+  foldl1 skipHeisenbug tries
+ where
+  skipHeisenbug try1 try2 = do
+    res@(_exitCode, _stdout, stderr) <- try1
+    if heisenMsg `T.isInfixOf` (T.pack stderr)
+      then try2
+      else pure res
+  heisenMsg = T.pack "mmap 131072 bytes at (nil): Cannot allocate memory"
 
 -- | Indicates that program does not exist in the path
 execNotFoundFailure :: String -> Result