From e95a70a96fb0dc17f7470686e923087a75e24c38 Mon Sep 17 00:00:00 2001
From: Tom Scogland <scogland1@llnl.gov>
Date: Fri, 13 Mar 2020 11:10:21 -0700
Subject: [PATCH] sharness: add a per-test global timeout option

After recent frustration with a variety of hangs, this adds a new
environment variable `FLUX_TEST_TIMEOUT` that gives each top-level
sharness test (test_expect_success or similar) its own timeout, in
seconds (default: 120).  Once that time has elapsed, the sharness
script receives SIGALRM, prints an error, kills the current command,
and proceeds to subsequent tests.  The mechanism for this is a little
bit awkward because of the way that sharness uses eval to execute
tests, but it has proven reliable in my tests so far.
---
 t/sharness.sh | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/t/sharness.sh b/t/sharness.sh
index c9f2bcac40fe..157adcb474a0 100644
--- a/t/sharness.sh
+++ b/t/sharness.sh
@@ -325,7 +325,27 @@ test_pause() {
 	fi
 }
 
+die_on_alarm() {
+	kill -9 $! > /dev/null 2>&1 # ALRM handler: SIGKILL the most recent background job ($!); NOTE(review): that may be the timeout watcher subshell rather than the hung command - confirm
+	echo "Top-level test timed out"
+}
+
 test_eval_() {
+	( # start a subshell in the background to provide a timeout
+	  set -e
+	  parent_pid=$$
+	  i=0
+	  while kill -0 $parent_pid ; do
+		  sleep 1
+		  if test "$i" -gt ${FLUX_TEST_TIMEOUT:-120} ; then
+			  break
+		  fi
+		  i=$(($i+1))
+	  done
+	  kill -ALRM $$ # send ALRM to parent
+	) &
+	ALRM=$!
+	trap die_on_alarm ALRM
 	# This is a separate function because some tests use
 	# "return" to end a test_expect_success block early.
 	case ",$test_prereq," in
@@ -336,6 +356,10 @@ test_eval_() {
 		eval </dev/null >&3 2>&4 "$*"
 		;;
 	esac
+	ret=$?
+	trap - ALRM
+	kill $ALRM >/dev/null 2>&1
+	return $ret
 }
 
 test_run_() {