sharness: add a per-test global timeout option

After recent frustration with a variety of hangs, this adds a new environment variable, `FLUX_TEST_TIMEOUT`, that gives each top-level sharness test (test_expect_success or similar) its own timeout, in seconds (default: 120). Once that time has elapsed, the sharness script receives a signal (SIGALRM), prints an error, kills the current command, and proceeds to subsequent tests. The mechanism is a little awkward because sharness uses eval to execute tests, but it has proven reliable in my tests so far.
flux-framework · Mar 13, 2020 · e95a70a · e95a70a
1 parent 17e788a
commit e95a70a
Showing 1 changed file with 24 additions and 0 deletions.
diff --git a/t/sharness.sh b/t/sharness.sh
@@ -325,7 +325,27 @@ test_pause() {
 	fi
 }
 
+die_on_alarm() {
+	kill -9 $! > /dev/null 2>&1 # ALRM trap handler: SIGKILL the most recently backgrounded job ($!), errors silenced. NOTE(review): $! is not the foreground command; confirm it is the hung test and not the timeout watcher
+	echo "Top-level test timed out"
+}
+
 test_eval_() {
+	( # start a subshell in the background to provide a timeout
+	  set -e
+	  parent_pid=$$
+	  i=0
+	  while kill -0 $parent_pid ; do
+		  sleep 1
+		  if test "$i" -gt ${FLUX_TEST_TIMEOUT:-120} ; then
+			  break
+		  fi
+		  i=$(($i+1))
+	  done
+	  kill -ALRM $$ # send ALRM to parent
+	) &
+	ALRM=$!
+	trap die_on_alarm ALRM
 	# This is a separate function because some tests use
 	# "return" to end a test_expect_success block early.
 	case ",$test_prereq," in
@@ -336,6 +356,10 @@ test_eval_() {
 		eval </dev/null >&3 2>&4 "$*"
 		;;
 	esac
+	ret=$?
+	trap - ALRM
+	kill $ALRM >/dev/null 2>&1
+	return $ret
 }
 
 test_run_() {