From 61df254629be1e9290a5111fb384054688491780 Mon Sep 17 00:00:00 2001
From: Albert Chu
Date: Thu, 11 Nov 2021 22:00:36 -0800
Subject: [PATCH] testsuite: Add job instance restart tests

Add initial tests to see that jobs can survive instance restarts
using the job-exec testexec execution plugin.
---
 t/Makefile.am                       |  3 +-
 t/t3202-instance-restart-testexec.t | 57 +++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)
 create mode 100755 t/t3202-instance-restart-testexec.t

diff --git a/t/Makefile.am b/t/Makefile.am
index b317cd8cf56e..225704516186 100644
--- a/t/Makefile.am
+++ b/t/Makefile.am
@@ -47,7 +47,8 @@ clean-local:
 LONGTESTSCRIPTS = \
 	t5000-valgrind.t \
 	t3100-flux-in-flux.t \
-	t3200-instance-restart.t
+	t3200-instance-restart.t \
+	t3202-instance-restart-testexec.t
 
 # This list is included in both TESTS and dist_check_SCRIPTS.
 TESTSCRIPTS = \
diff --git a/t/t3202-instance-restart-testexec.t b/t/t3202-instance-restart-testexec.t
new file mode 100755
index 000000000000..f5e1a8dadf25
--- /dev/null
+++ b/t/t3202-instance-restart-testexec.t
@@ -0,0 +1,57 @@
+#!/bin/sh
+
+test_description='Test instance restart and still running jobs with testexec'
+
+# Append --logfile option if FLUX_TESTS_LOGFILE is set in environment:
+test -n "$FLUX_TESTS_LOGFILE" && set -- "$@" --logfile
+. `dirname $0`/sharness.sh
+
+export FLUX_INSTANCE_RESTART=t
+
+test_expect_success 'run a testexec job in persistent instance (long run)' '
+	flux start -o,--setattr=content.backing-path=$(pwd)/content.sqlite \
+	     flux mini submit \
+	       --flags=debug \
+	       --setattr=system.exec.test.run_duration=100s \
+	       hostname >id1.out
+'
+
+test_expect_success 'restart instance, reattach to running job, cancel it (long run)' '
+	flux start -o,--setattr=content.backing-path=$(pwd)/content.sqlite \
+	     sh -c "flux job eventlog $(cat id1.out) > eventlog_long1.out; \
+	            flux jobs -n > jobs_long1.out; \
+	            flux job cancel $(cat id1.out)" &&
+	grep "reattach-start" eventlog_long1.out &&
+	grep "reattach-finish" eventlog_long1.out &&
+	grep $(cat id1.out) jobs_long1.out
+'
+
+test_expect_success 'restart instance, job completed (long run)' '
+	flux start -o,--setattr=content.backing-path=$(pwd)/content.sqlite \
+	     sh -c "flux job eventlog $(cat id1.out) > eventlog_long2.out; \
+	            flux jobs -n > jobs_long2.out" &&
+	grep "finish" eventlog_long2.out | grep status &&
+	test_must_fail grep $(cat id1.out) jobs_long2.out
+'
+
+# reattach_finish will indicate to testexec that the job finished
+# right after reattach, emulating a job that finished before the
+# instance restarted
+test_expect_success 'run a testexec job in persistent instance (exit run)' '
+	flux start -o,--setattr=content.backing-path=$(pwd)/content.sqlite \
+	     flux mini submit \
+	       --flags=debug \
+	       --setattr=system.exec.test.reattach_finish=1 \
+	       --setattr=system.exec.test.run_duration=100s \
+	       hostname >id2.out
+'
+
+test_expect_success 'restart instance, reattach to running job, its finished (exit run)' '
+	flux start -o,--setattr=content.backing-path=$(pwd)/content.sqlite \
+	     sh -c "flux job eventlog $(cat id2.out) > eventlog_exit1.out" &&
+	grep "reattach-start" eventlog_exit1.out &&
+	grep "reattach-finish" eventlog_exit1.out &&
+	grep "finish" eventlog_exit1.out | grep status
+'
+
+test_done