From d0a26cddc316cde8483d79300ea3dc0ba3853c14 Mon Sep 17 00:00:00 2001
From: Albert Chu
Date: Thu, 11 Nov 2021 22:00:36 -0800
Subject: [PATCH] testsuite: Add job instance restart tests

Add initial tests to see that jobs can survive instance restarts using
the job-exec testexec execution plugin.
---
# Reviewer notes.  These lines sit between the "---" separator and the
# first "diff --git" line, so they are not part of the commit message.
#
# What the patch does:
#   * t/Makefile.am: appends t3202-instance-restart-testexec.t to
#     LONGTESTSCRIPTS.
#   * t/t3202-instance-restart-testexec.t (new, mode 100755): a sharness
#     script that exports FLUX_INSTANCE_RESTART=t, calls
#     "test_under_flux 1 job", and then runs three groups of tests.
#
# Group 1, "(long run)":
#   1. Inside a nested "flux start" whose content.backing-path is
#      $(pwd)/content.sqlite, submit "hostname" with
#      system.exec.test.run_duration=100s and save the job id in id1.out.
#   2. Start a second instance on the same content.sqlite, dump the job
#      eventlog and "flux jobs -n", cancel the job, then require
#      "reattach-start" and "reattach-finish" in the eventlog and the job
#      id in the jobs listing.
#   3. Start a third instance and require a "finish" line containing
#      "status" in the eventlog, and that the job id is absent from
#      "flux jobs -n".
#
# Group 2, "(exit run)":
#   Same submission as group 1 plus system.exec.test.reattach_finish=1
#   (id saved in id2.out).  After one restart the eventlog must contain
#   "reattach-start", "reattach-finish" and a "finish" line containing
#   "status".
#
# Group 3, module reload (runs in the instance set up by test_under_flux,
# no nested "flux start"):
#   Submit a 100s job (id3.out); remove job-exec, sched-simple, job-list,
#   job-info, job-manager, job-ingest in that order; load job-manager,
#   job-info, job-list, job-ingest, job-exec, sched-simple in that order;
#   then require "reattach-start"/"reattach-finish" in the eventlog and
#   the job id in "flux jobs -n".
#
# Things to keep in mind while reading the script:
#   * Where an "sh -c" string holds several commands they are joined with
#     ";", so that sh -c exits with the status of its last command only.
#     The grep checks chained after it with "&&" are what validate the
#     eventlog_*.out and jobs_*.out files.
#   * Tests within a group depend on files written by the previous test
#     (idN.out, content.sqlite), so they must run in order.
#
# NOTE(review): leading whitespace in this patch is assumed to be tabs --
# confirm the t/Makefile.am context lines against the tree before
# applying.
 t/Makefile.am                       |  3 +-
 t/t3202-instance-restart-testexec.t | 92 +++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100755 t/t3202-instance-restart-testexec.t

diff --git a/t/Makefile.am b/t/Makefile.am
index b317cd8cf56e..225704516186 100644
--- a/t/Makefile.am
+++ b/t/Makefile.am
@@ -47,7 +47,8 @@ clean-local:
 LONGTESTSCRIPTS = \
 	t5000-valgrind.t \
 	t3100-flux-in-flux.t \
-	t3200-instance-restart.t
+	t3200-instance-restart.t \
+	t3202-instance-restart-testexec.t
 
 # This list is included in both TESTS and dist_check_SCRIPTS.
 TESTSCRIPTS = \
diff --git a/t/t3202-instance-restart-testexec.t b/t/t3202-instance-restart-testexec.t
new file mode 100755
index 000000000000..f15466ac6b89
--- /dev/null
+++ b/t/t3202-instance-restart-testexec.t
@@ -0,0 +1,92 @@
+#!/bin/sh
+
+test_description='Test instance restart and still running jobs with testexec'
+
+# Append --logfile option if FLUX_TESTS_LOGFILE is set in environment:
+test -n "$FLUX_TESTS_LOGFILE" && set -- "$@" --logfile
+. `dirname $0`/sharness.sh
+
+export FLUX_INSTANCE_RESTART=t
+
+test_under_flux 1 job
+
+test_expect_success 'run a testexec job in persistent instance (long run)' '
+	flux start -o,--setattr=content.backing-path=$(pwd)/content.sqlite \
+		flux mini submit \
+		--flags=debug \
+		--setattr=system.exec.test.run_duration=100s \
+		hostname >id1.out
+'
+
+test_expect_success 'restart instance, reattach to running job, cancel it (long run)' '
+	flux start -o,--setattr=content.backing-path=$(pwd)/content.sqlite \
+		sh -c "flux job eventlog $(cat id1.out) > eventlog_long1.out; \
+		flux jobs -n > jobs_long1.out; \
+		flux job cancel $(cat id1.out)" &&
+	grep "reattach-start" eventlog_long1.out &&
+	grep "reattach-finish" eventlog_long1.out &&
+	grep $(cat id1.out) jobs_long1.out
+'
+
+test_expect_success 'restart instance, job completed (long run)' '
+	flux start -o,--setattr=content.backing-path=$(pwd)/content.sqlite \
+		sh -c "flux job eventlog $(cat id1.out) > eventlog_long2.out; \
+		flux jobs -n > jobs_long2.out" &&
+	grep "finish" eventlog_long2.out | grep status &&
+	test_must_fail grep $(cat id1.out) jobs_long2.out
+'
+
+# reattach_finish will indicate to testexec that the job finished
+# right after reattach, emulating a job that finished before the
+# instance restarted
+test_expect_success 'run a testexec job in persistent instance (exit run)' '
+	flux start -o,--setattr=content.backing-path=$(pwd)/content.sqlite \
+		flux mini submit \
+		--flags=debug \
+		--setattr=system.exec.test.reattach_finish=1 \
+		--setattr=system.exec.test.run_duration=100s \
+		hostname >id2.out
+'
+
+test_expect_success 'restart instance, reattach to running job, its finished (exit run)' '
+	flux start -o,--setattr=content.backing-path=$(pwd)/content.sqlite \
+		sh -c "flux job eventlog $(cat id2.out) > eventlog_exit1.out" &&
+	grep "reattach-start" eventlog_exit1.out &&
+	grep "reattach-finish" eventlog_exit1.out &&
+	grep "finish" eventlog_exit1.out | grep status
+'
+
+test_expect_success 'start job under flux instance' '
+	flux mini submit \
+		--flags=debug \
+		--setattr=system.exec.test.run_duration=100s \
+		hostname >id3.out
+'
+
+test_expect_success 'remove all job related modules but not KVS' '
+	flux module remove job-exec &&
+	flux module remove sched-simple &&
+	flux module remove job-list &&
+	flux module remove job-info &&
+	flux module remove job-manager &&
+	flux module remove job-ingest
+'
+
+test_expect_success 're-load all job related modules' '
+	flux module load job-manager &&
+	flux module load job-info &&
+	flux module load job-list &&
+	flux module load job-ingest &&
+	flux module load job-exec &&
+	flux module load sched-simple
+'
+
+test_expect_success 'job reattach with KVS namespace still existing' '
+	flux job eventlog $(cat id3.out) > eventlog_kvsexists1.out &&
+	flux jobs -n > jobs_kvsexists1.out &&
+	grep "reattach-start" eventlog_kvsexists1.out &&
+	grep "reattach-finish" eventlog_kvsexists1.out &&
+	grep $(cat id3.out) jobs_kvsexists1.out
+'
+
+test_done