Skip to content

Commit

Permalink
testsuite: add running job update tests
Browse files Browse the repository at this point in the history
Problem: There are no tests of updates to the duration of a running
job.

Add a set of tests specific to updating the duration of running jobs
in a new sharness test t2292-job-update-running.t.
  • Loading branch information
grondo committed Oct 31, 2023
1 parent 18ccb29 commit ffc8cb4
Show file tree
Hide file tree
Showing 2 changed files with 217 additions and 0 deletions.
1 change: 1 addition & 0 deletions t/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ TESTSCRIPTS = \
t2280-job-memo.t \
t2290-job-update.t \
t2291-job-update-queue.t \
t2292-job-update-running.t \
t2300-sched-simple.t \
t2302-sched-simple-up-down.t \
t2303-sched-hello.t \
Expand Down
216 changes: 216 additions & 0 deletions t/t2292-job-update-running.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
#!/bin/sh
test_description='Test update of running jobs'

. $(dirname $0)/sharness.sh

if flux job submit --help 2>&1 | grep -q sign-type; then
test_set_prereq HAVE_FLUX_SECURITY
fi

test_under_flux 4 job

flux setattr log-stderr-level 1
export FLUX_URI_RESOLVE_LOCAL=t
runas_guest() {
local userid=$(($(id -u)+1))
FLUX_HANDLE_USERID=$userid FLUX_HANDLE_ROLEMASK=0x2 "$@"
}

submit_job_as_guest()
{
local duration=$1
local userid=$(($(id -u)+1))
flux run --dry-run -t $duration \
--setattr=exec.test.run_duration=\"600\" \
sleep inf | \
flux python ${SHARNESS_TEST_SRCDIR}/scripts/sign-as.py $userid \
>job.signed
FLUX_HANDLE_USERID=$userid \
flux job submit --flags=signed job.signed
}

# Usage: job_manager_get_R ID
# This needs to be a shell script since it will be run under flux-proxy(1):
cat <<'EOF' >job_manager_get_R
#!/bin/sh
flux python -c \
"import flux; \
payload = {\"id\":$(flux job id $1),\"attrs\":[\"R\"]}; \
print(flux.Flux().rpc(\"job-manager.getattr\", payload).get_str()) \
"
EOF
chmod +x job_manager_get_R

export PATH=$(pwd):$PATH

job_manager_get_expiration() {
job_manager_get_R $1 | jq .R.execution.expiration
}
job_manager_get_starttime() {
job_manager_get_R $1 | jq .R.execution.starttime
}

test_expect_success 'instance owner can adjust expiration of their own job' '
jobid=$(flux submit --wait-event=start -t5m sleep 300) &&
expiration=$(job_manager_get_expiration $jobid) &&
test_debug "echo expiration=$expiration" &&
flux update $jobid duration=+1m &&
job_manager_get_expiration $jobid | jq -e ". == $expiration + 60"
'
test_expect_success 'duration update with expiration in past fails' '
test_must_fail flux update $jobid duration=10ms 2>err.log &&
grep past err.log
'
# Test that job execution systems sees an expiration update by
# waiting for an expected log message from the job-exec module
#
test_expect_success 'duration update is processed by execution system' '
flux update $jobid duration=+1m &&
$SHARNESS_TEST_SRCDIR/scripts/dmesg-grep.py \
-vt 30 "job-exec.*updated expiration of $jobid" &&
flux job cancel $jobid &&
flux job wait-event $jobid clean
'
# Test that the job shell correctly processes an expiration update.
# Set up the job shell to send SIGTERM to the job 60s before expiration.
# for a job with a 5m duration. Then adjust duration such that expiration
# will occur 30s from now, and ensure the job shell picks up the update and
# sends SIGTERM immediately.
#
test_expect_success 'duration update is processed by job shell' '
jobid=$(flux submit --wait-event start \
-o verbose --signal=SIGTERM@60 -t5 sleep 300) &&
duration=$(job_manager_get_starttime $jobid | jq "now - . + 30") &&
flux update $jobid duration=$duration &&
test_must_fail_or_be_terminated \
flux job attach $jobid >shell.log 2>&1 &&
test_debug "cat shell.log" &&
grep "Will send SIGTERM to job in 0\.0" shell.log &&
grep "sending SIGTERM" shell.log
'
test_expect_success 'duration can be set to unlimited for a running job' '
jobid=$(flux submit --wait-event=start -t5m sleep 300) &&
expiration=$(job_manager_get_expiration $jobid) &&
test_debug "echo expiration=$expiration" &&
flux update $jobid duration=0 &&
job_manager_get_expiration $jobid | jq -e ". == 0." &&
flux cancel $jobid &&
flux job wait-event $jobid clean
'
test_expect_success HAVE_FLUX_SECURITY 'duration update of running job is denied for guest' '
jobid=$(submit_job_as_guest 5m) &&
flux job wait-event -vt 30 $jobid start &&
test_must_fail runas_guest flux update $jobid duration=+1m &&
flux cancel $jobid
'
test_expect_success HAVE_FLUX_SECURITY 'duration update of running guest job is allowed for owner' '
jobid=$(submit_job_as_guest 5m) &&
flux job wait-event -vt 30 $jobid start &&
expiration=$(job_manager_get_expiration $jobid) &&
flux update $jobid duration=+1m &&
job_manager_get_expiration $jobid | jq -e ". == $expiration + 60" &&
flux cancel $jobid
'
# Set module debug flag 0x4 on sched-simple so it will deny sched.expiration
# RPCs:
test_expect_success 'expiration update can be denied by scheduler' '
flux module debug --set 4 sched-simple &&
jobid=$(flux submit --wait-event=start -t 5m sleep 300) &&
test_must_fail flux update $jobid duration=+10m >sched-deny.out 2>&1 &&
test_debug "cat sched-deny.out" &&
grep "scheduler refused" sched-deny.out &&
flux cancel $jobid &&
flux module debug --clear sched-simple
'
subinstance_get_R() {
flux proxy $1 flux kvs get resource.R
}
subinstance_get_expiration() {
subinstance_get_R $1 | jq .execution.expiration
}
# Usage: subinstance_get_duration ID JOBID
subinstance_get_job_duration() {
flux proxy $1 job_manager_get_R $2 |
jq '.R.execution | .expiration - .starttime'
}
#
# The following tests ensure that an expiration modification of a job
# is propagated to child jobs when appropriate. The tests may be somewhat
# difficult to follow, so are summarized in comments:
#
# 1. submit an instance with 5m time limit
# 2. submit a job in that instance with no duration - its expiration should
# be set to that of the instance. (duration <5m)
# 3. Submit a job with a set duration for testing (duration 4m)
# 4. update duration of instance to 10m (+5m)
# 5. wait for instance resource module to post resource-update event
# which indicates the expiration update is reflected in resource.R.
# 6. wait for sched-simple to register expiration update by checking
# for expected log message
# 7. Ensure subinstance now reflects updated expiration in resource.R.
# 8. Ensure new expiration is 300s greater than old expiration
#
test_expect_success 'expiration update is detected by subinstance' '
id=$(flux alloc --bg -t5m -n4) &&
exp1=$(subinstance_get_expiration $id) &&
test_debug "echo instance expiration is $exp1" &&
id1=$(flux proxy $id flux submit sleep 300) &&
id2=$(flux proxy $id flux submit -t4m sleep 300) &&
tleft1=$(flux proxy $id flux job timeleft $id1) &&
duration1=$(subinstance_get_job_duration $id $id1) &&
duration2=$(subinstance_get_job_duration $id $id2) &&
test_debug "echo initial duration of job1 is $duration1" &&
test_debug "echo initial timeleft of job1 is $tleft1" &&
echo $duration1 | jq -e ". < 300" &&
echo $duration2 | jq -e ". == 240" &&
test_debug "echo updating duration of alloc job +5m" &&
flux update $id duration=+5m &&
test_debug "echo waiting for resource-update event" &&
flux proxy $id flux kvs eventlog wait-event -vt 30 \
resource.eventlog resource-update &&
test_debug "echo waiting for scheduler to see updated expiration" &&
flux proxy $id $SHARNESS_TEST_SRCDIR/scripts/dmesg-grep.py \
-vt 30 \"sched-simple.*expiration updated\" &&
exp2=$(subinstance_get_expiration $id) &&
subinstance_get_R $id &&
test_debug "echo expiration updated from $exp1 to $exp2" &&
echo $exp2 | jq -e ". == $exp1 + 300"
'
#
# 9. Submit job to previous instance and ensure its expiration matches
# the updated value
# 10. Ensure flux-job timeleft returns > 5m for the new job
# 11. Wait for expected resource-update event to be propagated to the first job
# 12. Ensure the timeleft of the first job is now > 5m
#
test_expect_success 'instance expiration update propagates to jobs' '
id3=$(flux proxy $id flux submit --wait-event=start sleep 300) &&
tleft3=$(flux proxy $id flux job timeleft $id3) &&
duration3=$(subinstance_get_job_duration $id $id3) &&
test_debug "echo timeleft of job submitted after update is $tleft3" &&
test_debug "echo duration of job submitted after update is $duration3" &&
echo $duration3 | jq -e ". > 300" &&
flux proxy $id flux job wait-event -vt 20 \
$id1 resource-update &&
tleft1=$(flux proxy $id flux job timeleft $id1) &&
duration1=$(subinstance_get_job_duration $id $id1) &&
test_debug "echo timeleft of job submitted extended to $tleft1" &&
test_debug "echo duration of job submitted extended to $duration1" &&
echo $duration1 | jq -e ". > 300"
'
#
# 13. Check that expiration of job submitted with a duration is untouched.
#
test_expect_success 'instance expiration is not propagated to jobs with duration' '
tleft2=$(flux proxy $id flux job timeleft $id2) &&
duration2=$(subinstance_get_job_duration $id $id2) &&
test_debug "echo timeleft of job with duration is now $tleft2" &&
test_debug "echo duration of same job is now $duration2" &&
echo $duration2 | jq -e ". == 240"
'
test_expect_success 'shutdown test instance' '
flux proxy $id flux cancel --all &&
flux shutdown --quiet $id &&
flux job wait-event $id clean
'
test_done

0 comments on commit ffc8cb4

Please sign in to comment.