diff --git a/t/Makefile.am b/t/Makefile.am
index 23c69e4f2703..ed6f162f93f2 100644
--- a/t/Makefile.am
+++ b/t/Makefile.am
@@ -90,6 +90,7 @@ TESTSCRIPTS = \
 	t0023-jobspec1-validate.t \
 	t0026-flux-R.t \
 	t0033-size-override.t \
+	t0034-flub.t \
 	t1000-kvs.t \
 	t1001-kvs-internals.t \
 	t1003-kvs-stress.t \
diff --git a/t/t0034-flub.t b/t/t0034-flub.t
new file mode 100755
index 000000000000..398e5c0eb1eb
--- /dev/null
+++ b/t/t0034-flub.t
@@ -0,0 +1,264 @@
+#!/bin/sh
+#
+
+test_description='Test flub bootstrap method'
+
+. `dirname $0`/sharness.sh
+
+test_under_flux 8 full
+
+export FLUX_SSH="${SHARNESS_TEST_SRCDIR}/scripts/tssh"
+
+# usage: get_job_uri id
+get_job_uri() {
+	flux job wait-event -t10 $1 memo >/dev/null && flux uri $1
+}
+
+# usage: wait_for_service uri name
+wait_for_service() {
+	flux proxy $1 bash -c \""while ! flux ping -c 1 $2 >/dev/null 2>&1; do sleep 0.5; done"\"
+}
+
+test_expect_success 'broker fails with bad broker.boot-server' '
+	test_must_fail flux broker \
+	    -Sbroker.rc1_path= -Sbroker.rc3_path= \
+	    -Sbroker.boot-server=local://noexist/path \
+	    /bin/true 2>server.err &&
+	grep "was not found" server.err
+'
+
+test_expect_success 'start a 1 node job with 0 extra ranks' '
+	id=$(flux batch -N1 --wrap sleep inf) &&
+	get_job_uri $id >test1.uri
+'
+test_expect_success 'job has size 1' '
+	size=$(flux proxy $(cat test1.uri) flux getattr size) &&
+	test $size -eq 1
+'
+test_expect_success 'flub bootstrap fails with no available ranks' '
+	test_must_fail flux broker \
+	    -Sbroker.boot-server=$(cat test1.uri) 2>noranks.err &&
+	grep "no available ranks" noranks.err
+'
+test_expect_success 'clean up' '
+	flux cancel --all
+'
+
+
+#
+# Start 2 node batch job with one extra slot.
+# Submit 1 node broker job that fills the slot.
+# Run a parallel job across all three nodes in the batch job.
+# This test is constrained so that all flubbed nodes are leaf nodes,
+# and the flubbed nodes connect to rank 0 only.
+
+test_expect_success 'create config with fake resources' '
+	cat >fake2.toml <<-EOT
+	[resource]
+	noverify = true
+	[[resource.config]]
+	hosts = "a,b,c"
+	cores = "0-3"
+	EOT
+'
+test_expect_success 'start a 2 node job with 1 extra rank' '
+	id=$(flux batch -N2 \
+	    --broker-opts=--config-path=fake2.toml \
+	    --broker-opts=-Ssize=3 \
+	    --broker-opts=-Sbroker.quorum=2 \
+	    --broker-opts=-Stbon.topo=kary:0 \
+	    --wrap sleep inf) &&
+	get_job_uri $id >test2.uri
+'
+test_expect_success 'job has size 3' '
+	size=$(flux proxy $(cat test2.uri) flux getattr size) &&
+	test $size -eq 3
+'
+test_expect_success 'overlay status shows extra node offline' '
+	flux proxy $(cat test2.uri) \
+	    flux overlay status --no-pretty >ov2.out &&
+	grep "2 extra0: offline" ov2.out
+'
+test_expect_success 'run a 2 node job in the initial instance' '
+	wait_for_service $(cat test2.uri) job-ingest &&
+	run_timeout 30 flux proxy $(cat test2.uri) \
+	    flux run --label-io -N2 flux pmi barrier
+'
+test_expect_success 'submit a job that starts 1 extra broker' '
+	id=$(flux submit -N1 flux broker \
+	    --config-path=fake2.toml \
+	    -Stbon.topo=kary:0 \
+	    -Sbroker.boot-server=$(cat test2.uri)) &&
+	flux job wait-event -p guest.exec.eventlog $id shell.start
+'
+test_expect_success 'wait for overlay status to be full' '
+	flux proxy $(cat test2.uri) \
+	    flux overlay status --wait full --timeout 10s
+'
+test_expect_success 'run a 3 node job in the expanded instance' '
+	run_timeout 30 flux proxy $(cat test2.uri) \
+	    flux run --label-io -N3 flux pmi barrier
+'
+test_expect_success 'clean up' '
+	flux cancel --all
+'
+
+#
+# Start 3 node batch job with four extra slots (kary:2).
+# Submit 4 node broker job that fills the slots.
+# Run a parallel job across all seven nodes in the batch job.
+# This test is constrained so that all flubbed nodes are leaf nodes,
+# but they are grafted on different nodes depending on topology.
+# 0
+# 1 2
+# 3 4 5 6 <-- flubbed
+
+test_expect_success 'create config with fake resources' '
+	cat >fake3.toml <<-EOT
+	[resource]
+	noverify = true
+	[[resource.config]]
+	hosts = "a,b,c,d,e,f,g"
+	cores = "0-3"
+	EOT
+'
+test_expect_success 'start a 3 node job with 4 extra ranks' '
+	id=$(flux batch -N3 \
+	    --broker-opts=--config-path=fake3.toml \
+	    --broker-opts=-Ssize=7 \
+	    --broker-opts=-Sbroker.quorum=3 \
+	    --broker-opts=-Stbon.topo=kary:2 \
+	    --wrap sleep inf) &&
+	get_job_uri $id >test3.uri
+'
+test_expect_success 'job has size 7' '
+	size=$(flux proxy $(cat test3.uri) flux getattr size) &&
+	test $size -eq 7
+'
+test_expect_success 'run a 3 node job in the initial instance' '
+	wait_for_service $(cat test3.uri) job-ingest &&
+	run_timeout 30 flux proxy $(cat test3.uri) \
+	    flux run --label-io -N3 flux pmi barrier
+'
+test_expect_success 'submit a job that starts 4 extra brokers' '
+	id=$(flux submit -N4 flux broker \
+	    --config-path=fake3.toml \
+	    -Stbon.topo=kary:2 \
+	    -Sbroker.boot-server=$(cat test3.uri)) &&
+	flux job wait-event -p guest.exec.eventlog $id shell.start
+'
+test_expect_success 'wait for overlay status to be full' '
+	flux proxy $(cat test3.uri) \
+	    flux overlay status --wait full --timeout 10s
+'
+test_expect_success 'run a 7 node job in the expanded instance' '
+	run_timeout 30 flux proxy $(cat test3.uri) \
+	    flux run --label-io -N7 flux pmi barrier
+'
+test_expect_success 'clean up' '
+	flux cancel --all
+'
+
+#
+# Start 1 node batch job with 6 extra slots (kary:2).
+# Submit 2 node broker job that fills the first level slots.
+# Run a 3 node parallel job.
+# Submit 4 node broker job that fills the second level slots.
+# Run a 7 node parallel job.
+# 0
+# 1 2 <-- flubbed (phase 1)
+# 3 4 5 6 <-- flubbed (phase 2)
+# This test is constrained so the first level wires up before
+# the second level is started.
+
+test_expect_success 'start a 1 node job with 6 extra ranks' '
+	id=$(flux batch -N1 \
+	    --broker-opts=--config-path=fake3.toml \
+	    --broker-opts=-Ssize=7 \
+	    --broker-opts=-Sbroker.quorum=1 \
+	    --broker-opts=-Stbon.topo=kary:2 \
+	    --wrap sleep inf) &&
+	get_job_uri $id >test4.uri
+'
+test_expect_success 'run a 1 node job in the initial instance' '
+	wait_for_service $(cat test4.uri) job-ingest &&
+	run_timeout 30 flux proxy $(cat test4.uri) \
+	    flux run --label-io -N1 flux pmi barrier
+'
+test_expect_success 'job has size 7' '
+	size=$(flux proxy $(cat test4.uri) flux getattr size) &&
+	test $size -eq 7
+'
+test_expect_success 'submit a job that starts 2 extra brokers' '
+	id=$(flux submit -N2 flux broker \
+	    --config-path=fake3.toml \
+	    -Stbon.topo=kary:2 \
+	    -Sbroker.boot-server=$(cat test4.uri)) &&
+	flux job wait-event -p guest.exec.eventlog $id shell.start
+'
+test_expect_success 'run a 3 node job in the expanded instance' '
+	run_timeout 30 flux proxy $(cat test4.uri) \
+	    flux run --label-io -N3 flux pmi barrier
+'
+test_expect_success 'submit a job that starts 4 extra brokers' '
+	id=$(flux submit -N4 flux broker \
+	    --config-path=fake3.toml \
+	    -Stbon.topo=kary:2 \
+	    -Sbroker.boot-server=$(cat test4.uri)) &&
+	flux job wait-event -p guest.exec.eventlog $id shell.start
+'
+test_expect_success 'wait for overlay status to be full' '
+	flux proxy $(cat test4.uri) \
+	    flux overlay status --wait full --timeout 10s
+'
+test_expect_success 'run a 7 node job in the expanded instance' '
+	run_timeout 30 flux proxy $(cat test4.uri) \
+	    flux run --label-io -N7 flux pmi barrier
+'
+test_expect_success 'clean up' '
+	flux cancel --all
+'
+
+#
+# Start 1 node batch job with 6 extra slots (kary:2).
+# Submit 6 node broker job that fills all the slots.
+# Run a 7 node parallel job.
+#
+test_expect_success 'start a 1 node job with 6 extra ranks' '
+	id=$(flux batch -N1 \
+	    --broker-opts=--config-path=fake3.toml \
+	    --broker-opts=-Ssize=7 \
+	    --broker-opts=-Sbroker.quorum=1 \
+	    --broker-opts=-Stbon.topo=kary:2 \
+	    --wrap sleep inf) &&
+	get_job_uri $id >test5.uri
+'
+test_expect_success 'run a 1 node job in the initial instance' '
+	wait_for_service $(cat test5.uri) job-ingest &&
+	run_timeout 30 flux proxy $(cat test5.uri) \
+	    flux run --label-io -N1 flux pmi barrier
+'
+test_expect_success 'job has size 7' '
+	size=$(flux proxy $(cat test5.uri) flux getattr size) &&
+	test $size -eq 7
+'
+test_expect_success 'submit a job that starts 6 extra brokers' '
+	id=$(flux submit -N6 flux broker \
+	    --config-path=fake3.toml \
+	    -Stbon.topo=kary:2 \
+	    -Sbroker.boot-server=$(cat test5.uri)) &&
+	flux job wait-event -p guest.exec.eventlog $id shell.start
+'
+test_expect_success 'wait for overlay status to be full' '
+	flux proxy $(cat test5.uri) \
+	    flux overlay status --wait full --timeout 10s
+'
+test_expect_success 'run a 7 node job in the expanded instance' '
+	run_timeout 30 flux proxy $(cat test5.uri) \
+	    flux run --label-io -N7 flux pmi barrier
+'
+test_expect_success 'clean up' '
+	flux cancel --all
+'
+
+test_done