Skip to content

Commit

Permalink
Merge pull request flux-framework#1183 from grondo/issue#1182
Browse files Browse the repository at this point in the history
resource: ensure all resources start in DOWN state when some ranks are excluded by configuration
  • Loading branch information
mergify[bot] authored Apr 20, 2024
2 parents 17f0ed1 + a5607c3 commit 561ec6f
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 8 deletions.
12 changes: 4 additions & 8 deletions resource/modules/resource_match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1144,14 +1144,10 @@ static int grow_resource_db (std::shared_ptr<resource_ctx_t> &ctx,
// decode_all: fill `ranks` from the per-rank metadata held in ctx->db.
//
// NOTE(review): this listing is a rendered diff whose +/- markers were lost,
// so the old and the new loop appear back to back and the braces do not
// balance as shown. The file header reports 4 additions and 8 deletions,
// which is consistent with the 8 lines from `int64_t size` through the first
// inner `}` being the removed version and the following 4 lines being the
// replacement -- confirm against the repository before relying on this.
static int decode_all (std::shared_ptr<resource_ctx_t> &ctx,
std::set<int64_t> &ranks)
{
// Old version: inserts 0..size-1, where size is the number of by_rank
// entries, and fails with errno = EEXIST if `ranks` already holds one of them.
int64_t size = ctx->db->metadata.by_rank.size();

for (int64_t rank = 0; rank < size; ++rank) {
auto ret = ranks.insert (rank);
if (!ret.second) {
errno = EEXIST;
return -1;
}
// New version: start from an empty set and insert every non-negative key
// that is actually present in by_rank; negative keys are skipped.
ranks.clear ();
for (auto const& kv: ctx->db->metadata.by_rank) {
if (kv.first >= 0)
ranks.insert (kv.first);
}
return 0;
}
Expand Down
69 changes: 69 additions & 0 deletions t/issues/t1182-exclude-with-down-ranks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash -e
#
# Ensure Fluxion marks all ranks down even if some ranks are excluded
#

# Print a message to stderr, prefixed with the issue tag.
# All arguments are joined with single spaces and passed as data to a fixed
# printf format, so '%' or '\' characters in a message are printed verbatim
# and the trailing newline is always emitted.
log() { printf 'issue#1182: %s\n' "$*" >&2; }

# Need a few ranks for this test, so start a new instance of size=4.
# ISSUE_1182_ACTIVE is exported before the re-exec so that the relaunched
# copy of this script skips this branch instead of relaunching again.
if test "$ISSUE_1182_ACTIVE" != "t"; then
    export ISSUE_1182_ACTIVE=t
    log "Re-launching test script under flux-start"
    # Quote $0 so a script path containing whitespace or glob characters
    # reaches flux-start as a single argument.
    exec flux start -s 4 "$0"
fi

# Write a helper script that compares the down resources reported by the
# "resource.sched-status" RPC with those reported by "sched.resource-status".
# The heredoc delimiter is quoted so the shell expands nothing inside the
# Python source.
cat <<'EOF' >rcheck.py
import sys
import flux
from flux.resource.list import ResourceListRPC

h = flux.Flux()

# Both RPCs are sent with nodeid=0.
rpc1 = ResourceListRPC(h, "resource.sched-status", nodeid=0)
rpc2 = ResourceListRPC(h, "sched.resource-status", nodeid=0)
rset = rpc1.get()
fluxion = rpc2.get()


def symmetric_diff(a, b):
    """Return what is in exactly one of a and b."""
    return (a|b) - (a&b)


# Anything that only one of the two responses lists as down is a mismatch.
diff = symmetric_diff(rset.down, fluxion.down)
if diff.ranks:
    print("difference detected between fluxion and core down ranks:")
    print(f"hosts: {diff.nodelist}")
    print(f"ranks: {diff.ranks}")
    sys.exit(1)
sys.exit(0)
EOF

# Remove the currently loaded scheduler and resource modules so the resource
# module can be loaded again below, after the configuration change.
log "Unloading modules..."
flux module remove sched-simple
flux module remove resource

# Exclude ranks 0 and 2
flux config load <<EOF
[resource]
exclude = "0,2"
EOF

# NOTE(review): monitor-force-up presumably makes the resource module treat
# ranks as online without waiting for them -- confirm against flux-core docs.
flux module load resource monitor-force-up

# Drain rank 3. Scheduler should only see rank 1 as up
log "draining rank 3"
flux resource drain 3

# Record the resource status in the test output before Fluxion is loaded.
flux resource status

flux module load sched-fluxion-resource
flux module load sched-fluxion-qmanager

log "comparing fluxion down ranks with flux-core resource module:"
# Print both listings for the test log, then run the actual comparison.
# NOTE(review): FLUX_RESOURCE_LIST_RPC presumably redirects `flux resource
# list` to the named RPC -- confirm.
# The script runs under `bash -e`, so a non-zero exit from rcheck.py fails
# the test at this point.
flux resource list
FLUX_RESOURCE_LIST_RPC=sched.resource-status flux resource list
flux python ./rcheck.py

# Swap the Fluxion modules back out for sched-simple, which was removed at
# the start of this sequence.
log "reloading sched-simple..."
flux module remove sched-fluxion-qmanager
flux module remove sched-fluxion-resource
flux module load sched-simple

0 comments on commit 561ec6f

Please sign in to comment.