From dd112a1393f50a0073233647b65e187aba249c73 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Sun, 27 Mar 2022 08:52:16 -0700 Subject: [PATCH 01/17] librlist: add structure for allocation parameters Problem: Passing allocation parameters like nnodes, nslots, and slot size as separate parameters to rlist allocation functions makes extensibility difficult. Add a new rlist_alloc_info structure which contains allocation parameters for rlist allocation functions. Refactor code internally to take advantage of the new structure. --- src/common/librlist/rlist.c | 86 ++++++++++++++++++++++++++----------- src/common/librlist/rlist.h | 8 ++++ 2 files changed, 69 insertions(+), 25 deletions(-) diff --git a/src/common/librlist/rlist.c b/src/common/librlist/rlist.c index 56c265a18c43..020688c9c7f3 100644 --- a/src/common/librlist/rlist.c +++ b/src/common/librlist/rlist.c @@ -1809,18 +1809,19 @@ static zlistx_t *rlist_get_nnodes (struct rlist *rl, int nnodes) * Works by getting the first N least utilized nodes and spreading * the nslots evenly across the result. */ -static struct rlist *rlist_alloc_nnodes (struct rlist *rl, int nnodes, - int cores_per_slot, int slots) +static struct rlist *rlist_alloc_nnodes (struct rlist *rl, + struct rlist_alloc_info *ai) { struct rlist *result = NULL; struct rnode *n = NULL; zlistx_t *cl = NULL; + int slots = ai->nslots; - if (rlist_nnodes (rl) < nnodes) { + if (rlist_nnodes (rl) < ai->nnodes) { errno = ENOSPC; return NULL; } - if (slots < nnodes) { + if (ai->nslots < ai->nnodes) { errno = EINVAL; return NULL; } @@ -1834,7 +1835,7 @@ static struct rlist *rlist_alloc_nnodes (struct rlist *rl, int nnodes, /* 2. get a list of the first up n nodes */ - if (!(cl = rlist_get_nnodes (rl, nnodes))) + if (!(cl = rlist_get_nnodes (rl, ai->nnodes))) goto unwind; /* We will sort candidate list by used cores on each iteration to @@ -1855,7 +1856,7 @@ static struct rlist *rlist_alloc_nnodes (struct rlist *rl, int nnodes, * least loaded node from the least loaded nodelist, we know * we don't have enough resources to satisfy request.
*/ - if (rlist_rnode_alloc (rl, n, cores_per_slot, &ids) < 0) + if (rlist_rnode_alloc (rl, n, ai->slot_size, &ids) < 0) goto unwind; rc = rlist_append_cores (result, n->hostname, n->rank, ids); idset_destroy (ids); @@ -1883,12 +1884,13 @@ static struct rlist *rlist_alloc_nnodes (struct rlist *rl, int nnodes, return NULL; } -static struct rlist *rlist_try_alloc (struct rlist *rl, const char *mode, - int nnodes, int slots, int cores_per_slot) +static struct rlist *rlist_try_alloc (struct rlist *rl, + struct rlist_alloc_info *ai) { struct rlist *result = NULL; + const char *mode = ai->mode; - if (!rl) { + if (!rl || !ai) { errno = EINVAL; return NULL; } @@ -1896,14 +1898,14 @@ static struct rlist *rlist_try_alloc (struct rlist *rl, const char *mode, /* Reset default sort to order nodes by "rank" */ zlistx_set_comparator (rl->nodes, by_rank); - if (nnodes > 0) - result = rlist_alloc_nnodes (rl, nnodes, cores_per_slot, slots); + if (ai->nnodes > 0) + result = rlist_alloc_nnodes (rl, ai); else if (mode == NULL || strcmp (mode, "worst-fit") == 0) - result = rlist_alloc_worst_fit (rl, cores_per_slot, slots); + result = rlist_alloc_worst_fit (rl, ai->slot_size, ai->nslots); else if (mode && strcmp (mode, "best-fit") == 0) - result = rlist_alloc_best_fit (rl, cores_per_slot, slots); + result = rlist_alloc_best_fit (rl, ai->slot_size, ai->nslots); else if (mode && strcmp (mode, "first-fit") == 0) - result = rlist_alloc_first_fit (rl, cores_per_slot, slots); + result = rlist_alloc_first_fit (rl, ai->slot_size, ai->nslots); else errno = EINVAL; return result; @@ -1916,42 +1918,76 @@ static bool rlist_alloc_feasible (const struct rlist *rl, const char *mode, { bool rc = false; struct rlist *result = NULL; + struct rlist_alloc_info ai = { + .nnodes = nnodes, + .slot_size = slotsz, + .nslots = slots, + .mode = mode, + }; + int saved_errno = errno; struct rlist *all = rlist_copy_empty (rl); - if (all && (result = rlist_try_alloc (all, mode, nnodes, slots, slotsz))) + if (all && (result = rlist_try_alloc (all, &ai))) rc = true; rlist_destroy (all); rlist_destroy (result); + errno = saved_errno; return rc; } -struct rlist *rlist_alloc (struct rlist *rl, const char *mode, - int nnodes, int slots, int slotsz) +static int alloc_info_check (struct rlist *rl, + struct rlist_alloc_info *ai, + flux_error_t *errp) { + int slots = ai->nslots; + int nnodes = ai->nnodes; + int slotsz = ai->slot_size; int total = slots * slotsz; - struct rlist *result = NULL; if (slots <= 0 || slotsz <= 0 || nnodes < 0) { errno = EINVAL; - return NULL; + return -1; } if (total > rl->total) { + errprintf (errp, "unsatisfiable request"); errno = EOVERFLOW; - return NULL; + return -1; } if (total > rl->avail) { - if (rlist_alloc_feasible (rl, mode, nnodes, slots, slotsz)) - errno = ENOSPC; - else + if (!rlist_alloc_feasible (rl, + ai->mode, + ai->nnodes, + ai->nslots, + ai->slot_size)) { + errprintf (errp, "unsatisfiable request"); errno = EOVERFLOW; - return NULL; + } + else + errno = ENOSPC; + return -1; } + return 0; +} + +struct rlist *rlist_alloc (struct rlist *rl, const char *mode, + int nnodes, int slots, int slotsz) +{ + struct rlist *result = NULL; + struct rlist_alloc_info ai = { + .nnodes = nnodes, + .nslots = slots, + .slot_size = slotsz, + .mode = mode + }; + + if (alloc_info_check (rl, &ai, NULL) < 0) + return NULL; /* * Try allocation. If it fails with not enough resources (ENOSPC), * then try again on an empty copy of rlist to see the request could * *ever* be satisfied. Adjust errno to EOVERFLOW if not. 
*/ - result = rlist_try_alloc (rl, mode, nnodes, slots, slotsz); + result = rlist_try_alloc (rl, &ai); if (!result && (errno == ENOSPC)) { if (rlist_alloc_feasible (rl, mode, nnodes, slots, slotsz)) errno = ENOSPC; diff --git a/src/common/librlist/rlist.h b/src/common/librlist/rlist.h index f133290833a3..d6d628933bac 100644 --- a/src/common/librlist/rlist.h +++ b/src/common/librlist/rlist.h @@ -43,6 +43,14 @@ struct rlist { json_t *scheduling; }; +struct rlist_alloc_info { + int nnodes; + int slot_size; + int nslots; + const char *mode; + json_t *constraints; +}; + /* Create an empty rlist object */ struct rlist *rlist_create (void); From ec1e680871fe9d30fc7d13bd617febde72c12d4c Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Mar 2022 08:08:22 -0700 Subject: [PATCH 02/17] librlist: add rlist_alloc_ex() Problem: librlist doesn't expose a public allocation function that takes the more convenient struct rlist_alloc_info structure. Add rlist_alloc_ex() which offers an alternative to rlist_alloc() (so that all callers do not need to be immediately updated.). Since job constraints are (optionally) provided in the alloc_info argument to rlist_alloc_ex(), handle constraints directly in this function, instead of requiring the caller to filter the rlist by constraint before calling rlist_alloc(). --- src/common/librlist/rlist.c | 82 +++++++++++++++++++++++++++++++++++-- src/common/librlist/rlist.h | 5 +++ 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/src/common/librlist/rlist.c b/src/common/librlist/rlist.c index 020688c9c7f3..061ab33d2a20 100644 --- a/src/common/librlist/rlist.c +++ b/src/common/librlist/rlist.c @@ -1810,7 +1810,7 @@ static zlistx_t *rlist_get_nnodes (struct rlist *rl, int nnodes) * the nslots evenly across the result. 
*/ static struct rlist *rlist_alloc_nnodes (struct rlist *rl, - struct rlist_alloc_info *ai) + const struct rlist_alloc_info *ai) { struct rlist *result = NULL; struct rnode *n = NULL; @@ -1885,7 +1885,7 @@ static struct rlist *rlist_alloc_nnodes (struct rlist *rl, } static struct rlist *rlist_try_alloc (struct rlist *rl, - struct rlist_alloc_info *ai) + const struct rlist_alloc_info *ai) { struct rlist *result = NULL; const char *mode = ai->mode; @@ -1935,7 +1935,7 @@ static bool rlist_alloc_feasible (const struct rlist *rl, const char *mode, } static int alloc_info_check (struct rlist *rl, - struct rlist_alloc_info *ai, + const struct rlist_alloc_info *ai, flux_error_t *errp) { int slots = ai->nslots; @@ -1968,6 +1968,82 @@ static int alloc_info_check (struct rlist *rl, return 0; } +static struct rlist * +rlist_alloc_constrained (struct rlist *rl, + const struct rlist_alloc_info *ai, + flux_error_t *errp) +{ + struct rlist *result; + struct rlist *cpy; + int saved_errno; + + if (!(cpy = rlist_copy_constraint (rl, ai->constraints, errp))) + return NULL; + + if (rlist_count (cpy, "core") == 0) { + errprintf (errp, "no resources satisfy provided constraints"); + errno = EOVERFLOW; + } + + result = rlist_try_alloc (cpy, ai); + saved_errno = errno; + + if (!result && errno == ENOSPC) { + if (!rlist_alloc_feasible (cpy, + ai->mode, + ai->nnodes, + ai->nslots, + ai->slot_size)) { + saved_errno = EOVERFLOW; + errprintf (errp, "unsatisfiable constrained request"); + } + } + rlist_destroy (cpy); + + if (result && rlist_set_allocated (rl, result) < 0) { + errprintf (errp, "rlist_set_allocated: %s", strerror (errno)); + rlist_destroy (result); + result = NULL; + } + + errno = saved_errno; + return result; +} + +struct rlist *rlist_alloc_ex (struct rlist *rl, + const struct rlist_alloc_info *ai, + flux_error_t *errp) +{ + struct rlist *result = NULL; + + if (!rl || !ai) { + errno = EINVAL; + return NULL; + } + + if (alloc_info_check (rl, ai, errp) < 0) + return NULL; + + if (ai->constraints) + result = rlist_alloc_constrained (rl, ai, errp); + else { + result = rlist_try_alloc (rl, ai); + + if (!result && (errno == ENOSPC)) { + if (!rlist_alloc_feasible (rl, + ai->mode, + ai->nnodes, + ai->nslots, + ai->slot_size)) { + errprintf (errp, "unsatisfiable request"); + errno = EOVERFLOW; + } + } + } + + return result; +} + struct rlist *rlist_alloc (struct rlist *rl, const char *mode, int nnodes, int slots, int slotsz) { diff --git a/src/common/librlist/rlist.h b/src/common/librlist/rlist.h index d6d628933bac..fcacadbd843e 100644 --- a/src/common/librlist/rlist.h +++ b/src/common/librlist/rlist.h @@ -252,6 +252,11 @@ int rlist_verify (flux_error_t *error, struct rlist * rlist_alloc (struct rlist *rl, const char *mode, int nnodes, int slot_size, int nslots); +/* As above, but arguments are passed in an rlist_alloc_info object + */ +struct rlist * rlist_alloc_ex (struct rlist *rl, + const struct rlist_alloc_info *ai, + flux_error_t *errp); /* Mark rlist "alloc" as allocated in rlist "rl". */ From f1c553f03a2bfa301c43d141f89f29a9d8cd21ce Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Mar 2022 08:56:55 -0700 Subject: [PATCH 03/17] librlist/test: convert test/rlist.c to rlist_alloc_ex() Problem: rlist_alloc_ex() is a preferred interface over rlist_alloc(), but there are still a lot of users of the older rlist_alloc() interface. Add rl_alloc() wrapper for rlist_alloc_ex() to test/rlist.c and switch all callers. 
This moves the unit test away from the older rlist_alloc() and begins to exercise the preferred rlist_alloc_ex() interface. --- src/common/librlist/test/rlist.c | 43 +++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/src/common/librlist/test/rlist.c b/src/common/librlist/test/rlist.c index 8101887c01b6..557061d33510 100644 --- a/src/common/librlist/test/rlist.c +++ b/src/common/librlist/test/rlist.c @@ -256,13 +256,28 @@ char *R_create (const char *ranklist, return NULL; } +static struct rlist * rl_alloc (struct rlist *rl, + const char *mode, + int nnodes, + int nslots, + int slot_size) +{ + struct rlist_alloc_info ai = { + .mode = mode, + .nnodes = nnodes, + .nslots = nslots, + .slot_size = slot_size + }; + return rlist_alloc_ex (rl, &ai, NULL); +} + static struct rlist * rlist_testalloc (struct rlist *rl, struct rlist_test_entry *e) { - return rlist_alloc (rl, e->mode, - e->alloc.nnodes, - e->alloc.nslots, - e->alloc.slot_size); + return rl_alloc (rl, e->mode, + e->alloc.nnodes, + e->alloc.nslots, + e->alloc.slot_size); } static char * rlist_tostring (struct rlist *rl, bool allocated) @@ -387,7 +402,7 @@ static void test_simple (void) "rlist_append_rank_cores 1, 0-3"); ok (rl->total == 8 && rl->avail == 8, "rlist: avail and total == 4"); - ok ((alloc = rlist_alloc (rl, NULL, 0, 8, 1)) != NULL, + ok ((alloc = rl_alloc (rl, NULL, 0, 8, 1)) != NULL, "rlist: alloc all cores works"); ok (alloc->total == 8 && alloc->avail == 8, "rlist: alloc: got %d/%d (expected 8/8)", @@ -486,7 +501,7 @@ static void test_issue2202 (void) "issue2202: rlist_dumps works"); free (result); - a = rlist_alloc (rl, "best-fit", 1, 1, 1); + a = rl_alloc (rl, "best-fit", 1, 1, 1); ok (a != NULL, "issue2202: rlist_alloc worked"); if (a) { @@ -523,7 +538,7 @@ static void test_issue2202 (void) "issue2202b: rlist_dumps works"); free (result); - a = rlist_alloc (rl, "best-fit", 1, 1, 1); + a = rl_alloc (rl, "best-fit", 1, 1, 1); ok (a != NULL, "issue2202b: rlist_alloc worked"); if (a) { @@ -587,7 +602,7 @@ static void test_issue2473 (void) free (result); /* problem: allocated 3 cores on one node */ - a = rlist_alloc (rl, "worst-fit", 3, 3, 1); + a = rl_alloc (rl, "worst-fit", 3, 3, 1); ok (a != NULL, "issue2473: rlist_alloc nnodes=3 slots=3 slotsz=1 worked"); if (!a) @@ -604,7 +619,7 @@ static void test_issue2473 (void) rlist_destroy (a); /* problem: unsatisfiable */ - a = rlist_alloc (rl, "worst-fit", 3, 8, 1); + a = rl_alloc (rl, "worst-fit", 3, 8, 1); ok (a != NULL, "issue2473: rlist_alloc nnodes=3 slots=8 slotsz=1 worked"); if (a) { @@ -618,7 +633,7 @@ static void test_issue2473 (void) * - ask for 2 cores spread across 2 nodes * - we should get cores on rank[0-1] not rank[1-2] */ - a = rlist_alloc (rl, "worst-fit", 1, 1, 1); + a = rl_alloc (rl, "worst-fit", 1, 1, 1); ok (a != NULL, "issue2473: rlist_alloc nnodes=1 slots=1 slotsz=1 worked"); if (!a) @@ -630,7 +645,7 @@ static void test_issue2473 (void) "issue2473: one core was allocated from rank0"); free (result); - a2 = rlist_alloc (rl, "worst-fit", 2, 2, 1); + a2 = rl_alloc (rl, "worst-fit", 2, 2, 1); ok (a2 != NULL, "issue2473: rlist_alloc nnodes=2 slots=2 slotsz=1 worked"); result = rlist_dumps (a2); @@ -709,7 +724,7 @@ static void test_updown () ok (rl->avail == 16, "rl avail == 16"); - rl2 = rlist_alloc (rl, NULL, 0, 4, 1); + rl2 = rl_alloc (rl, NULL, 0, 4, 1); ok (rl2 != NULL, "rlist_alloc() works when all nodes up"); @@ -732,12 +747,12 @@ static void test_updown () ok (rlist_mark_up (rl, "0-2") == 0, "rlist_mark_up all 
but rank 3 up"); - ok (rlist_alloc (rl, NULL, 4, 4, 1) == NULL && errno == ENOSPC, + ok (rl_alloc (rl, NULL, 4, 4, 1) == NULL && errno == ENOSPC, "allocation with 4 nodes fails with ENOSPC"); ok (rlist_mark_up (rl, "3") == 0, "rlist_mark_up 3"); - rl2 = rlist_alloc (rl, NULL, 4, 4, 1); + rl2 = rl_alloc (rl, NULL, 4, 4, 1); ok (rl2 != NULL, "rlist_alloc() for 4 nodes now succeeds"); From 41ddc1af6a692e851bf85af3e1110a3ca4ef51ca Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Mar 2022 09:55:43 -0700 Subject: [PATCH 04/17] librlist: ensure errno set on error from rlist_copy_constraint() Problem: If rnode_match_validate() fails, rlist_copy_constraint() returns an error without errno set. This is inconvenient because functions like flux_respond_error() etc. expect a nonzero errno. Set errno to EINVAL if rnode_match_validate() fails in rlist_copy_constraint(). --- src/bindings/python/flux/resource/Rlist.py | 7 ++++--- src/common/librlist/rlist.c | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/bindings/python/flux/resource/Rlist.py b/src/bindings/python/flux/resource/Rlist.py index 4d40158fcdac..92b412bb7a59 100644 --- a/src/bindings/python/flux/resource/Rlist.py +++ b/src/bindings/python/flux/resource/Rlist.py @@ -124,9 +124,10 @@ def copy_constraint(self, constraint): error = ffi.new("flux_error_t *") if not isinstance(constraint, str): constraint = json.dumps(constraint) - handle = self.pimpl.copy_constraint_string(constraint, error) - if not handle: + try: + handle = self.pimpl.copy_constraint_string(constraint, error) + except OSError as exc: raise ValueError( "copy_constraint: " + ffi.string(error.text).decode("utf-8") - ) + ) from exc return Rlist(handle=handle) diff --git a/src/common/librlist/rlist.c b/src/common/librlist/rlist.c index 061ab33d2a20..5813c3624f7e 100644 --- a/src/common/librlist/rlist.c +++ b/src/common/librlist/rlist.c @@ -309,8 +309,10 @@ struct rlist *rlist_copy_constraint (const struct rlist *orig, json_t *constraint, flux_error_t *errp) { - if (rnode_match_validate (constraint, errp) < 0) + if (rnode_match_validate (constraint, errp) < 0) { + errno = EINVAL; return NULL; + } return rlist_copy_internal (orig, (rnode_copy_f) rnode_copy_match, (void *) constraint); From 6277bf765ab36c3145ac66a96baf32009966af72 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Grondona" Date: Mon, 28 Mar 2022 09:57:53 -0700 Subject: [PATCH 05/17] sched-simple: update to rlist_alloc_ex() --- src/modules/sched-simple/sched.c | 114 +++++++++---------------------- t/t2110-job-ingest-validator.t | 9 ++- 2 files changed, 38 insertions(+), 85 deletions(-) diff --git a/src/modules/sched-simple/sched.c b/src/modules/sched-simple/sched.c index 1c9703120b13..711c82e1e175 100644 --- a/src/modules/sched-simple/sched.c +++ b/src/modules/sched-simple/sched.c @@ -190,35 +190,14 @@ static struct rlist *sched_alloc (struct simple_sched *ss, struct jobreq *job, flux_error_t *errp) { - struct rlist *alloc; - - if (job->constraints) { - struct rlist *rl; - - if (!(rl = rlist_copy_constraint (ss->rlist, - job->constraints, - errp))) - return NULL; - - alloc = rlist_alloc (rl, - ss->alloc_mode, - job->jj.nnodes, - job->jj.nslots, - job->jj.slot_size); - rlist_destroy (rl); - if (alloc && rlist_set_allocated (ss->rlist, alloc) < 0) { - errprintf (errp, "rlist_set_allocated: %s", strerror (errno)); - rlist_destroy (alloc); - alloc = NULL; - } - } - else - alloc = rlist_alloc (ss->rlist, - ss->alloc_mode, - job->jj.nnodes, - job->jj.nslots, - job->jj.slot_size); - return alloc; + struct rlist_alloc_info ai = { + .mode = ss->alloc_mode, + .nnodes = job->jj.nnodes, + .nslots = job->jj.nslots, + .slot_size = job->jj.slot_size, + .constraints = job->constraints + }; + return rlist_alloc_ex (ss->rlist, &ai, errp); } static int try_alloc (flux_t *h, struct simple_sched *ss) @@ -616,8 +595,6 @@ static void feasibility_cb (flux_t *h, const char *errmsg = NULL; flux_error_t error; - struct rlist *rl = ss->rlist; - if (flux_request_unpack (msg, NULL, "{s:o}", "jobspec", &jobspec) < 0) goto err; @@ -627,71 +604,44 @@ static void feasibility_cb (flux_t *h, "system", "constraints", &constraints) < 0) goto err; - if (constraints) { - /* If the job has constraints, first copy the set of resources - * which match the constraint. rlist_copy_constraint() will - * also validate the constraints object on error. - */ - if (!(rl = rlist_copy_constraint (ss->rlist, - constraints, - &error))) { - errno = EINVAL; - errmsg = error.text; - goto err; - } - if (rlist_count (rl, "core") == 0) { - errno = ENOSPC; - errmsg = "no resources satisfy provided constraints"; - goto err; - } - } + if (libjj_get_counts_json (jobspec, &jj) < 0) { errmsg = jj.error; goto err; } - if (!(alloc = rlist_alloc (rl, - ss->alloc_mode, - jj.nnodes, - jj.nslots, - jj.slot_size))) { - if (errno == EOVERFLOW) { - errmsg = "request is not satisfiable"; - goto err; - } - else if (errno != ENOSPC) { - errmsg = "cannot allocate this jobspec"; + + struct rlist_alloc_info ai = { + .mode = ss->alloc_mode, + .nnodes = jj.nnodes, + .nslots = jj.nslots, + .slot_size = jj.slot_size, + .constraints = constraints + }; + if (!(alloc = rlist_alloc_ex (ss->rlist, &ai, &error))) { + if (errno != ENOSPC) { + errmsg = error.text; goto err; } - /* Fall-through: job is satisfiable */ - } - else if (rl == ss->rlist) { - /* If we didn't try allocation from a copy, then we have to free - * the test allocation now. + /* Fall-through: if ENOSPC then job is satisfiable */ + } + if (alloc && rlist_free (ss->rlist, alloc) < 0) { + /* If rlist_free() fails we're in trouble because + * ss->rlist will have an invalid allocation. This should + * be rare if not impossible, so just exit the reactor. + * + * The sched module can then be reloaded without loss of jobs. 
*/ - int rc = rlist_free (ss->rlist, alloc); - if (rc < 0) { - /* If rlist_free() fails we're in trouble because - * ss->rlist will have an invalid allocation. This should - * be rare if not impossible, so just exit the reactor. - * - * The sched module can then be reloaded without loss of jobs. - */ - flux_log_error (h, "feasibility_cb: failed to free fake alloc"); - flux_reactor_stop_error (flux_get_reactor (h)); - errmsg = "Internal scheduler error"; - goto err; - } + flux_log_error (h, "feasibility_cb: failed to free fake alloc"); + flux_reactor_stop_error (flux_get_reactor (h)); + errmsg = "Internal scheduler error"; + goto err; } rlist_destroy (alloc); - if (rl != ss->rlist) - rlist_destroy (rl); if (flux_respond_pack (h, msg, "{s:i}", "errnum", 0) < 0) flux_log_error (h, "feasibility_cb: flux_respond_pack"); return; err: rlist_destroy (alloc); - if (rl != ss->rlist) - rlist_destroy (rl); if (flux_respond_error (h, msg, errno, errmsg) < 0) flux_log_error (h, "feasibility_cb: flux_respond_error"); } diff --git a/t/t2110-job-ingest-validator.t b/t/t2110-job-ingest-validator.t index d94f815e7d7a..91680e249f17 100755 --- a/t/t2110-job-ingest-validator.t +++ b/t/t2110-job-ingest-validator.t @@ -128,11 +128,14 @@ test_expect_success 'job-ingest: feasibility check succceeds with ENOSYS' ' ' test_expect_success 'job-ingest: infeasible jobs are now rejected' ' test_must_fail flux mini submit -g 1 hostname 2>infeasible1.err && + test_debug "cat infeasible1.err" && grep -i "unsupported resource type" infeasible1.err && test_must_fail flux mini submit -n 10000 hostname 2>infeasible2.err && - grep "request is not satisfiable" infeasible2.err && + test_debug "cat infeasible2.err" && + grep "unsatisfiable request" infeasible2.err && test_must_fail flux mini submit -N 12 -n12 hostname 2>infeasible3.err && - grep "request is not satisfiable" infeasible3.err + test_debug "cat infeasible3.err" && + grep "unsatisfiable request" infeasible3.err ' test_expect_success 'job-ingest: feasibility validator works with jobs running' ' ncores=$(flux resource list -s up -no {ncores}) && @@ -142,7 +145,7 @@ test_expect_success 'job-ingest: feasibility validator works with jobs running' flux queue stop && flux mini submit -n 2 hostname && test_must_fail flux mini submit -N 12 -n12 hostname 2>infeasible4.err && - grep "request is not satisfiable" infeasible4.err && + grep "unsatisfiable request" infeasible4.err && flux job cancel ${jobid} && flux job wait-event ${jobid} clean ' From 98ea16ebb4d562659102aa16d3d0d8510ab79212 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Mar 2022 11:07:13 -0700 Subject: [PATCH 06/17] librlist: replace rlist_alloc() with rlist_alloc_ex() Problem: rlist_alloc() no longer has any callers Rename rlist_alloc_ex() to rlist_alloc() and remove the latter. Update all callers. 
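For reference, a minimal sketch of the consolidated calling convention (the counts and mode shown here are illustrative only, not taken from any particular caller):

    /* Illustrative: request 2 slots of 1 core each, spread across 2 nodes */
    struct rlist_alloc_info ai = {
        .nnodes = 2,
        .nslots = 2,
        .slot_size = 1,
        .mode = "worst-fit",    /* NULL also selects worst-fit */
    };
    flux_error_t error;
    struct rlist *alloc = rlist_alloc (rl, &ai, &error);
    if (!alloc)
        fprintf (stderr, "alloc failed: %s\n", error.text);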
--- src/common/librlist/rlist.c | 35 +++----------------------------- src/common/librlist/rlist.h | 13 ++++-------- src/common/librlist/test/rlist.c | 2 +- src/modules/sched-simple/sched.c | 4 ++-- 4 files changed, 10 insertions(+), 44 deletions(-) diff --git a/src/common/librlist/rlist.c b/src/common/librlist/rlist.c index 5813c3624f7e..dd70826ae871 100644 --- a/src/common/librlist/rlist.c +++ b/src/common/librlist/rlist.c @@ -2012,9 +2012,9 @@ rlist_alloc_constrained (struct rlist *rl, return result; } -struct rlist *rlist_alloc_ex (struct rlist *rl, - const struct rlist_alloc_info *ai, - flux_error_t *errp) +struct rlist *rlist_alloc (struct rlist *rl, + const struct rlist_alloc_info *ai, + flux_error_t *errp) { struct rlist *result = NULL; @@ -2046,35 +2046,6 @@ struct rlist *rlist_alloc_ex (struct rlist *rl, return result; } -struct rlist *rlist_alloc (struct rlist *rl, const char *mode, - int nnodes, int slots, int slotsz) -{ - struct rlist *result = NULL; - struct rlist_alloc_info ai = { - .nnodes = nnodes, - .nslots = slots, - .slot_size = slotsz, - .mode = mode - }; - - if (alloc_info_check (rl, &ai, NULL) < 0) - return NULL; - - /* - * Try allocation. If it fails with not enough resources (ENOSPC), - * then try again on an empty copy of rlist to see the request could - * *ever* be satisfied. Adjust errno to EOVERFLOW if not. - */ - result = rlist_try_alloc (rl, &ai); - if (!result && (errno == ENOSPC)) { - if (rlist_alloc_feasible (rl, mode, nnodes, slots, slotsz)) - errno = ENOSPC; - else - errno = EOVERFLOW; - } - return (result); -} - static int rlist_free_rnode (struct rlist *rl, struct rnode *n) { struct rnode *rnode = rlist_find_rank (rl, n->rank); diff --git a/src/common/librlist/rlist.h b/src/common/librlist/rlist.h index fcacadbd843e..b8202e9eeab0 100644 --- a/src/common/librlist/rlist.h +++ b/src/common/librlist/rlist.h @@ -244,19 +244,14 @@ int rlist_verify (flux_error_t *error, * "first-fit" - allocate first free slots found in rank order * * Returns a new rlist representing the allocation on success, - * NULL on failure with errno set: + * NULL on failure with errno set. * * ENOSPC - unable to fulfill allocation. * EINVAL - An argument was invalid. */ -struct rlist * rlist_alloc (struct rlist *rl, const char *mode, - int nnodes, int slot_size, int nslots); - -/* As above, but arguments are passed in an rlist_alloc_info object - */ -struct rlist * rlist_alloc_ex (struct rlist *rl, - const struct rlist_alloc_info *ai, - flux_error_t *errp); +struct rlist * rlist_alloc (struct rlist *rl, + const struct rlist_alloc_info *ai, + flux_error_t *errp); /* Mark rlist "alloc" as allocated in rlist "rl". 
*/ diff --git a/src/common/librlist/test/rlist.c b/src/common/librlist/test/rlist.c index 557061d33510..a1d3956bcf23 100644 --- a/src/common/librlist/test/rlist.c +++ b/src/common/librlist/test/rlist.c @@ -268,7 +268,7 @@ static struct rlist * rl_alloc (struct rlist *rl, .nslots = nslots, .slot_size = slot_size }; - return rlist_alloc_ex (rl, &ai, NULL); + return rlist_alloc (rl, &ai, NULL); } static struct rlist * rlist_testalloc (struct rlist *rl, diff --git a/src/modules/sched-simple/sched.c b/src/modules/sched-simple/sched.c index 711c82e1e175..d6794ad36a77 100644 --- a/src/modules/sched-simple/sched.c +++ b/src/modules/sched-simple/sched.c @@ -197,7 +197,7 @@ static struct rlist *sched_alloc (struct simple_sched *ss, .slot_size = job->jj.slot_size, .constraints = job->constraints }; - return rlist_alloc_ex (ss->rlist, &ai, errp); + return rlist_alloc (ss->rlist, &ai, errp); } static int try_alloc (flux_t *h, struct simple_sched *ss) @@ -617,7 +617,7 @@ static void feasibility_cb (flux_t *h, .slot_size = jj.slot_size, .constraints = constraints }; - if (!(alloc = rlist_alloc_ex (ss->rlist, &ai, &error))) { + if (!(alloc = rlist_alloc (ss->rlist, &ai, &error))) { if (errno != ENOSPC) { errmsg = error.text; goto err; From d8c6e497b095f139a35d235831f8c8a0d999c300 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Mar 2022 19:44:05 -0700 Subject: [PATCH 07/17] librlist: support exclusive node allocations Problem: Jobspec v1 supports an exclusive flag for node exclusive allocations, but rlist_alloc() does not have any support for an exclusive flag. Add an exclusive flag to `struct rlist_alloc_info`. Support exclusive node allocation in rlist_alloc() via the new flag, but only when there is an explicit request for a number of nodes. --- src/common/librlist/rlist.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/common/librlist/rlist.c b/src/common/librlist/rlist.c index dd70826ae871..c78b1caf84cb 100644 --- a/src/common/librlist/rlist.c +++ b/src/common/librlist/rlist.c @@ -1835,6 +1835,30 @@ static struct rlist *rlist_alloc_nnodes (struct rlist *rl, zlistx_set_comparator (rl->nodes, by_used); zlistx_sort (rl->nodes); + if (ai->exclusive) { + int nleft = ai->nnodes; + struct rnode *cpy; + n = zlistx_first (rl->nodes); + while (n && nleft) { + /* We can abort after we find the first non-idle node. + */ + if (idset_count (n->cores->avail) < idset_count (n->cores->ids)) + goto unwind; + + if (!(cpy = rnode_copy (n)) + || rlist_add_rnode_new (result, cpy) < 0) { + rnode_destroy (cpy); + goto unwind; + } + rnode_alloc_idset (n, n->cores->ids); + nleft--; + n = zlistx_next (rl->nodes); + } + if (nleft) /* Unable to allocate all nodes exclusively */ + goto unwind; + return result; + } + /* 2. get a list of the first up n nodes */ if (!(cl = rlist_get_nnodes (rl, ai->nnodes))) @@ -1949,6 +1973,11 @@ static int alloc_info_check (struct rlist *rl, errno = EINVAL; return -1; } + if (ai->exclusive && ai->nnodes <= 0) { + errprintf (errp, "exclusive allocation only supported with nnodes"); + errno = EINVAL; + return -1; + } if (total > rl->total) { errprintf (errp, "unsatisfiable request"); errno = EOVERFLOW; @@ -2042,7 +2071,6 @@ struct rlist *rlist_alloc (struct rlist *rl, } } } - return result; } From bd8ed009e6307d2fbd0c7be48210738a514addd6 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Grondona" Date: Mon, 28 Mar 2022 19:57:16 -0700 Subject: [PATCH 08/17] librlist/test: test node exclusive allocation Problem: None of the librlist unit tests exercise the exclusive node allocation option of rlist_alloc(). Add a small set of exclusive node allocation tests to librlist/test/rlist.c --- src/common/librlist/rlist.h | 1 + src/common/librlist/test/rlist.c | 156 +++++++++++++++++++++++-------- 2 files changed, 117 insertions(+), 40 deletions(-) diff --git a/src/common/librlist/rlist.h b/src/common/librlist/rlist.h index b8202e9eeab0..c5691165d75c 100644 --- a/src/common/librlist/rlist.h +++ b/src/common/librlist/rlist.h @@ -48,6 +48,7 @@ struct rlist_alloc_info { int slot_size; int nslots; const char *mode; + bool exclusive; json_t *constraints; }; diff --git a/src/common/librlist/test/rlist.c b/src/common/librlist/test/rlist.c index a1d3956bcf23..2887b9a27ef6 100644 --- a/src/common/librlist/test/rlist.c +++ b/src/common/librlist/test/rlist.c @@ -19,6 +19,7 @@ struct testalloc { int nnodes; int nslots; int slot_size; + int exclusive; }; struct rlist_test_entry { @@ -34,97 +35,97 @@ struct rlist_test_entry { }; #define RLIST_TEST_END { NULL, NULL, NULL, \ - { 0, 0, 0 }, \ + { 0, 0, 0, 0 }, \ NULL, NULL, NULL, \ 0, false } struct rlist_test_entry test_2n_4c[] = { { "too large of slot returns EOVERFLOW", NULL, NULL, - { 0, 1, 5 }, + { 0, 1, 5, 0 }, NULL, "", "rank[0-1]/core[0-3]", EOVERFLOW, false }, { "too many slots returns error", NULL, NULL, - { 0, 9, 1 }, + { 0, 9, 1, 0 }, NULL, "", "rank[0-1]/core[0-3]", EOVERFLOW, false }, { "invalid number of nodes returns error", NULL, NULL, - { -1, 1, 1 }, + { -1, 1, 1, 0 }, NULL, "", "rank[0-1]/core[0-3]", EINVAL, false }, { "Too many nodes returns error", NULL, NULL, - { 3, 4, 1 }, + { 3, 4, 1, 0 }, NULL, "", "rank[0-1]/core[0-3]", EOVERFLOW, false }, { "nodes > slots returns error", NULL, NULL, - { 2, 1, 1 }, + { 2, 1, 1, 0 }, NULL, "", "rank[0-1]/core[0-3]", EINVAL, false }, { "invalid number of slots return error", NULL, NULL, - { 0, 0, 1 }, + { 0, 0, 1, 0 }, NULL, "", "rank[0-1]/core[0-3]", EINVAL, false }, { "invalid slot size returns error", NULL, NULL, - { 0, 1, -1}, + { 0, 1, -1, 0 }, NULL, "", "rank[0-1]/core[0-3]", EINVAL, false }, { "allocate with all nodes down returns ENOSPC", NULL, "0-1", - { 0, 1, 1}, + { 0, 1, 1, 0 }, NULL, "", "", ENOSPC, false }, { "allocating a single core gets expected result", NULL, NULL, - { 0, 1, 1 }, + { 0, 1, 1, 0 }, "rank0/core0", "rank0/core0", "rank0/core[1-3] rank1/core[0-3]", 0, true }, { "allocating a single core with down rank", NULL, "0", - { 0, 1, 1 }, + { 0, 1, 1, 0 }, "rank1/core0", "rank1/core0", "rank1/core[1-3]", 0, false }, { "allocating another core (all ranks up)", NULL, NULL, - { 0, 1, 1 }, + { 0, 1, 1, 0 }, "rank0/core0", "rank[0-1]/core0", "rank[0-1]/core[1-3]", 0, false }, { "allocating another core gets expected result", NULL, NULL, - { 0, 1, 1 }, + { 0, 1, 1, 0 }, "rank0/core1", "rank0/core[0-1] rank1/core0", "rank0/core[2-3] rank1/core[1-3]", 0, false }, { "allocate 1 slot of size 3 lands on correct node", NULL, NULL, - { 0, 1, 3 }, + { 0, 1, 3, 0 }, "rank1/core[1-3]", "rank0/core[0-1] rank1/core[0-3]", "rank0/core[2-3]", 0, false }, { "allocate 4 slots of 1 core now returns ENOSPC", NULL, NULL, - { 0, 4, 1 }, + { 0, 4, 1, 0 }, NULL, "rank0/core[0-1] rank1/core[0-3]", "rank0/core[2-3]", ENOSPC, false }, { "allocate remaining 2 cores", NULL, NULL, - { 0, 1, 2 }, + { 0, 1, 2, 0 }, "rank0/core[2-3]", "rank[0-1]/core[0-3]", "", @@ -134,31 +135,31 @@ struct rlist_test_entry 
test_2n_4c[] = { struct rlist_test_entry test_6n_4c[] = { { "best-fit: alloc 1 core", "best-fit", NULL, - { 0, 1, 1 }, + { 0, 1, 1, 0 }, "rank0/core0", "rank0/core0", "rank0/core[1-3] rank[1-5]/core[0-3]", 0, false }, { "best-fit: alloc 1 slot/size 3 fits on rank0", "best-fit", NULL, - { 0, 1, 3 }, + { 0, 1, 3, 0 }, "rank0/core[1-3]", "rank0/core[0-3]", "rank[1-5]/core[0-3]", 0, false }, { "best-fit: alloc 2 slots/size 2 fits on rank1","best-fit", NULL, - { 0, 2, 2 }, + { 0, 2, 2, 0 }, "rank1/core[0-3]", "rank[0-1]/core[0-3]", "rank[2-5]/core[0-3]", 0, false }, { "best-fit: alloc 3 slot of size 1", "best-fit", NULL, - { 0, 3, 1 }, + { 0, 3, 1, 0 }, "rank2/core[0-2]", "rank[0-1]/core[0-3] rank2/core[0-2]", "rank2/core3 rank[3-5]/core[0-3]", 0, false }, { "best-fit alloc 3 slots of 1 core", "best-fit", NULL, - { 0, 3, 1 }, + { 0, 3, 1, 0 }, "rank2/core3 rank3/core[0-1]", "rank[0-2]/core[0-3] rank3/core[0-1]", "rank3/core[2-3] rank[4-5]/core[0-3]", @@ -168,27 +169,27 @@ struct rlist_test_entry test_6n_4c[] = { struct rlist_test_entry test_1024n_4c[] = { { "large: 512 nodes with 2 cores", NULL, NULL, - { 512, 512, 2 }, + { 512, 512, 2, 0 }, "rank[0-511]/core[0-1]", "rank[0-511]/core[0-1]", "rank[0-511]/core[2-3] rank[512-1023]/core[0-3]", 0, false }, { "large: 512 slots of 4 cores", NULL, NULL, - { 0, 512, 4 }, + { 0, 512, 4, 0 }, "rank[512-1023]/core[0-3]", "rank[0-511]/core[0-1] rank[512-1023]/core[0-3]", "rank[0-511]/core[2-3]", 0, true }, { "large: 1 core on 10 nodes", NULL, NULL, - { 10, 10, 1 }, + { 10, 10, 1, 0 }, "rank[512-521]/core0", "rank[0-511]/core[0-1] rank[512-521]/core0", "rank[0-511]/core[2-3] rank[512-521]/core[1-3] rank[522-1023]/core[0-3]", 0, false }, { "large: alloc 2 cores on 128 nodes with free", NULL, NULL, - { 128, 256, 1 }, + { 128, 256, 1, 0 }, "rank[522-649]/core[0-1]", "rank[0-511,522-649]/core[0-1] rank[512-521]/core0", "rank[0-511,522-649]/core[2-3] rank[512-521]/core[1-3] rank[650-1023]/core[0-3]", @@ -198,6 +199,71 @@ struct rlist_test_entry test_1024n_4c[] = { }; +struct rlist_test_entry test_exclusive[] = { + { "exclusive: exclusive without nnodes fails", + NULL, + NULL, + { 0, 1, 1, 1 }, + NULL, + "", + "rank[0-3]/core[0-3]", + EINVAL, + false + }, + { "exclusive: allocate one core first", + NULL, + NULL, + { 0, 1, 1, 0 }, + "rank0/core0", + "rank0/core0", + "rank0/core[1-3] rank[1-3]/core[0-3]", + 0, + false + }, + { "exclusive: exclusively allocate 2 nodes", + NULL, + NULL, + { 2, 2, 1, 1 }, + "rank[1-2]/core[0-3]", + "rank0/core0 rank[1-2]/core[0-3]", + "rank0/core[1-3] rank3/core[0-3]", + 0, + false + }, + { "exclusive: exclusively allocate 2 nodes fails", + NULL, + NULL, + { 2, 2, 1, 1 }, + NULL, + "rank0/core0 rank[1-2]/core[0-3]", + "rank0/core[1-3] rank3/core[0-3]", + ENOSPC, + false + }, + { "exclusive: but 1 node works", + NULL, + NULL, + { 1, 1, 1, 1 }, + "rank3/core[0-3]", + "rank0/core0 rank[1-3]/core[0-3]", + "rank0/core[1-3]", + 0, + false + }, + { "exclusive: last 3 cores can be allocated non-exclusively", + NULL, + NULL, + { 0, 3, 1, 0 }, + "rank0/core[1-3]", + "rank[0-3]/core[0-3]", + "", + 0, + false, + }, + RLIST_TEST_END, +}; + + char *R_create (const char *ranklist, const char *corelist, const char *gpus, @@ -260,15 +326,21 @@ static struct rlist * rl_alloc (struct rlist *rl, const char *mode, int nnodes, int nslots, - int slot_size) + int slot_size, + int exclusive) { struct rlist_alloc_info ai = { .mode = mode, .nnodes = nnodes, .nslots = nslots, - .slot_size = slot_size + .slot_size = slot_size, + .exclusive = exclusive }; - return 
rlist_alloc (rl, &ai, NULL); + flux_error_t error; + struct rlist *result = rlist_alloc (rl, &ai, &error); + if (!result) + diag ("rlist_alloc: %s", error.text); + return result; } static struct rlist * rlist_testalloc (struct rlist *rl, @@ -277,7 +349,8 @@ static struct rlist * rlist_testalloc (struct rlist *rl, return rl_alloc (rl, e->mode, e->alloc.nnodes, e->alloc.nslots, - e->alloc.slot_size); + e->alloc.slot_size, + e->alloc.exclusive); } static char * rlist_tostring (struct rlist *rl, bool allocated) @@ -367,7 +440,9 @@ void run_test_entries (struct rlist_test_entry tests[], int ranks, int cores) rlist_destroy (alloc); } else { - fail ("%s: %s", e->description, strerror (errno)); + fail ("%s: rlist_testalloc: %s", + e->description, + strerror (errno)); } } @@ -402,7 +477,7 @@ static void test_simple (void) "rlist_append_rank_cores 1, 0-3"); ok (rl->total == 8 && rl->avail == 8, "rlist: avail and total == 4"); - ok ((alloc = rl_alloc (rl, NULL, 0, 8, 1)) != NULL, + ok ((alloc = rl_alloc (rl, NULL, 0, 8, 1, 0)) != NULL, "rlist: alloc all cores works"); ok (alloc->total == 8 && alloc->avail == 8, "rlist: alloc: got %d/%d (expected 8/8)", @@ -501,7 +576,7 @@ static void test_issue2202 (void) "issue2202: rlist_dumps works"); free (result); - a = rl_alloc (rl, "best-fit", 1, 1, 1); + a = rl_alloc (rl, "best-fit", 1, 1, 1, 0); ok (a != NULL, "issue2202: rlist_alloc worked"); if (a) { @@ -538,7 +613,7 @@ static void test_issue2202 (void) "issue2202b: rlist_dumps works"); free (result); - a = rl_alloc (rl, "best-fit", 1, 1, 1); + a = rl_alloc (rl, "best-fit", 1, 1, 1, 0); ok (a != NULL, "issue2202b: rlist_alloc worked"); if (a) { @@ -602,7 +677,7 @@ static void test_issue2473 (void) free (result); /* problem: allocated 3 cores on one node */ - a = rl_alloc (rl, "worst-fit", 3, 3, 1); + a = rl_alloc (rl, "worst-fit", 3, 3, 1, 0); ok (a != NULL, "issue2473: rlist_alloc nnodes=3 slots=3 slotsz=1 worked"); if (!a) @@ -619,7 +694,7 @@ static void test_issue2473 (void) rlist_destroy (a); /* problem: unsatisfiable */ - a = rl_alloc (rl, "worst-fit", 3, 8, 1); + a = rl_alloc (rl, "worst-fit", 3, 8, 1, 0); ok (a != NULL, "issue2473: rlist_alloc nnodes=3 slots=8 slotsz=1 worked"); if (a) { @@ -633,7 +708,7 @@ static void test_issue2473 (void) * - ask for 2 cores spread across 2 nodes * - we should get cores on rank[0-1] not rank[1-2] */ - a = rl_alloc (rl, "worst-fit", 1, 1, 1); + a = rl_alloc (rl, "worst-fit", 1, 1, 1, 0); ok (a != NULL, "issue2473: rlist_alloc nnodes=1 slots=1 slotsz=1 worked"); if (!a) @@ -645,7 +720,7 @@ static void test_issue2473 (void) "issue2473: one core was allocated from rank0"); free (result); - a2 = rl_alloc (rl, "worst-fit", 2, 2, 1); + a2 = rl_alloc (rl, "worst-fit", 2, 2, 1, 0); ok (a2 != NULL, "issue2473: rlist_alloc nnodes=2 slots=2 slotsz=1 worked"); result = rlist_dumps (a2); @@ -724,7 +799,7 @@ static void test_updown () ok (rl->avail == 16, "rl avail == 16"); - rl2 = rl_alloc (rl, NULL, 0, 4, 1); + rl2 = rl_alloc (rl, NULL, 0, 4, 1, 0); ok (rl2 != NULL, "rlist_alloc() works when all nodes up"); @@ -747,12 +822,12 @@ static void test_updown () ok (rlist_mark_up (rl, "0-2") == 0, "rlist_mark_up all but rank 3 up"); - ok (rl_alloc (rl, NULL, 4, 4, 1) == NULL && errno == ENOSPC, + ok (rl_alloc (rl, NULL, 4, 4, 1, 0) == NULL && errno == ENOSPC, "allocation with 4 nodes fails with ENOSPC"); ok (rlist_mark_up (rl, "3") == 0, "rlist_mark_up 3"); - rl2 = rl_alloc (rl, NULL, 4, 4, 1); + rl2 = rl_alloc (rl, NULL, 4, 4, 1, 0); ok (rl2 != NULL, "rlist_alloc() for 4 nodes now 
succeeds"); @@ -1799,6 +1874,7 @@ int main (int ac, char *av[]) run_test_entries (test_2n_4c, 2, 4); run_test_entries (test_6n_4c, 6, 4); run_test_entries (test_1024n_4c, 1024, 4); + run_test_entries (test_exclusive, 4, 4); test_issue2202 (); test_issue2473 (); test_updown (); From 47561e9ebc3fa56f4e531851f89474faac681f37 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Mar 2022 20:21:43 -0700 Subject: [PATCH 09/17] sched-simple: support exclusive flag in jobspec reader Problem: Jobspec v1 supports an exclusive flag, but the libjj jobspec reader doesn't support reading that flag from jobspec. Add an exclusive flags in the "jj_counts" structure returned by libjj_get_counts() and libjj_get_counts_json(). --- src/modules/sched-simple/libjj.c | 9 +++++++-- src/modules/sched-simple/libjj.h | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/modules/sched-simple/libjj.c b/src/modules/sched-simple/libjj.c index 2379358a5354..b78aff316a23 100644 --- a/src/modules/sched-simple/libjj.c +++ b/src/modules/sched-simple/libjj.c @@ -26,10 +26,12 @@ static int jj_read_vertex (json_t *o, int level, struct jj_counts *jj) const char *type = NULL; json_t *with = NULL; json_error_t error; + int exclusive = 0; - if (json_unpack_ex (o, &error, 0, "{ s:s s:i s?o }", + if (json_unpack_ex (o, &error, 0, "{ s:s s:i s?b s?o }", "type", &type, "count", &count, + "exclusive", &exclusive, "with", &with) < 0) { snprintf (jj->error, sizeof (jj->error) - 1, "level %d: %s", level, error.text); @@ -42,8 +44,11 @@ static int jj_read_vertex (json_t *o, int level, struct jj_counts *jj) errno = EINVAL; return -1; } - if (strcmp (type, "node") == 0) + if (strcmp (type, "node") == 0) { jj->nnodes = count; + if (exclusive) + jj->exclusive = true; + } else if (strcmp (type, "slot") == 0) jj->nslots = count; else if (strcmp (type, "core") == 0) diff --git a/src/modules/sched-simple/libjj.h b/src/modules/sched-simple/libjj.h index 7d9013897c0f..5eb93343a5fd 100644 --- a/src/modules/sched-simple/libjj.h +++ b/src/modules/sched-simple/libjj.h @@ -16,6 +16,7 @@ #endif #include +#include #define JJ_ERROR_TEXT_LENGTH 256 @@ -24,6 +25,8 @@ struct jj_counts { int nslots; /* total number of slots requested */ int slot_size; /* number of cores per slot */ + bool exclusive; /* enable node exclusive allocation if available */ + double duration; /* attributes.system.duration if set */ char error[JJ_ERROR_TEXT_LENGTH]; /* On error, contains error description */ From 8162f4ad3c16f894d05a919b831930ee041c269e Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Mar 2022 20:32:08 -0700 Subject: [PATCH 10/17] testsuite: print exclusive flag in jj-reader tests Problem: The sched-simple test program jj-reader doesn't emit the exclusive flags along with the other data gleaned from the provided test jobspec, and thus cannot be used to ensure the exclusive flag is detected. Print the value detected for the exclusive flag in the jj-reader output. Update existing test. 
--- t/sched-simple/jj-reader.c | 8 ++++++-- t/t0022-jj-reader.t | 22 +++++++++++----------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/t/sched-simple/jj-reader.c b/t/sched-simple/jj-reader.c index 4e7aca91714e..cdc6157343ac 100644 --- a/t/sched-simple/jj-reader.c +++ b/t/sched-simple/jj-reader.c @@ -27,8 +27,12 @@ int main (int ac, char *av[]) log_err_exit ("Failed to read stdin"); if (libjj_get_counts (s, &jj) < 0) log_msg_exit ("%s", jj.error); - printf ("nnodes=%d nslots=%d slot_size=%d duration=%.1f\n", - jj.nnodes, jj.nslots, jj.slot_size, jj.duration); + printf ("nnodes=%d nslots=%d slot_size=%d exclusive=%s duration=%.1f\n", + jj.nnodes, + jj.nslots, + jj.slot_size, + jj.exclusive ? "true" : "false", + jj.duration); log_fini (); free (s); return 0; diff --git a/t/t0022-jj-reader.t b/t/t0022-jj-reader.t index eb7e6d447a93..604ab7bb4b7c 100755 --- a/t/t0022-jj-reader.t +++ b/t/t0022-jj-reader.t @@ -103,17 +103,17 @@ done == # cat <inputs.txt -run ==nnodes=0 nslots=1 slot_size=1 duration=0.0 -run -N1 -n1 ==nnodes=1 nslots=1 slot_size=1 duration=0.0 -run -N1 -n4 ==nnodes=1 nslots=4 slot_size=1 duration=0.0 -run -N1 -n4 -c4 ==nnodes=1 nslots=4 slot_size=4 duration=0.0 -run -n4 -c4 ==nnodes=0 nslots=4 slot_size=4 duration=0.0 -run -n4 -c4 ==nnodes=0 nslots=4 slot_size=4 duration=0.0 -run -n4 -c1 ==nnodes=0 nslots=4 slot_size=1 duration=0.0 -run -N4 -n4 -c4 ==nnodes=4 nslots=4 slot_size=4 duration=0.0 -run -t 1m -N4 -n4 ==nnodes=4 nslots=4 slot_size=1 duration=60.0 -run -t 5s -N4 -n4 ==nnodes=4 nslots=4 slot_size=1 duration=5.0 -run -t 1h -N4 -n4 ==nnodes=4 nslots=4 slot_size=1 duration=3600.0 +run ==nnodes=0 nslots=1 slot_size=1 exclusive=false duration=0.0 +run -N1 -n1 ==nnodes=1 nslots=1 slot_size=1 exclusive=false duration=0.0 +run -N1 -n4 ==nnodes=1 nslots=4 slot_size=1 exclusive=false duration=0.0 +run -N1 -n4 -c4 ==nnodes=1 nslots=4 slot_size=4 exclusive=false duration=0.0 +run -n4 -c4 ==nnodes=0 nslots=4 slot_size=4 exclusive=false duration=0.0 +run -n4 -c4 ==nnodes=0 nslots=4 slot_size=4 exclusive=false duration=0.0 +run -n4 -c1 ==nnodes=0 nslots=4 slot_size=1 exclusive=false duration=0.0 +run -N4 -n4 -c4 ==nnodes=4 nslots=4 slot_size=4 exclusive=false duration=0.0 +run -t 1m -N4 -n4 ==nnodes=4 nslots=4 slot_size=1 exclusive=false duration=60.0 +run -t 5s -N4 -n4 ==nnodes=4 nslots=4 slot_size=1 exclusive=false duration=5.0 +run -t 1h -N4 -n4 ==nnodes=4 nslots=4 slot_size=1 exclusive=false duration=3600.0 EOF while read line; do From 395058ee7159443a09d4236984ce6da04590e3f7 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 28 Mar 2022 21:09:12 -0700 Subject: [PATCH 11/17] python: add exclusive flag to Jobspec constructors Problem: Jobspec from_*_command() constructors do not have a way to specify the exclusive flag to request nodes exclusively. Add an exclusive flag to all constructors, which defaults to False. Since the exclusive flag must be specified on a node resource vertex, raise an error if the exclusive flag is requested without also specifying an explicit node count. 
--- src/bindings/python/flux/job/Jobspec.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/src/bindings/python/flux/job/Jobspec.py b/src/bindings/python/flux/job/Jobspec.py index 801ce1dbfac9..5bcec1f5d3f9 100644 --- a/src/bindings/python/flux/job/Jobspec.py +++ b/src/bindings/python/flux/job/Jobspec.py @@ -308,7 +308,7 @@ def _validate_system_attributes(system): _validate_constraint(system["constraints"]) @staticmethod - def _create_resource(res_type, count, with_child=None): + def _create_resource(res_type, count, with_child=None, exclusive=False): if with_child is not None and not isinstance(with_child, abc.Sequence): raise TypeError("child resource must None or a sequence") if with_child is not None and isinstance(with_child, str): @@ -318,6 +318,9 @@ def _create_resource(res_type, count, with_child=None): res = {"type": res_type, "count": count} + if exclusive: + res["exclusive"] = True + if with_child: res["with"] = with_child return res @@ -639,8 +642,15 @@ def _v1_validate(resources, tasks, kwargs): raise ValueError("attributes.system.duration must be a number") @classmethod + # pylint: disable=too-many-branches def from_command( - cls, command, num_tasks=1, cores_per_task=1, gpus_per_task=None, num_nodes=None + cls, + command, + num_tasks=1, + cores_per_task=1, + gpus_per_task=None, + num_nodes=None, + exclusive=False, ): """ Factory function that builds the minimum legal v1 jobspec. @@ -666,6 +676,8 @@ def from_command( raise ValueError("node count must be an integer >= 1 (if set)") if num_nodes > num_tasks: raise ValueError("node count must not be greater than task count") + elif exclusive: + raise ValueError("exclusive can only be set with a node count") children = [cls._create_resource("core", cores_per_task)] if gpus_per_task not in (None, 0): children.append(cls._create_resource("gpu", gpus_per_task)) @@ -677,7 +689,9 @@ def from_command( else: task_count_dict = {"per_slot": 1} slot = cls._create_slot("task", num_slots, children) - resource_section = cls._create_resource("node", num_nodes, [slot]) + resource_section = cls._create_resource( + "node", num_nodes, [slot], exclusive + ) else: task_count_dict = {"per_slot": 1} slot = cls._create_slot("task", num_tasks, children) @@ -699,6 +713,7 @@ def from_batch_command( gpus_per_slot=None, num_nodes=None, broker_opts=None, + exclusive=False, ): """Create a Jobspec describing a nested Flux instance controlled by a script. @@ -738,6 +753,7 @@ def from_batch_command( cores_per_task=cores_per_slot, gpus_per_task=gpus_per_slot, num_nodes=num_nodes, + exclusive=exclusive, ) jobspec.setattr_shell_option("per-resource.type", "node") # Copy script contents into jobspec @@ -755,6 +771,7 @@ def from_nest_command( gpus_per_slot=None, num_nodes=None, broker_opts=None, + exclusive=False, ): """Create a Jobspec describing a nested Flux instance controlled by `command`. @@ -783,6 +800,7 @@ def from_nest_command( cores_per_task=cores_per_slot, gpus_per_task=gpus_per_slot, num_nodes=num_nodes, + exclusive=exclusive, ) jobspec.setattr_shell_option("per-resource.type", "node") return jobspec From d6d87a23f96c91b16e5bb59bceb4cc59d76e4a70 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Tue, 29 Mar 2022 06:57:58 -0700 Subject: [PATCH 12/17] sched-simple: support exclusive flag Problem: The jobspec reader in sched-simple supports reading an exclusive node flag from jobspec, but sched-simple doesn't pass this flag to the librlist allocator, so effectively the exclusive flag is not supported. 
Pass the exclusive flag from jobspec to rlist_alloc() in sched-simple so that exclusive node requests are supported. --- src/modules/sched-simple/sched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/modules/sched-simple/sched.c b/src/modules/sched-simple/sched.c index d6794ad36a77..1fb66fb47042 100644 --- a/src/modules/sched-simple/sched.c +++ b/src/modules/sched-simple/sched.c @@ -195,6 +195,7 @@ static struct rlist *sched_alloc (struct simple_sched *ss, .nnodes = job->jj.nnodes, .nslots = job->jj.nslots, .slot_size = job->jj.slot_size, + .exclusive = job->jj.exclusive, .constraints = job->constraints }; return rlist_alloc (ss->rlist, &ai, errp); } static int try_alloc (flux_t *h, struct simple_sched *ss) From a8b47dafc81f4a82e2c027998db248b9d5baa0e4 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Tue, 29 Mar 2022 07:06:03 -0700 Subject: [PATCH 13/17] flux-mini: add --exclusive flag Problem: Jobspec v1 supports an exclusive flag to request exclusive allocation of nodes, but flux-mini commands have no interface to set this flag in generated jobspec. Add an --exclusive flag to flux-mini which simply passes the flag down to the various Jobspec from_command() constructors. --- src/cmd/flux-mini.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/cmd/flux-mini.py b/src/cmd/flux-mini.py index 86b895912e22..4080654722d9 100755 --- a/src/cmd/flux-mini.py +++ b/src/cmd/flux-mini.py @@ -725,6 +725,11 @@ def __init__(self): metavar="N", help="Number of GPUs to allocate per task", ) + self.parser.add_argument( + "--exclusive", + action="store_true", + help="With -N, --nodes, allocate nodes exclusively", + ) self.parser.add_argument( "-v", "--verbose", @@ -756,6 +761,7 @@ def init_jobspec(self, args): cores_per_task=args.cores_per_task, gpus_per_task=args.gpus_per_task, num_nodes=args.nodes, + exclusive=args.exclusive, ) def run_and_exit(self): @@ -1445,6 +1451,11 @@ def add_batch_alloc_args(parser): metavar="N", help="Distribute allocated resource slots across N individual nodes", ) + parser.add_argument( + "--exclusive", + action="store_true", + help="With --nodes, allocate nodes exclusively", + ) def list_split(opts): @@ -1511,6 +1522,7 @@ def init_jobspec(self, args): gpus_per_slot=args.gpus_per_slot, num_nodes=args.nodes, broker_opts=list_split(args.broker_opts), + exclusive=args.exclusive, ) # Default output is flux-{{jobid}}.out @@ -1554,6 +1566,7 @@ def init_jobspec(self, args): gpus_per_slot=args.gpus_per_slot, num_nodes=args.nodes, broker_opts=list_split(args.broker_opts), + exclusive=args.exclusive, ) if sys.stdin.isatty(): jobspec.setattr_shell_option("pty.interactive", 1) From 88b66679a018fa3909880e2d2e0d6b4cf41d58a4 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Tue, 29 Mar 2022 07:31:26 -0700 Subject: [PATCH 14/17] testsuite: disable "flux mini run --nodes=2 fails" Problem: A test in the testsuite ensures that flux mini run --nodes=2 fails due to a "node count must not be greater than task count" error, but soon this usage will be allowed. (ntasks will default to nnodes with exclusive flag set) Remove the test to avoid breaking the testsuite.
--- t/t2700-mini-cmd.t | 4 ---- 1 file changed, 4 deletions(-) diff --git a/t/t2700-mini-cmd.t b/t/t2700-mini-cmd.t index d5e7c91ea1de..12e0201ca8ab 100755 --- a/t/t2700-mini-cmd.t +++ b/t/t2700-mini-cmd.t @@ -49,10 +49,6 @@ test_expect_success 'flux mini run --ntasks=1 --nodes=2 fails' ' 2>run1n2N.err && grep -i "node count must not be greater than task count" run1n2N.err ' -test_expect_success 'flux mini run (default ntasks) --nodes=2 fails' ' - test_must_fail flux mini run --nodes=2 hostname 2>run2N.err && - grep -i "node count must not be greater than task count" run2N.err -' test_expect_success 'flux mini submit --urgency=6 works' ' jobid=$(flux mini submit --urgency=6 hostname) && flux job eventlog $jobid | grep submit | grep urgency=6 From 486a359b10c21dd0a627d05cb71049828434ff1a Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Tue, 29 Mar 2022 07:08:07 -0700 Subject: [PATCH 15/17] flux-mini: set ntasks/slots to nnodes when ntasks not set Problem: It is inconvenient to require the specification of both ntasks and nnodes when a user wants one task/slot per node. Until recently, it was not possible to handle this in a coherent manner, though, so the Python Jobspec class and flux-mini commands throw an error whenever the node count is greater than the number of requested tasks/slots. Now that node exclusivity can be set in the jobspec, though, it is possible to set ntasks/slots to the number of nodes (when ntasks is not explicitly set), by also defaulting the node exclusive flag to True for this case. This allows `flux mini run -N4 command` to work consistently regardless of whether or not the enclosing instance defaults to node exclusive allocation. Fixes #4228 --- src/cmd/flux-mini.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/src/cmd/flux-mini.py b/src/cmd/flux-mini.py index 4080654722d9..fd13f5474c69 100755 --- a/src/cmd/flux-mini.py +++ b/src/cmd/flux-mini.py @@ -709,7 +709,6 @@ def __init__(self): "-n", "--ntasks", metavar="N", - default="1", help="Number of tasks to start", ) self.parser.add_argument( @@ -742,6 +741,15 @@ def init_jobspec(self, args): if not args.command: raise ValueError("job command and arguments are missing") + # If ntasks not set, then set it to either node count, with + # exclusive flag enabled, or to 1 (the default). + if not args.ntasks: + if args.nodes: + args.ntasks = args.nodes + args.exclusive = True + else: + args.ntasks = 1 + # Ensure integer args are converted to int() here. # This is done because we do not use type=int in argparse in order # to allow these options to be mutable for bulksubmit: @@ -1510,8 +1518,14 @@ def read_script(args): def init_jobspec(self, args): # If no script (reading from stdin), then use "flux" as arg[0] + + # If number of slots not specified, then set it to node count + # if set, otherwise raise an error. if not args.nslots: - raise ValueError("Number of slots to allocate must be specified") + if not args.nodes: + raise ValueError("Number of slots to allocate must be specified") + args.nslots = args.nodes + args.exclusive = True jobspec = JobspecV1.from_batch_command( script=self.read_script(args), @@ -1556,8 +1570,13 @@ def __init__(self): def init_jobspec(self, args): + # If number of slots not specified, then set it to node count + # if set, otherwise raise an error. 
if not args.nslots: - raise ValueError("Number of slots to allocate must be specified") + if not args.nodes: + raise ValueError("Number of slots to allocate must be specified") + args.nslots = args.nodes + args.exclusive = True jobspec = JobspecV1.from_nest_command( command=args.COMMAND, From b834d88ccbe91b439531155cfeaa27dc25f28aa0 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Tue, 29 Mar 2022 09:58:18 -0700 Subject: [PATCH 16/17] testsuite: add tests for exclusive node allocations Problem: No tests in the testsuite exercise the flux-mini --exclusive flag or implicit exclusive node requests when --nodes is used without --ntasks or --nslots. Add a set of tests to the flux-mini sharness scripts that exercise expected behavior for flux-mini --exclusive, both explicit and implicit use. --- t/t2700-mini-cmd.t | 6 +++++- t/t2701-mini-batch.t | 29 ++++++++++++++++++++++++++++- t/t2702-mini-alloc.t | 13 +++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/t/t2700-mini-cmd.t b/t/t2700-mini-cmd.t index 12e0201ca8ab..46c44e5835e2 100755 --- a/t/t2700-mini-cmd.t +++ b/t/t2700-mini-cmd.t @@ -41,9 +41,13 @@ test_expect_success HAVE_MULTICORE 'flux mini submit --ntasks=2 --cores-per-task jobid=$(flux mini submit --ntasks=2 --cores-per-task=2 hostname) && flux job attach $jobid ' -test_expect_success 'flux mini run --ntasks=2 --nodes=2 works' ' +test_expect_success 'flux mini submit --ntasks=2 --nodes=2 works' ' flux mini run --ntasks=2 --nodes=2 hostname ' +test_expect_success 'flux mini run --nodes=2 allocates 2 nodes exclusively' ' + id=$(flux mini submit --wait-event=start --nodes=2 hostname) && + test $(flux job info ${id} R | flux R decode --count=node) = 2 +' test_expect_success 'flux mini run --ntasks=1 --nodes=2 fails' ' test_must_fail flux mini run --ntasks=1 --nodes=2 hostname \ 2>run1n2N.err && diff --git a/t/t2701-mini-batch.t b/t/t2701-mini-batch.t index fd6294a3d08a..aa24c3b7b90b 100755 --- a/t/t2701-mini-batch.t +++ b/t/t2701-mini-batch.t @@ -11,6 +11,9 @@ test_under_flux 4 job flux setattr log-stderr-level 1 +NCORES=$(flux kvs get resource.R | flux R decode --count=core) +test ${NCORES} -gt 4 && test_set_prereq MULTICORE + test_expect_success 'create generic test batch script' ' cat <<-EOF >batch-script.sh #!/bin/sh @@ -60,18 +63,42 @@ test_expect_success 'flux-mini batch fails for file without she-bang' ' EOF test_expect_code 1 flux mini batch -n1 invalid-script.sh ' +test_expect_success 'flux-mini batch fails if -N > -n' ' + test_expect_code 1 flux mini batch -N4 -n1 --wrap hostname +' +test_expect_success HAVE_JQ 'flux-mini batch -N2 requests 2 nodes exclusively' ' + flux mini batch -N2 --wrap --dry-run hostname | \ + jq -S ".resources[0]" | \ + jq -e ".type == \"node\" and .exclusive" +' +test_expect_success HAVE_JQ 'flux-mini batch --exclusive works' ' + flux mini batch -N1 -n1 --exclusive --wrap --dry-run hostname | \ + jq -S ".resources[0]" | \ + jq -e ".type == \"node\" and .exclusive" +' test_expect_success NO_ASAN 'flux-mini batch: submit a series of jobs' ' id1=$(flux mini batch --flags=waitable -n1 batch-script.sh) && id2=$(flux mini batch --flags=waitable -n4 batch-script.sh) && id3=$(flux mini batch --flags=waitable -N2 -n4 batch-script.sh) && + flux resource list && + flux jobs && + id4=$(flux mini batch --flags=waitable -N2 -n2 --exclusive batch-script.sh) && + id5=$(flux mini batch --flags=waitable -N2 batch-script.sh) && run_timeout 60 flux job wait --verbose --all ' test_expect_success NO_ASAN 'flux-mini batch: job results are 
expected' ' test_debug "grep . flux-*.out" && grep "size=1 nodes=1" flux-${id1}.out && grep "size=1 nodes=1" flux-${id2}.out && - grep "size=2 nodes=2" flux-${id3}.out + grep "size=2 nodes=2" flux-${id3}.out && + grep "size=2 nodes=2" flux-${id4}.out && + grep "size=2 nodes=2" flux-${id5}.out +' +test_expect_success MULTICORE 'flux-mini batch: exclusive flag worked' ' + test $(flux job info ${id4} R | flux R decode --count=core) -gt 2 && + test $(flux job info ${id5} R | flux R decode --count=core) -gt 2 ' + test_expect_success 'flux-mini batch: --output=kvs directs output to kvs' ' id=$(flux mini batch -n1 --flags=waitable --output=kvs batch-script.sh) && run_timeout 60 flux job attach $id > kvs-output.log 2>&1 && diff --git a/t/t2702-mini-alloc.t b/t/t2702-mini-alloc.t index 473184ee49dc..7c74e7bdf584 100755 --- a/t/t2702-mini-alloc.t +++ b/t/t2702-mini-alloc.t @@ -28,6 +28,19 @@ test_expect_success HAVE_JQ 'flux-mini alloc can set initial-program' ' flux mini alloc -n1 --dry-run myapp --foo | \ jq -e ".tasks[0].command == [ \"flux\", \"broker\", \"myapp\", \"--foo\" ]" ' +test_expect_success HAVE_JQ 'flux-mini alloc -N2 requests 2 nodes exclusively' ' + flux mini alloc -N2 --dry-run hostname | \ + jq -S ".resources[0]" | \ + jq -e ".type == \"node\" and .exclusive" +' +test_expect_success HAVE_JQ 'flux-mini alloc --exclusive works' ' + flux mini alloc -N1 -n1 --exclusive --dry-run hostname | \ + jq -S ".resources[0]" | \ + jq -e ".type == \"node\" and .exclusive" +' +test_expect_success 'flux-mini alloc fails if N > n' ' + test_expect_code 1 flux mini alloc -N2 -n1 --dry-run hostname +' test_expect_success 'flux-mini alloc works' ' $runpty -o single.out flux mini alloc -n1 \ flux resource list -s up -no {rlist} && From 8f32ad321949ec003b94f836ff5d9844d05d4d65 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Tue, 29 Mar 2022 15:09:49 -0700 Subject: [PATCH 17/17] doc: document --exclusive option in flux-mini(1) Problem: The flux-mini(1) --exclusive option is not documented in the man page. Add documentation for the --exclusive option. Also mention that the exclusive option will default to true when --nodes is used without --ntasks or --nslots. --- doc/man1/flux-mini.rst | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/man1/flux-mini.rst b/doc/man1/flux-mini.rst index f39d29a7ce20..0fc6df9284fb 100644 --- a/doc/man1/flux-mini.rst +++ b/doc/man1/flux-mini.rst @@ -27,8 +27,8 @@ as a parallel job, while **batch** and **alloc** submit a script or launch a command as the initial program of a new Flux instance. If *--ntasks* is unspecified, a value of *N=1* is assumed. Commands that -take *--nslots* have no default and require that *--nslots* be explicitly -specified. +take *--nslots* have no default and require that *--nslots* or *--nodes* +be specified. The **submit** and **batch** commands enqueue the job and print its numerical Job ID on standard output. @@ -98,6 +98,13 @@ following additional job parameters: than there are tasks. If unspecified, the number of nodes will be chosen by the scheduler. +**--exclusive** + Indicate to the scheduler that nodes should be exclusively allocated to + this job. It is an error to specify this option without also using + *-N, --nodes*. If *--nodes* is specified without *--nslots* or *--ntasks*, + then this option will be enabled by default and the number of tasks + or slots will be set to the number of requested nodes. + **-t, --time-limit=FSD** Set a time limit for the job in Flux standard duration (RFC 23). 
FSD is a floating point number with a single character units suffix