From 35997026a864c01f81b4f5f55ca30a48318f7fd1 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Wed, 19 Jun 2024 08:02:00 -0700 Subject: [PATCH 1/4] job-list: fix potential double free of a hostlist struct Problem: In the job-list hostlist match function, if an append to the list of hostlist structs fails, then a double free of the hostlist could occur since `goto error` already destroys the hostlist. Remove the extra hostlist_destroy() in the error path. --- src/modules/job-list/match.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/modules/job-list/match.c b/src/modules/job-list/match.c index f60764bc6923..5eec29b8f445 100644 --- a/src/modules/job-list/match.c +++ b/src/modules/job-list/match.c @@ -505,7 +505,6 @@ static struct list_constraint *create_hostlist_constraint ( } if (!zlistx_add_end (c->values, hl)) { errprintf (errp, "failed to append hostlist structure"); - hostlist_destroy (hl); goto error; } return c; From 2b207b09fbdb79660bf12373bc9e5db75882e018 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 17 Jun 2024 15:22:52 -0700 Subject: [PATCH 2/4] job-list: support ranks constraint Problem: The job-list module supports filtering jobs only by hostname, but there are times when filtering by ranks would be more convenient. Add a "ranks" constraint operator with values in RFC 22 Idset string form to filter jobs based on assigned ranks. --- src/modules/job-list/job_data.c | 1 + src/modules/job-list/job_data.h | 2 + src/modules/job-list/match.c | 91 ++++++++++++++++++++++++++++++ src/modules/job-list/state_match.c | 3 +- 4 files changed, 96 insertions(+), 1 deletion(-) diff --git a/src/modules/job-list/job_data.c b/src/modules/job-list/job_data.c index a48f7bdca52a..733d2f8e5963 100644 --- a/src/modules/job-list/job_data.c +++ b/src/modules/job-list/job_data.c @@ -36,6 +36,7 @@ void job_destroy (void *data) free (job->ranks); free (job->nodelist); hostlist_destroy (job->nodelist_hl); + idset_destroy (job->ranks_idset); json_decref (job->annotations); grudgeset_destroy (job->dependencies); json_decref (job->jobspec); diff --git a/src/modules/job-list/job_data.h b/src/modules/job-list/job_data.h index 512f796ff1e4..4729e0d0431d 100644 --- a/src/modules/job-list/job_data.h +++ b/src/modules/job-list/job_data.h @@ -15,6 +15,7 @@ #include #include "src/common/libhostlist/hostlist.h" +#include "src/common/libidset/idset.h" #include "src/common/libutil/grudgeset.h" #include "src/common/libczmqcontainers/czmq_containers.h" @@ -56,6 +57,7 @@ struct job { char *ranks; char *nodelist; struct hostlist *nodelist_hl; /* cache of nodelist in hl form */ + struct idset *ranks_idset; /* cache of ranks in idset form */ double expiration; int wait_status; bool success; diff --git a/src/modules/job-list/match.c b/src/modules/job-list/match.c index 5eec29b8f445..c7697c3c92d6 100644 --- a/src/modules/job-list/match.c +++ b/src/modules/job-list/match.c @@ -514,6 +514,95 @@ static struct list_constraint *create_hostlist_constraint ( return NULL; } +static int match_ranks (struct list_constraint *c, + const struct job *job, + unsigned int *comparisons, + flux_error_t *errp) +{ + struct idset *idset = zlistx_first (c->values); + size_t n, m; + + /* ranks may not exist if job never ran */ + if (!job->ranks) + return 0; + if (!job->ranks_idset) { + /* hack to remove const */ + struct job *jobtmp = (struct job *)job; + if (!(jobtmp->ranks_idset = idset_decode (job->ranks))) + return 0; + } + /* Account for all ranks being compared before calling + * inc_check_comparison. This is the smallest of the job or + * comparison idset + */ + m = idset_count (job->ranks_idset); + n = idset_count (idset); + *comparisons += (m < n ? m : n) - 1; + if (inc_check_comparison (c->mctx, comparisons, errp) < 0) + return -1; + return idset_has_intersection (job->ranks_idset, idset); +} + + +/* zlistx_set_destructor */ +static void wrap_idset_destroy (void **item) +{ + if (item) { + struct idset *idset = *item; + idset_destroy (idset); + (*item) = NULL; + } +} + +static struct list_constraint *create_ranks_constraint ( + struct match_ctx *mctx, + json_t *values, + flux_error_t *errp) +{ + struct list_constraint *c; + struct idset *idset = NULL; + json_t *entry; + size_t index; + + if (!(c = list_constraint_new (mctx, + match_ranks, + wrap_idset_destroy, + errp))) + return NULL; + + if (!(idset = idset_create (0, IDSET_FLAG_AUTOGROW))) { + errprintf (errp, "failed to create idset structure"); + goto error; + } + json_array_foreach (values, index, entry) { + const char *ids; + idset_error_t error; + if (!json_is_string (entry) + || !(ids = json_string_value (entry))) { + errprintf (errp, "ranks value must be a string"); + goto error; + } + if (idset_decode_add (idset, ids, -1, &error) < 0) { + errprintf (errp, "ranks value '%s': %s", ids, error.text); + goto error; + } + } + if (idset_count (idset) > mctx->max_hostlist) { + errprintf (errp, "too many ranks specified"); + goto error; + } + if (!zlistx_add_end (c->values, idset)) { + errprintf (errp, "failed to append idset structure"); + goto error; + } + return c; + error: + idset_destroy (idset); + list_constraint_destroy (c); + return NULL; +} + + static int match_timestamp (struct list_constraint *c, const struct job *job, unsigned int *comparisons, @@ -753,6 +842,8 @@ struct list_constraint *list_constraint_create (struct match_ctx *mctx, return create_results_constraint (mctx, values, errp); else if (streq (op, "hostlist")) return create_hostlist_constraint (mctx, values, errp); + else if (streq (op, "ranks")) + return create_ranks_constraint (mctx, values, errp); else if (streq (op, "t_submit") || streq (op, "t_depend") || streq (op, "t_run") diff --git a/src/modules/job-list/state_match.c b/src/modules/job-list/state_match.c index 75dc8a851bf2..1f055e40e089 100644 --- a/src/modules/job-list/state_match.c +++ b/src/modules/job-list/state_match.c @@ -367,7 +367,8 @@ struct state_constraint *state_constraint_create (json_t *constraint, flux_error if (streq (op, "userid") || streq (op, "name") || streq (op, "queue") - || streq (op, "hostlist")) + || streq (op, "hostlist") + || streq (op, "ranks")) return state_constraint_new (match_maybe, NULL, errp); else if (streq (op, "results")) return state_constraint_new (match_result, NULL, errp); From 3995adc8cd98badb7b549de4da0aa197181d1ce5 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Mon, 17 Jun 2024 16:54:23 -0700 Subject: [PATCH 3/4] job-list/test: add unit tests for ranks constraint Problem: There are no unit tests for the 'ranks' constraint operator. Add them. --- src/modules/job-list/test/match.c | 172 ++++++++++++++++++++++++++++++ 1 file changed, 172 insertions(+) diff --git a/src/modules/job-list/test/match.c b/src/modules/job-list/test/match.c index 6c2f0307fecd..b69cdfdbba50 100644 --- a/src/modules/job-list/test/match.c +++ b/src/modules/job-list/test/match.c @@ -103,6 +103,7 @@ static struct job *setup_job (uint32_t userid, const char *name, const char *queue, const char *nodelist, + const char *ranks, flux_job_state_t state, flux_job_result_t result, double t_submit, @@ -125,6 +126,11 @@ static struct job *setup_job (uint32_t userid, if (!(job->nodelist = strdup (nodelist))) BAIL_OUT ("failed to strdup nodelist"); } + if (ranks) { + /* N.B. internally is not const, so strdup it */ + if (!(job->ranks = strdup (ranks))) + BAIL_OUT ("failed to strdup ranks"); + } job->state = state; if (state) { /* Assume all jobs run, we don't skip any states, so add bitmask @@ -172,6 +178,7 @@ static void test_basic_special_cases (void) NULL, NULL, NULL, + NULL, 0, 0, 0.0, @@ -265,6 +272,7 @@ static void test_basic_userid (void) NULL, NULL, NULL, + NULL, 0, 0, 0.0, @@ -353,6 +361,7 @@ static void test_basic_name (void) tests->name, NULL, NULL, + NULL, 0, 0, 0.0, @@ -441,6 +450,7 @@ static void test_basic_queue (void) NULL, tests->queue, NULL, + NULL, 0, 0, 0.0, @@ -533,6 +543,7 @@ static void test_basic_states (void) NULL, NULL, NULL, + NULL, tests->state, 0, 0.0, @@ -628,6 +639,7 @@ static void test_basic_results (void) NULL, NULL, NULL, + NULL, tests->state, tests->result, 0.0, @@ -755,6 +767,7 @@ static void test_basic_hostlist (void) NULL, NULL, tests->nodelist, + NULL, 0, 0, 0.0, @@ -777,6 +790,115 @@ static void test_basic_hostlist (void) } } + +struct basic_ranks_test { + const char *ranks; + bool expected; + bool end; /* ranks can be NULL */ +}; + +struct basic_ranks_constraint_test { + const char *constraint; + struct basic_ranks_test tests[9]; +} basic_ranks_tests[] = { + { + "{ \"ranks\": [ ] }", + { + /* N.B. ranks can potentially be NULL */ + { NULL, false, false, }, + { NULL, false, true, }, + }, + }, + { + "{ \"ranks\": [ \"1\" ] }", + { + /* N.B. ranks can potentially be NULL */ + { NULL, false, false, }, + { "1", true, false, }, + { "2", false, false, }, + { "1-2", true, false, }, + { "2-3", false, false, }, + { NULL, false, true, }, + }, + }, + { + "{ \"ranks\": [ \"1-2\" ] }", + { + /* N.B. ranks can potentially be NULL */ + { NULL, false, false, }, + { "1", true, false, }, + { "2", true, false, }, + { "1-2", true, false, }, + { "2-3", true, false, }, + { "3-4", false, false, }, + { NULL, false, true, }, + }, + }, + { + "{ \"ranks\": [ \"1\", \"2\", \"3\" ] }", + { + /* N.B. ranks can potentially be NULL */ + { NULL, false, false, }, + { "1", true, false, }, + { "2", true, false, }, + { "1-2", true, false, }, + { "2-3", true, false, }, + { "3-4", true, false, }, + { "4", false, false, }, + { "4-5", false, false, }, + { NULL, false, true, }, + }, + }, + { + NULL, + { + { NULL, false, true, }, + }, + }, +}; + +static void test_basic_ranks (void) +{ + struct basic_ranks_constraint_test *ctests = basic_ranks_tests; + int index = 0; + + while (ctests->constraint) { + struct basic_ranks_test *tests = ctests->tests; + struct list_constraint *c; + int index2 = 0; + + c = create_list_constraint (ctests->constraint); + while (!tests->end) { + struct job *job; + flux_error_t error; + int rv; + job = setup_job (0, + NULL, + NULL, + NULL, + tests->ranks, + 0, + 0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0); + rv = job_match (job, c, &error); + ok (rv == tests->expected, + "basic rank job match test #%d/#%d job->ranks = %s (expected %d got %d)", + index, index2, tests->ranks, tests->expected, rv); + job_destroy (job); + index2++; + tests++; + } + + index++; + list_constraint_destroy (c); + ctests++; + } +} + struct basic_timestamp_test { flux_job_state_t state; int submit_version; @@ -1056,6 +1178,7 @@ static void test_basic_timestamp (void) NULL, NULL, NULL, + NULL, tests->state, 0, tests->t_submit, @@ -1264,6 +1387,7 @@ static void test_basic_conditionals (void) tests->name, NULL, NULL, + NULL, 0, 0, 0.0, @@ -1292,6 +1416,7 @@ struct realworld_test { const char *name; const char *queue; const char *nodelist; + const char *ranks; flux_job_state_t state; flux_job_result_t result; double t_run; @@ -1317,6 +1442,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, @@ -1328,6 +1454,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, @@ -1339,6 +1466,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 0.0, @@ -1350,6 +1478,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 0.0, @@ -1361,6 +1490,7 @@ struct realworld_constraint_test { NULL, NULL, NULL, + NULL, 0, 0, 0.0, @@ -1383,6 +1513,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_FAILED, 0.0, @@ -1394,6 +1525,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_CANCELED, 0.0, @@ -1405,6 +1537,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_TIMEOUT, 0.0, @@ -1416,6 +1549,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_FAILED, 0.0, @@ -1427,6 +1561,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, @@ -1438,6 +1573,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, @@ -1449,6 +1585,7 @@ struct realworld_constraint_test { NULL, NULL, NULL, + NULL, 0, 0, 0.0, @@ -1472,6 +1609,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, @@ -1483,6 +1621,7 @@ struct realworld_constraint_test { "foo", "debug", NULL, + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, @@ -1494,6 +1633,7 @@ struct realworld_constraint_test { "foo", "debug", NULL, + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, @@ -1505,6 +1645,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, @@ -1516,6 +1657,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 0.0, @@ -1527,6 +1669,7 @@ struct realworld_constraint_test { "foo", "gpu", NULL, + NULL, FLUX_JOB_STATE_DEPEND, 0, 0.0, @@ -1538,6 +1681,7 @@ struct realworld_constraint_test { NULL, NULL, NULL, + NULL, 0, 0, 0.0, @@ -1562,6 +1706,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, @@ -1573,6 +1718,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_CLEANUP, 0, 0.0, @@ -1584,6 +1730,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, @@ -1595,6 +1742,7 @@ struct realworld_constraint_test { "foo", "debug", NULL, + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, @@ -1606,6 +1754,7 @@ struct realworld_constraint_test { "bar", "batch", NULL, + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, @@ -1617,6 +1766,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 0.0, @@ -1628,6 +1778,7 @@ struct realworld_constraint_test { NULL, NULL, NULL, + NULL, 0, 0, 0.0, @@ -1650,6 +1801,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_SCHED, 0, 0.0, @@ -1661,6 +1813,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_RUN, 0, 0.0, @@ -1672,6 +1825,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 0.0, @@ -1683,6 +1837,7 @@ struct realworld_constraint_test { "foo", "batch", NULL, + NULL, FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 0.0, @@ -1694,6 +1849,7 @@ struct realworld_constraint_test { NULL, NULL, NULL, + NULL, 0, 0, 0.0, @@ -1716,6 +1872,7 @@ struct realworld_constraint_test { "foo", "batch", "node[1-3]", + "0-2", FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 0.0, @@ -1727,6 +1884,7 @@ struct realworld_constraint_test { "foo", "batch", "node[1-3]", + "0-2", FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 0.0, @@ -1738,6 +1896,7 @@ struct realworld_constraint_test { "foo", "batch", "node[2-4]", + "1-3", FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 0.0, @@ -1749,6 +1908,7 @@ struct realworld_constraint_test { "foo", "batch", "node[3-4]", + "2-3", FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 0.0, @@ -1760,6 +1920,7 @@ struct realworld_constraint_test { NULL, NULL, NULL, + NULL, 0, 0, 0.0, @@ -1784,6 +1945,7 @@ struct realworld_constraint_test { "foo", "batch", "node[1-3]", + "0-2", FLUX_JOB_STATE_RUN, 0, 1000.0, @@ -1795,6 +1957,7 @@ struct realworld_constraint_test { "foo", "batch", "node[1-3]", + "0-2", FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 1000.0, @@ -1806,6 +1969,7 @@ struct realworld_constraint_test { "foo", "batch", "node[2-3]", + "1-2", FLUX_JOB_STATE_RUN, 0, 1000.0, @@ -1817,6 +1981,7 @@ struct realworld_constraint_test { "foo", "batch", "node[2-3]", + "1-2", FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 1000.0, @@ -1828,6 +1993,7 @@ struct realworld_constraint_test { "foo", "batch", "node[2-3]", + "1-2", FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 1000.0, @@ -1839,6 +2005,7 @@ struct realworld_constraint_test { "foo", "batch", "node[3-4]", + "2-3", FLUX_JOB_STATE_RUN, 0, 1000.0, @@ -1850,6 +2017,7 @@ struct realworld_constraint_test { "foo", "batch", "node[3-4]", + "2-3", FLUX_JOB_STATE_INACTIVE, FLUX_JOB_RESULT_COMPLETED, 1000.0, @@ -1861,6 +2029,7 @@ struct realworld_constraint_test { NULL, NULL, NULL, + NULL, 0, 0, 0.0, @@ -1877,6 +2046,7 @@ struct realworld_constraint_test { NULL, NULL, NULL, + NULL, 0, 0, 0.0, @@ -1906,6 +2076,7 @@ static void test_realworld (void) tests->name, tests->queue, tests->nodelist, + tests->ranks, tests->state, tests->result, 0.0, @@ -1941,6 +2112,7 @@ int main (int argc, char *argv[]) test_basic_results (); test_corner_case_hostlist (); test_basic_hostlist (); + test_basic_ranks (); test_basic_timestamp (); test_basic_conditionals (); test_realworld (); From 342a2a7b3fa0bb2b79713f389dbe48d555ef5576 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Tue, 18 Jun 2024 13:58:48 -0700 Subject: [PATCH 4/4] testsuite: add ranks constraint tests to t2260-job-list.t Problem: There's no end-to-end job-list 'ranks' constraint tests in t2260-job-list.t. Add a few 'ranks' constraint tests to t2260-job-list.t. --- t/t2260-job-list.t | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/t/t2260-job-list.t b/t/t2260-job-list.t index 8fea127a0b13..87e4d240522c 100755 --- a/t/t2260-job-list.t +++ b/t/t2260-job-list.t @@ -723,6 +723,33 @@ test_expect_success 'flux job list of all jobs that ran on node[1-2] after certa test $(cat constraint_hostlist9.out | wc -l) -eq ${numlines} ' +test_expect_success 'flux job list by rank (0-3)' ' + constraint="{ and: [ {ranks:[\"0-3\"]} ] }" && + $jq -j -c -n "{max_entries:1000, attrs:[], constraint:${constraint}}" \ + | $RPC job-list.list | $jq .jobs | $jq -c '.[]' | $jq .id > constraint_hostlist1.out && + numlines=$(cat completed.ids running.ids failed.ids timeout.ids | wc -l) && + test $(cat constraint_hostlist1.out | wc -l) -eq ${numlines} +' +test_expect_success 'flux job list by rank (3)' ' + constraint="{ and: [ {ranks:[\"3\"]} ] }" && + $jq -j -c -n \ + "{max_entries:1000, attrs:[\"ranks\"], constraint:${constraint}}" \ + | $RPC job-list.list | $jq -r .jobs[].ranks | uniq > jobs.ranks && + test $(cat jobs.ranks) -eq 3 +' +test_expect_success 'flux job list by rank (does not exist)' ' + constraint="{ and: [ {ranks:[\"10\"]} ] }" && + $jq -j -c -n \ + "{max_entries:1000, attrs:[\"ranks\"], constraint:${constraint}}" \ + | $RPC job-list.list | $jq -e ".jobs|length == 0" +' +test_expect_success 'flux job list by rank (invalid)' ' + constraint="{ and: [ {ranks:[3]} ] }" && + $jq -j -c -n \ + "{max_entries:1000, attrs:[\"ranks\"], constraint:${constraint}}" \ + | test_must_fail $RPC job-list.list 2>ranks.err && + grep "value must be a string" ranks.err +' # # legacy RPC tests #