From 7b1e4200f57fb20674a7ebeb15ef9efddfe9f308 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 26 Sep 2022 08:33:22 -0700 Subject: [PATCH 01/10] testsuite: reorder tests in t2260-job-list Problem: It appears over time several nnodes/ranks/nodelist tests "migrated" away from the other nnodes/ranks/nodelist tests. Move tests closer to related tests so they are "grouped" together. --- t/t2260-job-list.t | 62 +++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/t/t2260-job-list.t b/t/t2260-job-list.t index e153d2c509e0..1349d1db3b17 100755 --- a/t/t2260-job-list.t +++ b/t/t2260-job-list.t @@ -797,6 +797,37 @@ test_expect_success HAVE_JQ 'flux job list lists nnodes for pending jobs correct flux queue start ' +test_expect_success 'reload the job-list module' ' + flux module reload job-list +' + +test_expect_success HAVE_JQ 'verify nnodes/ranks/nodelist preserved across restart' ' + jobid1=`cat nodecount1.id` && + jobid2=`cat nodecount2.id` && + jobid3=`cat nodecount3.id` && + jobid4=`cat nodecount4.id` && + obj=$(flux job list -s inactive | grep ${jobid1}) && + echo $obj | jq -e ".nnodes == 1" && + echo $obj | jq -e ".ranks == \"0\"" && + nodes=`flux job info ${jobid1} R | flux R decode --nodelist` && + echo $obj | jq -e ".nodelist == \"${nodes}\"" && + obj=$(flux job list -s inactive | grep ${jobid2}) && + echo $obj | jq -e ".nnodes == 1" && + echo $obj | jq -e ".ranks == \"0\"" && + nodes=`flux job info ${jobid2} R | flux R decode --nodelist` && + echo $obj | jq -e ".nodelist == \"${nodes}\"" && + obj=$(flux job list -s inactive | grep ${jobid3}) && + echo $obj | jq -e ".nnodes == 2" && + echo $obj | jq -e ".ranks == \"[0-1]\"" && + nodes=`flux job info ${jobid3} R | flux R decode --nodelist` && + echo $obj | jq -e ".nodelist == \"${nodes}\"" && + obj=$(flux job list -s inactive | grep ${jobid4}) && + echo $obj | jq -e ".nnodes == 3" && + echo $obj | jq -e ".ranks == \"[0-2]\"" && + nodes=`flux job info ${jobid4} R | flux R decode --nodelist` && + echo $obj | jq -e ".nodelist == \"${nodes}\"" +' + # # job success # @@ -848,37 +879,6 @@ test_expect_success HAVE_JQ 'flux job list outputs expiration time when set' ' flux job cancel $jobid ' -test_expect_success 'reload the job-list module' ' - flux module reload job-list -' - -test_expect_success HAVE_JQ 'verify nnodes/ranks/nodelist preserved across restart' ' - jobid1=`cat nodecount1.id` && - jobid2=`cat nodecount2.id` && - jobid3=`cat nodecount3.id` && - jobid4=`cat nodecount4.id` && - obj=$(flux job list -s inactive | grep ${jobid1}) && - echo $obj | jq -e ".nnodes == 1" && - echo $obj | jq -e ".ranks == \"0\"" && - nodes=`flux job info ${jobid1} R | flux R decode --nodelist` && - echo $obj | jq -e ".nodelist == \"${nodes}\"" && - obj=$(flux job list -s inactive | grep ${jobid2}) && - echo $obj | jq -e ".nnodes == 1" && - echo $obj | jq -e ".ranks == \"0\"" && - nodes=`flux job info ${jobid2} R | flux R decode --nodelist` && - echo $obj | jq -e ".nodelist == \"${nodes}\"" && - obj=$(flux job list -s inactive | grep ${jobid3}) && - echo $obj | jq -e ".nnodes == 2" && - echo $obj | jq -e ".ranks == \"[0-1]\"" && - nodes=`flux job info ${jobid3} R | flux R decode --nodelist` && - echo $obj | jq -e ".nodelist == \"${nodes}\"" && - obj=$(flux job list -s inactive | grep ${jobid4}) && - echo $obj | jq -e ".nnodes == 3" && - echo $obj | jq -e ".ranks == \"[0-2]\"" && - nodes=`flux job info ${jobid4} R | flux R decode --nodelist` && - echo $obj | jq -e ".nodelist == \"${nodes}\"" -' - # all job attributes # note that not all attributes may be returned by via the 'all' From 5700a632d27b8a88be57d576d574a3fe05dfcf96 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 26 Sep 2022 11:29:53 -0700 Subject: [PATCH 02/10] doc: remove extra word in flux-jobs(1) Problem: There is an errant period in the description for the "!F" conversion flag. Remove the extraneous period. --- doc/man1/flux-jobs.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/man1/flux-jobs.rst b/doc/man1/flux-jobs.rst index 58eee0bddfd0..568133bd5561 100644 --- a/doc/man1/flux-jobs.rst +++ b/doc/man1/flux-jobs.rst @@ -186,7 +186,7 @@ the following conversion flags are supported by *flux-jobs*: datetime of epoch if timestamp field does not exist. **!F** - convert a duration in floating point seconds to Flux Standard Duration (FSD). + convert a duration in floating point seconds to Flux Standard Duration (FSD) string. Defaults to empty string if duration field does not exist. **!H** From 04a5c01487c0634a7c61beb95c935be6562c591a Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 26 Sep 2022 11:16:31 -0700 Subject: [PATCH 03/10] testsuite: correct test description Problem: A test in t2800-jobs-cmd.t had a test description that was inconsistent to the test performed. Correct the description text to align with the test. --- t/t2800-jobs-cmd.t | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/t/t2800-jobs-cmd.t b/t/t2800-jobs-cmd.t index 212e93d3bab6..ffea2ffa1103 100755 --- a/t/t2800-jobs-cmd.t +++ b/t/t2800-jobs-cmd.t @@ -657,7 +657,7 @@ test_expect_success 'flux jobs --format={t_cleanup/{in}active} works' ' test $count -eq $(state_count inactive) ' -test_expect_success 'flux-jobs --format={runtime},{runtime!F},{runtime!F:h},{runtime!H},{runtime!H:h} works' ' +test_expect_success 'flux-jobs --format={runtime},{runtime!F},{runtime!H},{runtime!F:h},{runtime!H:h} works' ' fmt="{runtime},{runtime!F},{runtime!H},{runtime!F:h},{runtime!H:h}" && flux jobs --filter=pending -no "${fmt}" > runtimeP.out && for i in `seq 1 $(state_count sched)`; do From bd00d5410eea36b455a33f58aab883892a392b95 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 26 Sep 2022 11:43:44 -0700 Subject: [PATCH 04/10] testsuite: add reloaded job-list coverage tests Problem: Most job-list field tests in t2260-job-list cover the situation in which the job-list module is reloaded. But for some reason success, exceptions, and expiration were not covered. Solution: Add coverage of success, exceptions, and expiration when the job-list module is reloaded. --- t/t2260-job-list.t | 50 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/t/t2260-job-list.t b/t/t2260-job-list.t index 1349d1db3b17..ea19b690faa4 100755 --- a/t/t2260-job-list.t +++ b/t/t2260-job-list.t @@ -834,6 +834,7 @@ test_expect_success HAVE_JQ 'verify nnodes/ranks/nodelist preserved across resta test_expect_success HAVE_JQ 'flux job list outputs success correctly (true)' ' jobid=`flux mini submit --wait hostname | flux job id` && + echo $jobid > success1.id && wait_jobid_state $jobid inactive && obj=$(flux job list -s inactive | grep $jobid) && echo $obj | jq -e ".success == true" @@ -841,15 +842,30 @@ test_expect_success HAVE_JQ 'flux job list outputs success correctly (true)' ' test_expect_success HAVE_JQ 'flux job list outputs success correctly (false)' ' jobid=`flux mini submit --wait nosuchcommand | flux job id` && + echo $jobid > success2.id && wait_jobid_state $jobid inactive && obj=$(flux job list -s inactive | grep $jobid) && echo $obj | jq -e ".success == false" ' +test_expect_success 'reload the job-list module' ' + flux module reload job-list +' + +test_expect_success HAVE_JQ 'verify task count preserved across restart' ' + jobid1=`cat success1.id` && + jobid2=`cat success2.id` && + obj=$(flux job list -s inactive | grep ${jobid1}) && + echo $obj | jq -e ".success == true" && + obj=$(flux job list -s inactive | grep ${jobid2}) && + echo $obj | jq -e ".success == false" +' + # job exceptions test_expect_success HAVE_JQ 'flux job list outputs exceptions correctly (no exception)' ' jobid=`flux mini submit --wait hostname | flux job id` && + echo $jobid > exceptions1.id && wait_jobid_state $jobid inactive && obj=$(flux job list -s inactive | grep $jobid) && echo $obj | jq -e ".exception_occurred == false" && @@ -860,6 +876,7 @@ test_expect_success HAVE_JQ 'flux job list outputs exceptions correctly (no exce test_expect_success HAVE_JQ 'flux job list outputs exceptions correctly (exception)' ' jobid=`flux mini submit --wait nosuchcommand | flux job id` && + echo $jobid > exceptions2.id && wait_jobid_state $jobid inactive && obj=$(flux job list -s inactive | grep $jobid) && echo $obj | jq -e ".exception_occurred == true" && @@ -868,10 +885,31 @@ test_expect_success HAVE_JQ 'flux job list outputs exceptions correctly (excepti echo $obj | jq .exception_note | grep "No such file or directory" ' +test_expect_success 'reload the job-list module' ' + flux module reload job-list +' + +test_expect_success HAVE_JQ 'verify task count preserved across restart' ' + jobid1=`cat exceptions1.id` && + jobid2=`cat exceptions2.id` && + obj=$(flux job list -s inactive | grep ${jobid1}) && + echo $obj | jq -e ".success == true" && + echo $obj | jq -e ".exception_occurred == false" && + echo $obj | jq -e ".exception_severity == null" && + echo $obj | jq -e ".exception_type == null" && + echo $obj | jq -e ".exception_note == null" && + obj=$(flux job list -s inactive | grep ${jobid2}) && + echo $obj | jq -e ".exception_occurred == true" && + echo $obj | jq -e ".exception_severity == 0" && + echo $obj | jq -e ".exception_type == \"exec\"" && + echo $obj | jq .exception_note | grep "No such file or directory" +' + # expiration time test_expect_success HAVE_JQ 'flux job list outputs expiration time when set' ' - jobid=$(flux mini submit -t 30s sleep 1000 | flux job id) && + jobid=$(flux mini submit -t 500s sleep 1000 | flux job id) && + echo $jobid > expiration.id && fj_wait_event $jobid start && flux job list | grep $jobid > expiration.json && test_debug "cat expiration.json" && @@ -879,6 +917,16 @@ test_expect_success HAVE_JQ 'flux job list outputs expiration time when set' ' flux job cancel $jobid ' +test_expect_success 'reload the job-list module' ' + flux module reload job-list +' + +test_expect_success HAVE_JQ 'verify task count preserved across restart' ' + jobid=`cat expiration.id` && + flux job list -s inactive | grep ${jobid} > expiration2.json && + jq -e ".expiration > now" < expiration2.json +' + # all job attributes # note that not all attributes may be returned by via the 'all' From e51dad71a43060df413f817f352e19a0d20a0697 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 26 Sep 2022 08:21:13 -0700 Subject: [PATCH 05/10] job-list: support getting job duration Problem: It would be useful to get a job's duration, but that is not yet available in job-list. Solution: Support getting job duration via the 'duration' attribute. --- src/modules/job-list/job-list.c | 2 +- src/modules/job-list/job_data.c | 13 +++++++++++++ src/modules/job-list/job_data.h | 1 + src/modules/job-list/job_util.c | 6 ++++++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/modules/job-list/job-list.c b/src/modules/job-list/job-list.c index 574047f04ff3..77ed5d6c7ec4 100644 --- a/src/modules/job-list/job-list.c +++ b/src/modules/job-list/job-list.c @@ -24,7 +24,7 @@ static const char *attrs[] = { "userid", "urgency", "priority", "t_submit", "t_depend", "t_run", "t_cleanup", "t_inactive", - "state", "name", "queue", "ntasks", "nnodes", + "state", "name", "queue", "ntasks", "duration", "nnodes", "ranks", "nodelist", "success", "exception_occurred", "exception_type", "exception_severity", "exception_note", "result", "expiration", diff --git a/src/modules/job-list/job_data.c b/src/modules/job-list/job_data.c index 062622e9d02a..0b538f8e52f9 100644 --- a/src/modules/job-list/job_data.c +++ b/src/modules/job-list/job_data.c @@ -56,6 +56,7 @@ struct job *job_create (struct list_ctx *ctx, flux_jobid_t id) job->priority = FLUX_JOB_PRIORITY_MIN; job->state = FLUX_JOB_STATE_NEW; job->ntasks = -1; + job->duration = -1.0; job->nnodes = -1; job->expiration = -1.0; job->wait_status = -1; @@ -153,6 +154,15 @@ static int parse_jobspec_nnodes (struct job *job, struct jj_counts *jj) return 0; } +static int parse_jobspec_duration (struct job *job, struct jj_counts *jj) +{ + /* N.B. Jobspec V1 requires duration to be set, so duration will + * always be >= 0 from libjj. + */ + job->duration = jj->duration; + return 0; +} + static int parse_per_resource (struct job *job, const char **type, int *count) @@ -288,6 +298,9 @@ int job_parse_jobspec (struct job *job, const char *s) if (parse_jobspec_ntasks (job, &jj) < 0) goto nonfatal_error; + if (parse_jobspec_duration (job, &jj) < 0) + goto nonfatal_error; + /* nonfatal error - jobspec illegal, but we'll continue on. job * listing will return whatever data is available */ nonfatal_error: diff --git a/src/modules/job-list/job_data.h b/src/modules/job-list/job_data.h index 3080bb96a549..d69774082a2b 100644 --- a/src/modules/job-list/job_data.h +++ b/src/modules/job-list/job_data.h @@ -47,6 +47,7 @@ struct job { const char *name; const char *queue; int ntasks; + double duration; int nnodes; char *ranks; char *nodelist; diff --git a/src/modules/job-list/job_util.c b/src/modules/job-list/job_util.c index fdceb9ce8d83..b3bb81545cdc 100644 --- a/src/modules/job-list/job_util.c +++ b/src/modules/job-list/job_util.c @@ -98,6 +98,12 @@ static int store_attr (struct job *job, return 0; val = json_integer (job->ntasks); } + else if (!strcmp (attr, "duration")) { + /* job->duration potentially < 0 if jobspec invalid */ + if (job->duration < 0) + return 0; + val = json_real (job->duration); + } else if (!strcmp (attr, "nnodes")) { /* job->nnodes < 0 if not set yet or R invalid, may be set in * DEPEND or RUN state */ From fcb281d5c80eea41c09098b4e54e76d1b1ea01b0 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 26 Sep 2022 10:05:51 -0700 Subject: [PATCH 06/10] testsuite: cover job-list duration attribute Problem: There is no coverage for the job duration attribute in job-list. Add coverage for duration. --- t/python/t0010-job.py | 1 + t/t2260-job-list.t | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/t/python/t0010-job.py b/t/python/t0010-job.py index 7511371a27a0..af29d94a7501 100755 --- a/t/python/t0010-job.py +++ b/t/python/t0010-job.py @@ -475,6 +475,7 @@ def test_25_job_list_attrs(self): "name", "queue", "ntasks", + "duration", "nnodes", "ranks", "nodelist", diff --git a/t/t2260-job-list.t b/t/t2260-job-list.t index ea19b690faa4..2a2ebac43ade 100755 --- a/t/t2260-job-list.t +++ b/t/t2260-job-list.t @@ -927,6 +927,28 @@ test_expect_success HAVE_JQ 'verify task count preserved across restart' ' jq -e ".expiration > now" < expiration2.json ' +# duration time + +test_expect_success HAVE_JQ 'flux job list outputs duration time when set' ' + jobid=$(flux mini submit -t 60m sleep 1000 | flux job id) && + echo $jobid > duration.id && + fj_wait_event $jobid start && + flux job list | grep $jobid > duration.json && + test_debug "cat duration.json" && + jq -e ".duration == 3600.0" < duration.json && + flux job cancel $jobid +' + +test_expect_success 'reload the job-list module' ' + flux module reload job-list +' + +test_expect_success HAVE_JQ 'verify task count preserved across restart' ' + jobid=`cat duration.id` && + flux job list -s inactive | grep ${jobid} > duration2.json && + jq -e ".duration == 3600.0" < duration2.json +' + # all job attributes # note that not all attributes may be returned by via the 'all' @@ -1052,6 +1074,7 @@ t_inactive \ state \ name \ ntasks \ +duration \ nnodes \ ranks \ nodelist \ From 799a4510ee7ad335f38f3c694f8f7d70bc484e33 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 26 Sep 2022 10:33:22 -0700 Subject: [PATCH 07/10] flux-jobs: support job duration retrieval Problem: job-list supports getting the job duration but flux-jobs does not. Solution: Support job duration via the 'duration' key. --- src/bindings/python/flux/job/info.py | 2 ++ src/cmd/flux-jobs.py | 1 + 2 files changed, 3 insertions(+) diff --git a/src/bindings/python/flux/job/info.py b/src/bindings/python/flux/job/info.py index d533c41f7feb..91e2b84bd735 100644 --- a/src/bindings/python/flux/job/info.py +++ b/src/bindings/python/flux/job/info.py @@ -200,6 +200,7 @@ class JobInfo: "t_run": 0.0, "t_cleanup": 0.0, "t_inactive": 0.0, + "duration": 0.0, "expiration": 0.0, "name": "", "queue": "", @@ -506,6 +507,7 @@ def get_field(self, field_name, args, kwargs): "name": "NAME", "queue": "QUEUE", "ntasks": "NTASKS", + "duration": "DURATION", "nnodes": "NNODES", "expiration": "EXPIRATION", "t_remaining": "T_REMAINING", diff --git a/src/cmd/flux-jobs.py b/src/cmd/flux-jobs.py index d8c088f9e7fc..207fc53f7bce 100755 --- a/src/cmd/flux-jobs.py +++ b/src/cmd/flux-jobs.py @@ -155,6 +155,7 @@ def fetch_jobs_flux(args, fields, flux_handle=None): "name": ("name",), "queue": ("queue",), "ntasks": ("ntasks",), + "duration": ("duration",), "nnodes": ("nnodes",), "ranks": ("ranks",), "nodelist": ("nodelist",), From 57bb6cf2767c2af0f02e20f8f3aeade30d4690a2 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 26 Sep 2022 11:01:21 -0700 Subject: [PATCH 08/10] testsuite: cover flux-jobs duration key Problem: No coverage exists for retrieving job duration in flux-jobs. Add coverage for duration in t2800-jobs-cmd.t. --- t/t2800-jobs-cmd.t | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/t/t2800-jobs-cmd.t b/t/t2800-jobs-cmd.t index ffea2ffa1103..a4b353c5e483 100755 --- a/t/t2800-jobs-cmd.t +++ b/t/t2800-jobs-cmd.t @@ -537,6 +537,19 @@ test_expect_success 'flux-jobs --format={ntasks},{nnodes},{nnodes:h} works' ' test_cmp nodecountI.exp nodecountI.out ' +test_expect_success 'flux-jobs --format={duration},{duration:h},{duration!F},{duration!H},{duration!F:h},{duration!H:h} works' ' + fmt="{duration},{duration:h},{duration!F},{duration!H},{duration!F:h},{duration!H:h}" && + flux jobs --filter=pending,running -no "${fmt}" > durationPR.out && + for i in `seq 1 $(state_count sched run)`; do + echo "300.0,300.0,5m,0:05:00,5m,0:05:00" >> durationPR.exp + done && + test_cmp durationPR.exp durationPR.out && + flux jobs --filter=completed -no "${fmt}" > durationCD.out && + for i in `seq 1 $(state_count completed)`; + do echo "0.0,-,0s,0:00:00,-,-" >> durationCD.exp + done && + test_cmp durationCD.exp durationCD.out +' test_expect_success 'flux-jobs --format={runtime:0.3f} works' ' flux jobs --filter=pending -no "{runtime:0.3f}" > runtime-dotP.out && @@ -917,6 +930,7 @@ test_expect_success 'flux-jobs: header included with all custom formats' ' name==NAME queue==QUEUE ntasks==NTASKS + duration==DURATION nnodes==NNODES ranks==RANKS nodelist==NODELIST From d83bd22f8ce64eb7eb8a333b27165a0c638a1614 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 26 Sep 2022 11:26:41 -0700 Subject: [PATCH 09/10] doc: add job duration information to flux-jobs(1) Problem: flux-jobs(1) does not include duration as a output field. Add it to the output field list. --- doc/man1/flux-jobs.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/man1/flux-jobs.rst b/doc/man1/flux-jobs.rst index 568133bd5561..fc9877bbd0a2 100644 --- a/doc/man1/flux-jobs.rst +++ b/doc/man1/flux-jobs.rst @@ -264,6 +264,9 @@ The field names that can be specified are: **ntasks** job task count +**duration** + job duration in seconds + **nnodes** job node count (if job ran / is running), empty string otherwise From 3ff24d3659dfbcb92344a07eca4699e4661794e6 Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Mon, 26 Sep 2022 11:29:09 -0700 Subject: [PATCH 10/10] doc: clarify "duration" in flux-jobs(1) Problem: Now that there is a job field "duration", generalized use of the word "duration" could be confusing. Solution: To avoid potential confusion between a general "duration" and the new job field "duration", everytime the generalized "duration" is said, instead say "time duration". In addition, make the descriptions of the "!F" and "!H" more consistent to each other by saying "floating point seconds" and giving example usage. --- doc/man1/flux-jobs.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/man1/flux-jobs.rst b/doc/man1/flux-jobs.rst index fc9877bbd0a2..653d8cc9dee8 100644 --- a/doc/man1/flux-jobs.rst +++ b/doc/man1/flux-jobs.rst @@ -186,12 +186,14 @@ the following conversion flags are supported by *flux-jobs*: datetime of epoch if timestamp field does not exist. **!F** - convert a duration in floating point seconds to Flux Standard Duration (FSD) - string. Defaults to empty string if duration field does not exist. + convert a time duration in floating point seconds to Flux Standard + Duration (FSD) string (e.g. *{runtime!F}*). Defaults to empty string if + field does not exist. **!H** - convert a duration to hours:minutes:seconds form (e.g. *{runtime!H}*). - Defaults to empty string if duration field does not exist. + convert a time duration in floating point seconds to + hours:minutes:seconds form (e.g. *{runtime!H}*). Defaults to empty + string if time duration field does not exist. **!P** convert a floating point number into a percentage fitting in 5 characters