From 88d6b51530b416b87e1f4b50e0495d5ebc5fffa9 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Wed, 20 Oct 2021 15:33:41 -0700 Subject: [PATCH 1/7] job-exec: fix typo in error message Problem: The error from eventlog_append in jobinfo_log_output() has a typo: "evenlog_append failed". Fix the typo. --- src/modules/job-exec/job-exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/modules/job-exec/job-exec.c b/src/modules/job-exec/job-exec.c index d11879b26418..0d2b5b860573 100644 --- a/src/modules/job-exec/job-exec.c +++ b/src/modules/job-exec/job-exec.c @@ -440,7 +440,7 @@ void jobinfo_log_output (struct jobinfo *job, "rank", buf, "data", data, len) < 0) flux_log_error (job->h, - "evenlog_append failed: %ju: message=%s", + "eventlog_append failed: %ju: message=%s", (uintmax_t) job->id, data); } From c1058ce2841f84564fb142d38932ec123cf9c5ea Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Wed, 20 Oct 2021 15:42:52 -0700 Subject: [PATCH 2/7] shell: fix typo in evlog.c Problem: There is a typo in a comment: evenlog Fix the typo. --- src/shell/evlog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/shell/evlog.c b/src/shell/evlog.c index 80bb034fa219..b7c20aaccfa4 100644 --- a/src/shell/evlog.c +++ b/src/shell/evlog.c @@ -175,7 +175,7 @@ static int evlog_shell_exit (flux_plugin_t *p, return 0; } -/* Start the evenlog-based logger during shell.connect, just after the +/* Start the eventlog-based logger during shell.connect, just after the * shell has obtained a flux_t handle. This allows more early log * messages to make it into the eventlog, but some data (such as * the current shell_rank) is not available at this time. From c34e6117629c1789166398c59187f6e2d3908e94 Mon Sep 17 00:00:00 2001 From: "Mark A. 
Grondona" Date: Tue, 26 Oct 2021 20:08:04 -0700 Subject: [PATCH 3/7] job-manager: make flux_jobtap_call() reentrant Problem: flux_jobtap_call() may be called recursively, especially when plugins are subscribing to event notifications. However, this function is not reentrant safe, since it iterates the plugins list in place. Therefore, in this situation Bad Things will happen. Make flux_jobtap_call() reentrant-safe by iterating a copy of the plugins list. --- src/modules/job-manager/jobtap.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/modules/job-manager/jobtap.c b/src/modules/job-manager/jobtap.c index 43d0232ca622..28a0f4faf66a 100644 --- a/src/modules/job-manager/jobtap.c +++ b/src/modules/job-manager/jobtap.c @@ -545,10 +545,17 @@ static int jobtap_stack_call (struct jobtap *jobtap, flux_plugin_arg_t *args) { int retcode = 0; - flux_plugin_t *p = zlistx_first (plugins); + flux_plugin_t *p = NULL; + + /* Duplicate list to make jobtap_stack_call reentrant */ + zlistx_t *l = zlistx_dup (plugins); + if (!l) + return -1; + zlistx_set_destructor (l, NULL); if (current_job_push (jobtap, job) < 0) return -1; + p = zlistx_first (l); while (p) { int rc = flux_plugin_call (p, topic, args); if (rc < 0) { @@ -561,8 +568,9 @@ static int jobtap_stack_call (struct jobtap *jobtap, break; } retcode += rc; - p = zlistx_next (plugins); + p = zlistx_next (l); } + zlistx_destroy (&l); if (current_job_pop (jobtap) < 0) return -1; return retcode; From dc05c49ad949042701caf38060f35b80bf0f81ec Mon Sep 17 00:00:00 2001 From: "Mark A. 
Grondona" Date: Wed, 27 Oct 2021 07:32:27 -0700 Subject: [PATCH 4/7] job-manager: support job manager prolog and epilog Problem: There are use cases where a jobtap plugin needs to block or pause a job after it has been allocated resources (after the `alloc` event), but before the job starts running (before the `start` request is made to the execution system), but currently the start request is sent immediately after the alloc event. Similarly, it would be useful to block a job after the `finish` event, but before resources are returned to the scheduler (before the `free` request to the scheduler). Introduce the concept of job manager prolog and epilog actions. Together, call these actions "perilogs" and add a `perilog_active` count for every job. If the perilog_active count is nonzero, then defer sending of start/free requests until the count is zero. The perilog_active count is managed by RFC 21 prolog-{start,finish} and epilog-{start,finish} events. --- src/modules/job-manager/event.c | 45 ++++++++++++++++++++++++++++++++- src/modules/job-manager/job.h | 2 ++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/src/modules/job-manager/event.c b/src/modules/job-manager/event.c index a945a6b781de..270020e4ed54 100644 --- a/src/modules/job-manager/event.c +++ b/src/modules/job-manager/event.c @@ -349,7 +349,12 @@ int event_job_action (struct event *event, struct job *job) return -1; break; case FLUX_JOB_STATE_RUN: - if (start_send_request (ctx->start, job) < 0) + /* + * If job->perilog_active is nonzero then a prolog action + * is still in progress so do not send start request. + */ + if (!job->perilog_active + && start_send_request (ctx->start, job) < 0) return -1; break; case FLUX_JOB_STATE_CLEANUP: @@ -364,6 +369,7 @@ int event_job_action (struct event *event, struct job *job) * it is safe to release all resources to the scheduler.
*/ if (job->has_resources + && !job->perilog_active && !job->alloc_bypass && !job->start_pending && !job->free_pending) { @@ -494,6 +500,31 @@ static int event_handle_set_flags (struct job *job, return 0; } +/* Handle a prolog-* or epilog-* event + */ +static int event_handle_perilog (struct job *job, + const char *cmd, + json_t *context) +{ + if (strcmp (cmd, "start") == 0) { + if (job->perilog_active == UINT8_MAX) { + errno = EOVERFLOW; + return -1; + } + job->perilog_active++; + } + else if (strcmp (cmd, "finish") == 0) { + if (job->perilog_active > 0) + job->perilog_active--; + } + else { + errno = EPROTO; + return -1; + } + return 0; +} + + /* Return a callback topic string for the current job state * * NOTE: 'job.state.new' and 'job.state.depend' are not currently used @@ -626,6 +657,18 @@ int event_job_update (struct job *job, json_t *event) goto inval; job->state = FLUX_JOB_STATE_INACTIVE; } + else if (!strncmp (name, "prolog-", 7)) { + if (job->start_pending) + goto inval; + if (event_handle_perilog (job, name+7, context) < 0) + goto error; + } + else if (!strncmp (name, "epilog-", 7)) { + if (job->state != FLUX_JOB_STATE_CLEANUP) + goto inval; + if (event_handle_perilog (job, name+7, context) < 0) + goto error; + } else if (!strcmp (name, "flux-restart")) { /* The flux-restart event is currently only posted to jobs in * SCHED state since that is the only state transition defined diff --git a/src/modules/job-manager/job.h b/src/modules/job-manager/job.h index 5f695a42dd10..d3d4a14332ee 100644 --- a/src/modules/job-manager/job.h +++ b/src/modules/job-manager/job.h @@ -41,6 +41,8 @@ struct job { uint8_t has_resources:1; uint8_t start_pending:1;// start request sent to job-exec + uint8_t perilog_active; // if nonzero, prolog/epilog active + json_t *annotations; struct grudgeset *dependencies; From 7cb73522277e615e0eceaf147f42fd5c9e507387 Mon Sep 17 00:00:00 2001 From: "Mark A.
Grondona" Date: Wed, 27 Oct 2021 07:42:25 -0700 Subject: [PATCH 5/7] jobtap: add jobtap functions for prolog/epilog actions Problem: There is no API in the jobtap interface to manage prolog and epilog actions instantiated by plugins. Add flux_jobtap_prolog_start(), flux_jobtap_prolog_finish(), flux_jobtap_epilog_start() and flux_jobtap_epilog_finish() to give jobtap plugins a nice interface to using prolog and epilog actions. --- src/modules/job-manager/jobtap.c | 118 +++++++++++++++++++++++++++++++ src/modules/job-manager/jobtap.h | 32 +++++++++ 2 files changed, 150 insertions(+) diff --git a/src/modules/job-manager/jobtap.c b/src/modules/job-manager/jobtap.c index 28a0f4faf66a..a54d5821104a 100644 --- a/src/modules/job-manager/jobtap.c +++ b/src/modules/job-manager/jobtap.c @@ -1898,6 +1898,124 @@ int flux_jobtap_job_event_posted (flux_plugin_t *p, return 0; } +static int jobtap_emit_perilog_event (struct jobtap *jobtap, + struct job *job, + bool prolog, + bool start, + const char *description, + int status) +{ + int flags = 0; + const char *event = prolog ? start ? "prolog-start" : "prolog-finish" : + start ? "epilog-start" : "epilog-finish"; + + if (!description) { + errno = EINVAL; + return -1; + } + + /* prolog events cannot be emitted after a start request is pending. + * + * epilog events cannot be emitted outside of CLEANUP state + * and must be emitted before free request is pending. 
+ */ + if ((prolog && job->start_pending) + || (prolog && job->state == FLUX_JOB_STATE_CLEANUP) + || (!prolog && job->state != FLUX_JOB_STATE_CLEANUP) + || (!prolog && job->free_pending)) { + errno = EINVAL; + return -1; + } + if (start) + return event_job_post_pack (jobtap->ctx->event, + job, + event, + flags, + "{s:s}", + "description", description); + else + return event_job_post_pack (jobtap->ctx->event, + job, + event, + flags, + "{s:s s:i}", + "description", description, + "status", status); +} + +int flux_jobtap_prolog_start (flux_plugin_t *p, const char *description) +{ + struct job * job; + struct jobtap *jobtap; + + if (!p + || !(jobtap = flux_plugin_aux_get (p, "flux::jobtap")) + || !(job = current_job (jobtap))) { + errno = EINVAL; + return -1; + } + return jobtap_emit_perilog_event (jobtap, job, true, true, description, 0); +} + +int flux_jobtap_prolog_finish (flux_plugin_t *p, + flux_jobid_t id, + const char *description, + int status) +{ + struct job * job; + struct jobtap *jobtap; + + if (!p || !(jobtap = flux_plugin_aux_get (p, "flux::jobtap"))) { + errno = EINVAL; + return -1; + } + if (!(job = jobtap_lookup_jobid (p, id))) + return -1; + return jobtap_emit_perilog_event (jobtap, + job, + true, + false, + description, + status); +} + +int flux_jobtap_epilog_start (flux_plugin_t *p, const char *description) +{ + struct job * job; + struct jobtap *jobtap; + + if (!p + || !(jobtap = flux_plugin_aux_get (p, "flux::jobtap")) + || !(job = current_job (jobtap))) { + errno = EINVAL; + return -1; + } + return jobtap_emit_perilog_event (jobtap, job, false, true, description, 0); +} + +int flux_jobtap_epilog_finish (flux_plugin_t *p, + flux_jobid_t id, + const char *description, + int status) +{ + struct job * job; + struct jobtap *jobtap; + + if (!p || !(jobtap = flux_plugin_aux_get (p, "flux::jobtap"))) { + errno = EINVAL; + return -1; + } + if (!(job = jobtap_lookup_jobid (p, id))) + return -1; + return jobtap_emit_perilog_event (jobtap, + job, + false, + 
false, + description, + status); +} + + /* * vi:tabstop=4 shiftwidth=4 expandtab */ diff --git a/src/modules/job-manager/jobtap.h b/src/modules/job-manager/jobtap.h index 0b29b3efbd04..740f6fb12f0a 100644 --- a/src/modules/job-manager/jobtap.h +++ b/src/modules/job-manager/jobtap.h @@ -194,6 +194,38 @@ int flux_jobtap_job_subscribe (flux_plugin_t *p, flux_jobid_t id); */ void flux_jobtap_job_unsubscribe (flux_plugin_t *p, flux_jobid_t id); + +/* Post an event to the current job eventlog indicating that a prolog + * action has started. This will block the start request to the + * execution system until `flux_jobtap_prolog_finish()` is called. + */ +int flux_jobtap_prolog_start (flux_plugin_t *p, const char *description); + +/* Post an event to the eventlog for job id indicating that a prolog + * action has finished. The description should match the description + * of an outstanding prolog start event. `status` is informational + * and should be 0 to indicate success, non-zero for failure. + */ +int flux_jobtap_prolog_finish (flux_plugin_t *p, + flux_jobid_t id, + const char *description, + int status); + +/* Post an event to the current job eventlog indicating that an epilog + * action has started. This will block the free request to the + * scheduler until `flux_jobtap_epilog_finish()` is called. + */ +int flux_jobtap_epilog_start (flux_plugin_t *p, const char *description); + +/* Post an event to the eventlog for job id indicating that an epilog + * action has finished. The description should match the description + * of an outstanding epilog start event. `status` is informational + * and should be 0 to indicate success, non-zero for failure. + */ +int flux_jobtap_epilog_finish (flux_plugin_t *p, + flux_jobid_t id, + const char *description, + int status); #ifdef __cplusplus } #endif From 64b75837f7792b2867b799cd1ab50a245fd867cf Mon Sep 17 00:00:00 2001 From: "Mark A. 
Grondona" Date: Wed, 27 Oct 2021 07:56:04 -0700 Subject: [PATCH 6/7] doc: add prolog/epilog section to flux-jobtap-plugins(7) Problem: No documentation for jobtap plugin prolog/epilog actions exists. Add a prolog/epilog section to the flux-jobtap-plugins(7) manpage with a brief description of this functionality. --- doc/man7/flux-jobtap-plugins.rst | 68 ++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/doc/man7/flux-jobtap-plugins.rst b/doc/man7/flux-jobtap-plugins.rst index 680cddde0bea..b637b4e63799 100644 --- a/doc/man7/flux-jobtap-plugins.rst +++ b/doc/man7/flux-jobtap-plugins.rst @@ -251,6 +251,74 @@ or via configuration (See :ref:`configuration` below) ] +.. _perilogs: + +PROLOG AND EPILOG ACTIONS +========================= + +Plugins that need to perform asynchronous tasks for jobs after an ``alloc`` +event but before the job is running, or after a ``finish`` event but before +resources are freed to the scheduler can make use of job manager prolog or +epilog actions. + +Prolog and epilog actions are delineated by the following functions: + +:: + + int flux_jobtap_prolog_start (flux_plugin_t *p, + const char *description); + + int flux_jobtap_prolog_finish (flux_plugin_t *p, + flux_jobid_t id, + const char *description, + int status); + + int flux_jobtap_epilog_start (flux_plugin_t *p, + const char *description); + + int flux_jobtap_epilog_finish (flux_plugin_t *p, + flux_jobid_t id, + const char *description, + int status); + +To initiate a prolog action, a plugin should call the function +``flux_jobtap_prolog_start()``. This will block the job from starting +even after resources have been assigned until a corresponding call to +``flux_jobtap_prolog_finish()`` has been made. While the status of the +prolog action is passed to ``flux_jobtap_prolog_finish()`` so it can be +captured in the eventlog, the action itself is responsible for raising +a job exception or taking other action on failure.
That is, a non-zero +prolog finish status does not cause any automated behavior on the part of +the job manager. Similarly, the prolog ``description`` is used for +informational purposes only, so that multiple actions in an eventlog +may be differentiated. + +Similarly, an epilog action is initiated with ``flux_jobtap_epilog_start()``, +and prevents resources from being released to the scheduler until a +corresponding call to ``flux_jobtap_epilog_finish()``. The same caveats +described for prolog actions regarding description and completion status +of epilog actions apply. + +The ``flux_jobtap_prolog_start()`` function may be initiated anytime +before the ``start`` request is made to the execution system, though most +often from the ``job.state.run`` or ``job.event.alloc`` callbacks, +since this is the point at which a job has been allocated resources. +(Note: plugins will only receive the ``job.event.*`` callbacks for +jobs to which they have subscribed with a call to +``flux_jobtap_job_subscribe()``). A prolog action cannot be started +after a job enters the CLEANUP state. + +The ``flux_jobtap_epilog_start()`` function may only be called after a +job is in the CLEANUP state, but before the ``free`` request has been +sent to the scheduler, for example from the ``job.state.cleanup`` +or ``job.event.finish`` callbacks. + +If ``flux_jobtap_prolog_start()``, ``flux_jobtap_prolog_finish()``, +``flux_jobtap_epilog_start()`` or ``flux_jobtap_epilog_finish()`` are +called for a job in an invalid state, these functions will return -1 with +``errno`` set to ``EINVAL``. + +Multiple prolog or epilog actions can be active at the same time. .. _configuration: From bc8c87b02d43b275b361f872d75da63904b1299e Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Wed, 27 Oct 2021 12:52:17 -0700 Subject: [PATCH 7/7] testsuite: add tests for job-manager prolog/epilog events Problem: No tests exist for job-manager prolog/epilog events and the jobtap API which emits them.
Add basic API tests to the jobtap_api.c job-manager test plugin. Add a new test job-manager plugin, perilog-test, which tests basic functionality of a plugin which uses these events. Drive the plugin from the t2212-job-manager-plugins.t sharness test. --- t/Makefile.am | 10 ++ t/job-manager/plugins/jobtap_api.c | 178 +++++++++++++++++++++++++++ t/job-manager/plugins/perilog-test.c | 142 +++++++++++++++++++++ t/t2212-job-manager-plugins.t | 12 ++ 4 files changed, 342 insertions(+) create mode 100644 t/job-manager/plugins/perilog-test.c diff --git a/t/Makefile.am b/t/Makefile.am index c4e5286c7540..f6698a3c90b7 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -377,6 +377,7 @@ check_LTLIBRARIES = \ job-manager/plugins/dependency-test.la \ job-manager/plugins/subscribe.la \ job-manager/plugins/cleanup-event.la \ + job-manager/plugins/perilog-test.la \ stats/stats-basic.la \ stats/stats-immediate.la @@ -789,6 +790,15 @@ job_manager_plugins_cleanup_event_la_LDFLAGS = \ job_manager_plugins_cleanup_event_la_LIBADD = \ $(top_builddir)/src/common/libflux-core.la +job_manager_plugins_perilog_test_la_SOURCES = \ + job-manager/plugins/perilog-test.c +job_manager_plugins_perilog_test_la_CPPFLAGS = \ + $(test_cppflags) +job_manager_plugins_perilog_test_la_LDFLAGS = \ + $(fluxplugin_ldflags) -module -rpath /nowhere +job_manager_plugins_perilog_test_la_LIBADD = \ + $(top_builddir)/src/common/libflux-core.la + hwloc_hwloc_convert_SOURCES = hwloc/hwloc-convert.c hwloc_hwloc_convert_CPPFLAGS = $(HWLOC_CFLAGS) $(test_cppflags) hwloc_hwloc_convert_LDADD = $(HWLOC_LIBS) \ diff --git a/t/job-manager/plugins/jobtap_api.c b/t/job-manager/plugins/jobtap_api.c index fa1898021e80..324cc18b9a4c 100644 --- a/t/job-manager/plugins/jobtap_api.c +++ b/t/job-manager/plugins/jobtap_api.c @@ -13,6 +13,180 @@ #include #include +static int test_prolog_start_finish (flux_plugin_t *p, + const char *topic, + flux_plugin_arg_t *args) +{ + errno = 0; + if (flux_jobtap_prolog_start (NULL, NULL) == 0 + || 
errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_prolog_start (NULL NULL)", + errno, + EINVAL); + errno = 0; + if (flux_jobtap_prolog_start (p, NULL) == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_prolog_start (p, NULL)", + errno, + EINVAL); + + errno = 0; + if (strcmp (topic, "job.state.cleanup") == 0) { + if (flux_jobtap_prolog_start (p, "test") == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_prolog_start ", + "after start request should fail", + errno, + EINVAL); + + + } + errno = 0; + if (flux_jobtap_prolog_finish (NULL, FLUX_JOBTAP_CURRENT_JOB, NULL, 0) == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_prolog_finish (NULL, ...)", + errno, + EINVAL); + errno = 0; + if (flux_jobtap_prolog_finish (p, FLUX_JOBTAP_CURRENT_JOB, NULL, 0) == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_prolog_finish (p, NULL...)", + errno, + EINVAL); + errno = 0; + if (flux_jobtap_prolog_finish (NULL, FLUX_JOBTAP_CURRENT_JOB, NULL, 0) == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_prolog_finish (p, 1)", + errno, + EINVAL); + errno = 0; + if (flux_jobtap_prolog_finish (p, 1, "test", 0) == 0 + || errno != ENOENT) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s (%s): errno=%d != %d", + topic, + "flux_jobtap_prolog_finish", + "p, 1, \"test\", 0", + errno, + EINVAL); + + + return 0; +} + + +static int test_epilog_start_finish (flux_plugin_t *p, + const char 
*topic, + flux_plugin_arg_t *args) +{ + errno = 0; + if (flux_jobtap_epilog_start (NULL, NULL) == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_epilog_start (NULL NULL)", + errno, + EINVAL); + errno = 0; + if (flux_jobtap_epilog_start (p, NULL) == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_epilog_start (p, NULL)", + errno, + EINVAL); + + errno = 0; + if (strcmp (topic, "job.state.run") == 0) { + if (flux_jobtap_epilog_start (p, "test") == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_epilog_start ", + "after start request should fail", + errno, + EINVAL); + + + } + errno = 0; + if (flux_jobtap_epilog_finish (NULL, FLUX_JOBTAP_CURRENT_JOB, NULL, 0) == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_epilog_finish (NULL, ...)", + errno, + EINVAL); + errno = 0; + if (flux_jobtap_epilog_finish (p, FLUX_JOBTAP_CURRENT_JOB, NULL, 0) == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_epilog_finish (p, NULL...)", + errno, + EINVAL); + errno = 0; + if (flux_jobtap_epilog_finish (NULL, FLUX_JOBTAP_CURRENT_JOB, NULL, 0) == 0 + || errno != EINVAL) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s: errno=%d != %d", + topic, + "flux_jobtap_epilog_finish (p, 1)", + errno, + EINVAL); + errno = 0; + if (flux_jobtap_epilog_finish (p, 1, "test", 0) == 0 + || errno != ENOENT) + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "%s: %s (%s): errno=%d != %d", + topic, + "flux_jobtap_epilog_finish", + "p, 1, \"test\", 0", + errno, + 
EINVAL); + + + return 0; +} + + static int test_event_post_pack (flux_plugin_t *p, const char *topic, flux_plugin_arg_t *args) @@ -332,6 +506,8 @@ static int cleanup_cb (flux_plugin_t *p, void *arg) { test_event_post_pack (p, topic, args); + test_prolog_start_finish (p, topic, args); + test_epilog_start_finish (p, topic, args); return test_job_result (p, topic, args); } @@ -357,6 +533,8 @@ static int run_cb (flux_plugin_t *p, EINVAL, errno); test_event_post_pack (p, topic, args); + test_prolog_start_finish (p, topic, args); + test_epilog_start_finish (p, topic, args); return 0; } diff --git a/t/job-manager/plugins/perilog-test.c b/t/job-manager/plugins/perilog-test.c new file mode 100644 index 000000000000..82b6ea8e1c4f --- /dev/null +++ b/t/job-manager/plugins/perilog-test.c @@ -0,0 +1,142 @@ +/************************************************************\ + * Copyright 2021 Lawrence Livermore National Security, LLC + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. 
+ * + * SPDX-License-Identifier: LGPL-3.0 +\************************************************************/ + +/* perilog-test.c - basic tests for job manager prolog/epilog + */ + +#include +#include +#include + +#include +#include + +struct perilog_data { + flux_plugin_t *p; + flux_jobid_t id; + char *name; + bool prolog; + int status; +}; + +static struct perilog_data * +perilog_data_create (flux_plugin_t *p, + flux_jobid_t id, + bool prolog, + const char *name, + int status) +{ + struct perilog_data *d = malloc (sizeof (*d)); + if (!d) + return NULL; + if (!(d->name = strdup (name))) { + free (d); + return NULL; + } + d->p = p; + d->id = id; + d->prolog = prolog; + d->status = status; + return d; +} + +static void perilog_data_destroy (struct perilog_data *d) +{ + if (d) { + free (d->name); + free (d); + } +} + +static void timer_cb (flux_reactor_t *r, + flux_watcher_t *w, + int revents, void *arg) +{ + struct perilog_data *d = arg; + if (d->prolog) { + if (flux_jobtap_prolog_finish (d->p, d->id, d->name, d->status) < 0) + flux_jobtap_raise_exception (d->p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "flux_jobtap_prolog_finish: %s", + strerror (errno)); + } + else { + if (flux_jobtap_epilog_finish (d->p, d->id, d->name, d->status) < 0) + flux_jobtap_raise_exception (d->p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "flux_jobtap_epilog_finish: %s", + strerror (errno)); + } + flux_watcher_destroy (w); + perilog_data_destroy (d); +} + +static int cb (flux_plugin_t *p, + const char *topic, + flux_plugin_arg_t *args, + void *arg) +{ + flux_t *h = flux_jobtap_get_flux (p); + flux_watcher_t *tw; + flux_jobid_t id; + struct perilog_data *d; + int rc; + int prolog = strcmp (topic, "job.state.run") == 0; + + if (flux_plugin_arg_unpack (args, + FLUX_PLUGIN_ARG_IN, + "{s:I}", + "id", &id) < 0) { + flux_log_error (h, "flux_plugin_arg_unpack"); + return -1; + } + + if (!(d = perilog_data_create (p, id, prolog, "test", 0))) { + flux_log_error (h, "perilog_data_create"); + return -1; + } 
+ + tw = flux_timer_watcher_create (flux_get_reactor (h), + 0.1, + 0.0, + timer_cb, + d); + if (tw == NULL) { + flux_log_error (h, "timer_watcher_create"); + return -1; + } + + flux_watcher_start (tw); + if (prolog) + rc = flux_jobtap_prolog_start (p, "test"); + else + rc = flux_jobtap_epilog_start (p, "test"); + if (rc < 0) { + flux_jobtap_raise_exception (p, FLUX_JOBTAP_CURRENT_JOB, + "test", 0, + "flux_jobtap_%s_start failed: %s", + prolog ? "prolog" : "epilog", + strerror (errno)); + } + return 0; +} + +static const struct flux_plugin_handler tab[] = { + { "job.state.run", cb, NULL }, + { "job.state.cleanup", cb, NULL }, + { 0 }, +}; + +int flux_plugin_init (flux_plugin_t *p) +{ + if (flux_plugin_register (p, "perilog-test", tab) < 0) + return -1; + return 0; +} diff --git a/t/t2212-job-manager-plugins.t b/t/t2212-job-manager-plugins.t index 74e1b721b85f..9183d0080f95 100755 --- a/t/t2212-job-manager-plugins.t +++ b/t/t2212-job-manager-plugins.t @@ -380,4 +380,16 @@ test_expect_success 'job-manager: job.state.depend is called on plugin load' ' flux python dep-remove.py ${jobid} && flux job wait-event -vt 15 ${jobid} clean ' +test_expect_success 'job-manager: job prolog/epilog events work' ' + flux jobtap load --remove=all ${PLUGINPATH}/perilog-test.so && + jobid=$(flux mini submit hostname) && + flux job attach -vE $jobid 2>&1 | tee perilog-test.out && + n_prolog=$(grep -n job.prolog-finish perilog-test.out | cut -d: -f1) && + n_start=$(grep -n job.start perilog-test.out | cut -d: -f1) && + n_epilog=$(grep -n job.epilog-finish perilog-test.out | cut -d: -f1) && + n_free=$(grep -n job.free perilog-test.out | cut -d: -f1) && + test_debug "echo Checking that prolog-finish=$n_prolog event occurs before start=$n_start event" && + test_debug "echo Checking that epilog-finish=$n_epilog event occurs before free=$n_free event" && + test $n_prolog -lt $n_start -a $n_epilog -lt $n_free +' test_done