From cb444d69b12014eaba6a52bbbf53a13221afdd0c Mon Sep 17 00:00:00 2001 From: Albert Chu Date: Wed, 1 Dec 2021 13:54:01 -0800 Subject: [PATCH] rfc21: add reattach job state --- data/spec_21/states.dot | 5 +- data/spec_21/states.svg | 171 ++++++++++++++++++++-------------------- spec_21.rst | 43 +++++++++- 3 files changed, 130 insertions(+), 89 deletions(-) diff --git a/data/spec_21/states.dot b/data/spec_21/states.dot index 7fc7b40c..a85deab3 100644 --- a/data/spec_21/states.dot +++ b/data/spec_21/states.dot @@ -12,7 +12,7 @@ digraph states { DEPEND; PRIORITY; SCHED; - RUN; + {rank=same; RUN; REATTACH;} CLEANUP; } @@ -25,6 +25,9 @@ digraph states { SCHED -> PRIORITY [label="flux-restart"] + RUN -> REATTACH [xlabel="reattach"] + REATTACH -> RUN [xlabel="attached"] + edge [weight=0 color="red"]; DEPEND -> CLEANUP [label="exception"]; diff --git a/data/spec_21/states.svg b/data/spec_21/states.svg index 0e988830..7f8dd19f 100644 --- a/data/spec_21/states.svg +++ b/data/spec_21/states.svg @@ -1,127 +1,126 @@ - - - + + states - - -cluster_main - -active + +cluster_main + +active - -INACTIVE - - -INACTIVE +INACTIVE + + +INACTIVE - -DEPEND - -DEPEND +DEPEND + +DEPEND - -PRIORITY - -PRIORITY +PRIORITY + +PRIORITY - -DEPEND->PRIORITY - - -depend +DEPEND->PRIORITY + + +depend - -CLEANUP - -CLEANUP +CLEANUP + +CLEANUP - -DEPEND->CLEANUP - - -exception +DEPEND->CLEANUP + + +exception - -SCHED - -SCHED +SCHED + +SCHED - -PRIORITY->SCHED - - -priority - - - -SCHED->PRIORITY - - -flux-restart +PRIORITY->SCHED + + +priority - -RUN - -RUN +RUN + +RUN - -SCHED->RUN - - -alloc +SCHED->RUN + + +alloc + + +SCHED->PRIORITY + + +flux-restart - -SCHED->CLEANUP - - +SCHED->CLEANUP + + - -RUN->CLEANUP - - -finish +RUN->CLEANUP + + +finish + + +REATTACH + +REATTACH + + +RUN->REATTACH + + +reattach - -RUN->CLEANUP - - +RUN->CLEANUP + + + + +REATTACH->RUN + + +attached - -CLEANUP->INACTIVE - - +CLEANUP->INACTIVE + + - -NEW - -NEW +NEW + +NEW - -NEW->DEPEND - - +NEW->DEPEND + + diff --git 
a/spec_21.rst b/spec_21.rst index 44019eeb..c8e40d01 100644 --- a/spec_21.rst +++ b/spec_21.rst @@ -110,6 +110,14 @@ RUN job shells have been started, and a ``finish`` event once all the job shells have exited. The state transitions to CLEANUP. +REATTACH + The job was started, but the job manager has lost track of it + due to an error (for example, a system crash). The job manager is + attempting to reattach itself to the running job. A + ``reattach`` event is logged to indicate transition into this + state. An ``attached`` event is logged when tracking has been + reestablished and the job re-enters the RUN state. + CLEANUP The job has completed or an exception has occurred. Under normal termination, the job manager waits for notification from the exec service that job @@ -133,10 +141,10 @@ PENDING The job is in DEPEND, PRIORITY, or SCHED states. RUNNING - The job is in RUN or CLEANUP states. + The job is in RUN, REATTACH, or CLEANUP states. ACTIVE - The job is in DEPEND, PRIORITY, SCHED, RUN, or CLEANUP states. + The job is in DEPEND, PRIORITY, SCHED, RUN, REATTACH, or CLEANUP states. Exceptions @@ -391,6 +399,37 @@ status {"timestamp":1552594348.0,"name":"epilog-finish","context":{"description":"/usr/sbin/job-epilog.sh", "status":0}} +Reattach Event +^^^^^^^^^^^^^^ + +The job manager is attempting to reattach to a running job. + +The following keys are OPTIONAL in the event context object: + +id + (long long) job ID to reattach to + +Example: + +.. code:: json + + {"timestamp":1636747761.5495925,"name":"reattach","context":{"id":341835776000}} + + +Attached Event +^^^^^^^^^^^^^^ + +The job manager has re-connected to the job shells. + +The context SHALL be empty. + +Example: + +.. code:: json + + {"timestamp":1636747761.827836,"name":"attached"} + + Free Event ^^^^^^^^^^