diff --git a/README.md b/README.md
index 362c584..fa2ed86 100644
--- a/README.md
+++ b/README.md
@@ -75,11 +75,11 @@ To shutdown and cleanup the entire test environment, use the `clean` target in t
 make -C testsuite/integration clean
 ```
 
-### Simple playground exercise #1
+### Simple playground exercise #1: DriverWait
 
 This simple exercise will cause the job to proceed to DataOut state and wait
 for us to mark that state as complete, and then it will proceed to Teardown
 state.
 
-Edit `testsuite/integration/slurm/jobs/test-bb.sh` to change the `#DW` line to be `#DW DataOut action=wait`.
+Edit `testsuite/integration/slurm/jobs/test-bb.sh` to change the `#DW` line to be `#DW DataOut action=wait`. These `#DW` lines are interpreted by dws-test-driver, which is a stand-in for any backend service.
 
 ```bash
 #SBATCH --output=/jobs/slurm-%j.out
@@ -115,7 +115,7 @@ kubectl patch workflow -n slurm bb12 --type=json -p '[{"op":"replace", "path":"/
 You can then watch the Workflow resource proceed to Teardown state, after
 which the burst_buffer.lua teardown function will delete the resource.
 
-### Simple playground exercise #2
+### Simple playground exercise #2: Transient Error
 
 This exercise will cause the job to wait on a Major error in DataOut and
 after we clear the error it will proceed to Teardown state where it will wait for
@@ -160,3 +160,32 @@ The workflow is now in Teardown with a status of DriverWait. Release it by mark
 kubectl patch workflow -n slurm bb3 --type=json -p '[{"op":"replace", "path":"/status/drivers/1/status", "value": "Completed"}, {"op":"replace", "path":"/status/drivers/1/completed", "value": true}]'
 ```
 
+### Simple playground exercise #3: Fatal Error
+
+This exercise will cause the job to encounter a Fatal error in DataOut. The
+burst-buffer plugin will collect the error, add it to the slurm job,
+transition the workflow to Teardown state, and then delete the workflow.
+
+Edit `testsuite/integration/slurm/jobs/test-bb.sh` to change the `#DW` line to be the following:
+
+```bash
+#SBATCH --output=/jobs/slurm-%j.out
+#DW DataOut action=error message=rename_permission_denied severity=Fatal
+/bin/hostname
+srun -l /bin/hostname
+srun -l /bin/pwd
+```
+
+Submit this job using the `sbatch` command as shown above.
+
+Watch the workflow with `kubectl get workflow -wA` as shown above. We did not
+ask dws-test-driver to wait in Teardown state this time (no `#DW Teardown
+action=wait`), so everything will run at full speed and we won't have a chance
+to look at the workflow before it is deleted.
+
+We can view the error in the slurm job:
+
+```console
+bash-4.4$ scontrol show job 3
+```
+
diff --git a/src/burst_buffer/burst_buffer.lua b/src/burst_buffer/burst_buffer.lua
index 40a7ac6..0b58b37 100644
--- a/src/burst_buffer/burst_buffer.lua
+++ b/src/burst_buffer/burst_buffer.lua
@@ -639,6 +639,7 @@ function slurm_bb_job_teardown(job_id, job_script, hurry, uid, gid)
 	if done == false then
 		if string.find(err, [["]] .. workflow.name .. [[" not found]]) then
 			-- It's already gone, and that's what we wanted anyway.
+			slurm.log_info("%s: slurm_bb_job_teardown(), workflow=%s: already removed", lua_script_name, workflow.name)
 			return slurm.SUCCESS
 		else
 			slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: unable to check driver errors: %s", lua_script_name, workflow.name, err)
@@ -659,22 +660,22 @@ function slurm_bb_job_teardown(job_id, job_script, hurry, uid, gid)
 		slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s, delete: %s", lua_script_name, workflow.name, err)
 		ret = slurm.ERROR
 		-- fall-through, let any necessary scancel happen.
+	else
+		slurm.log_info("%s: slurm_bb_job_teardown(), workflow=%s: deleted", lua_script_name, workflow.name)
 	end
 
 	if state_errors ~= "" then
 		-- Now do the scancel. This will terminate this Lua script and will
 		-- trigger slurm to call our teardown again, but that'll be a no-op
 		-- when it comes back here.
-		slurm.log_info("%s: slurm_bb_job_teardown(), workflow=%s: executing scancel --hurry %s, found driver errors: %s", lua_script_name, workflow.name, job_id, state_errors)
-		_, err = workflow:scancel(job_id, true)
-		if err == "" then
-			err = "(no output)"
+		slurm.log_info("%s: slurm_bb_job_teardown(), workflow=%s: executing 'scancel --hurry %s', found driver errors: %s", lua_script_name, workflow.name, job_id, state_errors)
+		local err2
+		done, err2 = workflow:scancel(job_id, true)
+		if done == false then
+			slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: scancel: %s", lua_script_name, workflow.name, err2)
 		end
 	end
 
-	if ret == slurm.SUCCESS then
-		err = "Success"
-	end
 	return ret, err
 end
 
@@ -798,7 +799,7 @@ end
 --state.
 --]]
 function slurm_bb_post_run(job_id, job_script, uid, gid, job_info)
-	slurm.log_info("%s: slurm_post_run(). job id:%s, job script:%s, uid:%s, gid:%s",
+	slurm.log_info("%s: slurm_bb_post_run(). job id:%s, job script:%s, uid:%s, gid:%s",
 		lua_script_name, job_id, job_script, uid, gid)
 
 	local workflow = DWS(make_workflow_name(job_id))
@@ -808,7 +809,7 @@ function slurm_bb_post_run(job_id, job_script, uid, gid, job_info)
 		return slurm.ERROR, err
 	end
 
-	return slurm.SUCCESS, "Success"
+	return slurm.SUCCESS, ""
 end
 
 --[[
@@ -829,7 +830,7 @@ function slurm_bb_data_out(job_id, job_script, uid, gid, job_info)
 		return slurm.ERROR, err
 	end
 
-	return slurm.SUCCESS, "Success"
+	return slurm.SUCCESS, ""
 end
 
 --[[
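
The teardown hunks above converge on one error-handling convention: a hook returns `slurm.SUCCESS` paired with an empty message, or `slurm.ERROR` paired with the real error text; a `scancel` failure is logged without overwriting the error from the earlier delete; and a failed delete falls through so the `scancel` still runs. Below is a minimal stand-alone Lua sketch of that convention, not the plugin code itself: the `slurm` table and the `workflow` object are mock stand-ins for the real burst_buffer environment and DWS wrapper, and `teardown_sketch` is a hypothetical condensation of `slurm_bb_job_teardown`.

```lua
-- Mock stand-in for the `slurm` table the burst_buffer environment provides.
local slurm = {
	SUCCESS = 0,
	ERROR = -1,
	log_info = function(fmt, ...) print(string.format("info: " .. fmt, ...)) end,
	log_error = function(fmt, ...) print(string.format("error: " .. fmt, ...)) end,
}

-- Fake workflow whose delete succeeds and whose scancel fails; the error
-- text is made up for the sketch.
local workflow = { name = "bb3" }
function workflow:delete() return true, "" end
function workflow:scancel(job_id, hurry) return false, "exit status 1" end

-- Hypothetical condensation of the patched slurm_bb_job_teardown().
local function teardown_sketch(job_id, state_errors)
	local ret = slurm.SUCCESS
	local done, err = workflow:delete()
	if done == false then
		slurm.log_error("workflow=%s: delete: %s", workflow.name, err)
		ret = slurm.ERROR
		-- Fall through: a failed delete must not skip the scancel.
	else
		slurm.log_info("workflow=%s: deleted", workflow.name)
	end

	if state_errors ~= "" then
		-- A scancel failure lands in its own variable and is only
		-- logged; err still carries the result of the delete.
		local err2
		done, err2 = workflow:scancel(job_id, true)
		if done == false then
			slurm.log_error("workflow=%s: scancel: %s", workflow.name, err2)
		end
	end

	-- No `err = "Success"` rewrite anymore: SUCCESS pairs with "".
	return ret, err
end

local rc, msg = teardown_sketch(3, "DataOut/Fatal")
print(rc, msg == "")  -- prints: 0	true
```

With the stub's failing `scancel`, the sketch still reports success with an empty message: the scancel failure is logged for the operator instead of masking the outcome of the delete, which is the same behavior the `-- fall-through` path in the patched teardown preserves.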