Release v0.0.2 #48

Merged
8 changes: 3 additions & 5 deletions .gitmodules
@@ -1,11 +1,9 @@
[submodule "testsuite/submodules/dws"]
path = testsuite/submodules/dws
url = https://github.com/HewlettPackard/dws.git
branch = releases/v0
[submodule "testsuite/submodules/slurm-docker-cluster"]
path = testsuite/submodules/slurm-docker-cluster
url = git@github.com:DataWorkflowServices/slurm-docker-cluster.git
[submodule "testsuite/submodules/dws-test-driver"]
path = testsuite/submodules/dws-test-driver
url = git@github.com:DataWorkflowServices/dws-test-driver.git
branch = releases/v0
[submodule "testsuite/submodules/dws"]
path = testsuite/submodules/dws
url = git@github.com:DataWorkflowServices/dws.git
6 changes: 6 additions & 0 deletions src/burst_buffer/burst_buffer.conf
@@ -3,3 +3,9 @@
# See https://slurm.schedmd.com/burst_buffer.conf.html
Directive=DW

# If set, then tear down the burst buffer after a file staging error.
# Otherwise, preserve the burst buffer for analysis and manual teardown.
# See https://slurm.schedmd.com/burst_buffer.conf.html
# and https://slurm.schedmd.com/burst_buffer.html#states
Flags=TeardownFailure

78 changes: 67 additions & 11 deletions src/burst_buffer/burst_buffer.lua
@@ -35,7 +35,7 @@ DEFAULT_LABEL_KEY = "origin"
DEFAULT_LABEL_VAL = lua_script_name

-- The fully-qualified name of the DWS Workflow CRD.
local WORKFLOW_CRD = "workflows.dws.cray.hpe.com"
local WORKFLOW_CRD = "workflows.dataworkflowservices.github.io"

KUBECTL_CACHE_DIR = "/tmp/burst_buffer_kubectl_cache"

@@ -118,7 +118,7 @@ end
-- resource with keywords that must be replaced by the caller.
function DWS:template()
return [[
apiVersion: dws.cray.hpe.com/v1alpha2
apiVersion: dataworkflowservices.github.io/v1alpha2
kind: Workflow
metadata:
name: WF_NAME
@@ -280,9 +280,16 @@ end

-- DWS:get_driver_errors will collect driver errors from the Workflow resource
-- with respect to the given state.
function DWS:get_driver_errors(state)
local error_list = {}
local jsonpath = [[{range .status.drivers[?(@.watchState=="]].. state ..[[")]}==={"\n"}{@.status}{"\n"}{@.driverID}{"\n"}{@.error}{"\n"}{end}]]
-- If all_errors=true then collect all errors from all states in all drivers.
-- On success this returns true and a string with all of the errors.
-- On failure this returns false, an empty string for the errors, and a string
-- explaining why it couldn't collect the errors.
function DWS:get_driver_errors(state, all_errors)
local driver_index = [[?(@.watchState=="]].. state ..[[")]]
if all_errors == true then
driver_index = "*"
end
local jsonpath = [[{range .status.drivers[]] .. driver_index .. [[]}==={"\n"}{@.status}{"\n"}{@.driverID}{"\n"}{@.error}{"\n"}{end}]]
local ret, output = self:get_jsonpath(jsonpath)
if ret == false then
return ret, "", "could not get driver errors: " .. output
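
For illustration, a minimal usage sketch of the two modes of DWS:get_driver_errors. This is a hypothetical caller, not code from the PR; DWS(), make_workflow_name(), and slurm.log_error() are assumed from the surrounding burst_buffer.lua:

    local workflow = DWS(make_workflow_name(job_id))

    -- Per-state mode: the jsonpath range selects
    -- .status.drivers[?(@.watchState=="PreRun")]
    local done, errors, err = workflow:get_driver_errors("PreRun")

    -- New all_errors mode: the state argument is ignored and the range
    -- widens to .status.drivers[*], as used by slurm_bb_job_teardown() below.
    local done_all, all_errors, err_all = workflow:get_driver_errors("", true)
    if done_all == false then
        slurm.log_error("%s: %s", lua_script_name, err_all)
    end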
@@ -442,6 +449,18 @@ function DWS:kubectl(cmd)
return self:io_popen(kcmd)
end

-- DWS:scancel will run the Slurm scancel command and collect its output.
-- On success this returns true and the output of the command.
-- On failure this returns false and the output of the command.
function DWS:scancel(jobId, hurry)
local hurry_opt = ""
if hurry == true then
hurry_opt = "--hurry "
end
local scmd = "scancel " .. hurry_opt .. jobId
return self:io_popen(scmd)
end

-- DWS:io_popen will run the given command and collect its output.
-- On success this returns true and the output of the command.
-- On failure this returns false and the output of the command.
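
A short usage sketch for the new DWS:scancel helper above (the job ID is illustrative):

    -- hurry=true assembles "scancel --hurry 1234"; hurry=false or nil
    -- assembles plain "scancel 1234".
    local ok, output = workflow:scancel("1234", true)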
@@ -627,24 +646,51 @@ function slurm_bb_job_teardown(job_id, job_script, hurry)
hurry_flag = true
end
local workflow = DWS(make_workflow_name(job_id))
local done, err = workflow:set_workflow_state_and_wait("Teardown", hurry_flag)

local ret = slurm.SUCCESS
-- Does the workflow have a fatal error in it?
-- If so, we'll call scancel as well.
local done, state_errors, err = workflow:get_driver_errors("", true)
if done == false then
if string.find(err, [["]] .. workflow.name .. [[" not found]]) then
-- It's already gone, and that's what we wanted anyway.
return slurm.SUCCESS
else
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: %s", lua_script_name, workflow.name, err)
return slurm.ERROR, err
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: unable to check driver errors: %s", lua_script_name, workflow.name, err)
ret = slurm.ERROR
-- fall-through, let the Workflow delete happen.
end
end

done, err = workflow:set_workflow_state_and_wait("Teardown", hurry_flag)
if done == false then
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: %s", lua_script_name, workflow.name, err)
ret = slurm.ERROR
-- fall-through, let the Workflow delete happen.
end

done, err = workflow:delete()
if done == false then
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s, delete: %s", lua_script_name, workflow.name, err)
return slurm.ERROR, err
ret = slurm.ERROR
-- fall-through, let any necessary scancel happen.
end

if state_errors ~= "" then
-- Now do the scancel. This will terminate this Lua script and will
-- trigger slurm to call our teardown again, but that'll be a no-op
-- when it comes back here.
slurm.log_info("%s: slurm_bb_job_teardown(), workflow=%s: executing scancel --hurry %s, found driver errors: %s", lua_script_name, workflow.name, job_id, state_errors)
_, err = workflow:scancel(job_id, true)
if err == "" then
err = "(no output)"
end
end

return slurm.SUCCESS
if ret == slurm.SUCCESS then
err = ""
end
return ret, err
end

--[[
@@ -844,10 +890,20 @@ function slurm_bb_get_status(...)
local args = {...}
args.n = select("#", ...)

local found_jid = false
local jid = 0
if args.n == 2 and args[1] == "workflow" then
-- Slurm 22.05
jid = args[2]
found_jid = true
elseif args.n == 4 and args[3] == "workflow" then
-- Slurm 23.02
jid = args[4]
found_jid = true
end
if found_jid == true then
local done = false
local status = {}
local jid = args[2]
if string.find(jid, "^%d+$") == nil then
msg = "A job ID must contain only digits."
else
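
A condensed sketch of the version dispatch above: Slurm 22.05 hands the script only the user arguments ("workflow", job_id), while Slurm 23.02 prepends the UID and GID, per the comments in the diff. The helper name is hypothetical:

    -- Returns the job ID, or nil if the arguments match neither call shape.
    local function find_workflow_jid(...)
        local args = {...}
        args.n = select("#", ...)
        if args.n == 2 and args[1] == "workflow" then
            return args[2]                        -- Slurm 22.05
        elseif args.n == 4 and args[3] == "workflow" then
            return args[4]                        -- Slurm 23.02
        end
        return nil
    end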
18 changes: 9 additions & 9 deletions testsuite/integration/Makefile
@@ -39,20 +39,20 @@ setup-dws:
@{\
set -e ; \
cd ../submodules/dws ; \
docker buildx build -t local/dws-operator:test --load . ; \
IMAGE_TAG_BASE=local/dws-operator VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=120s -n dws-operator-system dws-operator-controller-manager --for condition=Available=True ; \
kubectl wait deployment --timeout=120s -n dws-operator-system dws-operator-webhook --for condition=Available=True ; \
docker buildx build -t local/dws:test --load . ; \
IMAGE_TAG_BASE=local/dws VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=120s -n dws-system dws-controller-manager --for condition=Available=True ; \
kubectl wait deployment --timeout=120s -n dws-system dws-webhook --for condition=Available=True ; \
}

.PHONY: setup-dws-test-driver
setup-dws-test-driver:
@{\
set -e ; \
cd ../submodules/dws-test-driver ; \
docker buildx build -t local/dws-test-driver-operator:test --load . ; \
IMAGE_TAG_BASE=local/dws-test-driver-operator VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=60s -n dws-test-operator-system dws-test-driver-controller-manager --for condition=Available=True ; \
docker buildx build -t local/dws-test-driver:test --load . ; \
IMAGE_TAG_BASE=local/dws-test-driver VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=60s -n dws-test-system dws-test-driver-controller-manager --for condition=Available=True ; \
}

.PHONY: setup
@@ -75,10 +75,10 @@ debug:
kubectl describe node dws-control-plane dws-worker
echo
echo "***** DWS DEPLOYMENT *****"
kubectl describe deployment -n dws-operator-system dws-operator-controller-manager
kubectl describe deployment -n dws-system dws-controller-manager
echo
echo "***** DWS LOGS *****"
kubectl logs -n dws-operator-system deployment/dws-operator-controller-manager
kubectl logs -n dws-system deployment/dws-controller-manager

.PHONY: reports
reports:
67 changes: 27 additions & 40 deletions testsuite/integration/src/features/test_dws_states.feature
@@ -22,6 +22,7 @@ Feature: Data Workflow Services State Progression
Verify that the DWS-Slurm Burst Buffer Plugin progresses through Data
Workflow Services states

@happy_one
Scenario: The DWS-BB Plugin progresses through DWS states
Given a job script:
#!/bin/bash
@@ -44,13 +45,15 @@ Feature: Data Workflow Services State Progression
And the Workflow and job progress to the PostRun state
And the Workflow and job progress to the DataOut state
And the Workflow and job progress to the Teardown state
And the job state is COMPLETED
And the job has eventually been COMPLETED

# DWS does not allow spaces in key/value pairs in directives. To skirt around this
# constraint, the dws-test-driver replaces underscores ("_") in the message value with
# spaces. This ensures that the dws-slurm-plugin can handle whitespace in error messages.
# It also makes it easier to check that the error is included in scontrol output.
Scenario Outline: The DWS-BB Plugin can handle fatal driver errors before being canceled
# This scenario assumes that "Flags=TeardownFailure" is set in burst_buffer.conf.
@fatal_one
Scenario Outline: Report fatal errors from Proposal, Setup, DataIn, PreRun
Given a job script:
#!/bin/bash

@@ -59,12 +62,13 @@ Feature: Data Workflow Services State Progression
/bin/hostname

When the job is run
Then a Workflow has been created for the job
And the Workflow and job report fatal errors at the <workflowState> state
And the job is canceled
And the Workflow and job progress to the Teardown state
And the job's final system comment contains the following:
And some Workflow has been created for the job
And the Workflow reports fatal errors at the <workflowState> state
Then the job's system comment eventually contains the following:
TEST FATAL ERROR
And the Workflow and job progress to the Teardown state
And the Workflow has eventually been deleted
And the job has eventually been CANCELLED

Examples:
# *** HEADER ***
@@ -73,14 +77,15 @@ Feature: Data Workflow Services State Progression
| Proposal |
| Setup |
| DataIn |
| PostRun |
| DataOut |
| PreRun |

# With the exception of PreRun, states will need to be canceled with the
# "--hurry" flag to transition to the Teardown state. If
# "Flags=TeardownFailure" is set in burst_buffer.conf, then all states will
# transition to Teardown without needing to be canceled.
Scenario Outline: The DWS-BB Plugin can handle fatal driver errors for PreRun
# DWS does not allow spaces in key/value pairs in directives. To skirt around this
# constraint, the dws-test-driver replaces underscores ("_") in the message value with
# spaces. This ensures that the dws-slurm-plugin can handle whitespace in error messages.
# It also makes it easier to check that the error is included in scontrol output.
# This scenario assumes that "Flags=TeardownFailure" is set in burst_buffer.conf.
@fatal_two
Scenario Outline: Report fatal errors from PostRun and DataOut
Given a job script:
#!/bin/bash

@@ -89,35 +94,17 @@ Feature: Data Workflow Services State Progression
/bin/hostname

When the job is run
Then a Workflow has been created for the job
And the Workflow reports a fatal error in the <workflowState> state
And the Workflow and job progress to the Teardown state
# Slurm moved it from PreRun/Error to Teardown without canceling
# the job. So the driver (this test) must cancel it.
And the job is canceled
And the job's final system comment contains the following:
And some Workflow has been created for the job
And the Workflow reports fatal errors at the <workflowState> state
Then the job's system comment eventually contains the following:
TEST FATAL ERROR
And the Workflow and job progress to the Teardown state
And the Workflow has eventually been deleted
And the job has eventually been COMPLETED

Examples:
# *** HEADER ***
| workflowState |
# *** VALUES ***
| PreRun |

Scenario: The DWS-BB Plugin can handle fatal driver errors during Teardown
Given a job script:
#!/bin/bash

#DW Teardown action=error message=TEST_FATAL_ERROR severity=Fatal
/bin/hostname

When the job is run
Then a Workflow has been created for the job
And the Workflow reports a fatal error in the Teardown state
And the job's intermediate system comment contains the following:
TEST FATAL ERROR
# Eventually the driver (this test) must work through the Teardown
# issues and complete that step. Slurm has already marked the job
# as completed and is now looping over slurm_bb_job_teardown() in
# burst_buffer.lua.
And the Workflow error is cleared from the Teardown state
| PostRun |
| DataOut |
@@ -36,7 +36,7 @@ Feature: Integration test environment
srun -l /bin/hostname
srun -l /bin/pwd
When the job is run
Then the job state is COMPLETED
Then the job has eventually been COMPLETED

Scenario: Kubernetes and slurm are connected
Given the kubernetes cluster kube-system UID
4 changes: 3 additions & 1 deletion testsuite/integration/src/pytest.ini
@@ -22,4 +22,6 @@ bdd_features_base_dir = features
markers =
environment
dws_states

happy_one
fatal_one
fatal_two
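
With these markers registered, a single scenario family can be selected at run time via pytest's standard marker expression, e.g. pytest -m fatal_one. pytest-bdd maps the @happy_one, @fatal_one, and @fatal_two tags on the scenarios above onto these markers, which is why they are registered here.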
18 changes: 4 additions & 14 deletions testsuite/integration/src/tests/conftest.py
@@ -72,17 +72,7 @@ def _(slurmctld, script_path):
# remove the slurm output from the jobs folder
slurmctld.remove_job_output(jobId, outputFilePath, errorFilePath)

@then(parsers.parse('the job state is {expectedJobState}'))
def _(slurmctld, jobId, expectedJobState):
"""the job state is <expectedJobState>"""
jobState, out = slurmctld.get_final_job_state(jobId)

if expectedJobState == "COMPLETED" and jobState == "FAILED":
warnings.warn(ResourceWarning((f"Job {jobId} failed unexpectedly.\n") + \
"This may happen if Slurm doesn't have enough resources to schedule the job.\n" + \
"This is not considered a test failure, in this context, since DWS isn't\n" + \
"dependent on the job's failure or success."
))
return

assert jobState == expectedJobState, "Unexpected Job State: " + jobState + "\n" + out
@then(parsers.parse('the job has eventually been {job_state:l}'))
def _(slurmctld, jobId, job_state):
"""the job has eventually been <job_state>"""
slurmctld.wait_until_job_has_been_x(jobId, job_state)