Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added integration tests for error scenarios and added error observability to plugin #30

Merged
merged 21 commits into from
Feb 10, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
c5a29c8
Using dws-test-driver for DWS state progression integration tests
nathandotleeathpe Dec 14, 2022
6cb1711
Fixed integration test errors
nathandotleeathpe Dec 15, 2022
16c9136
code review changes
nathandotleeathpe Dec 17, 2022
37a35e5
Added integration tests for error scenarios
nathandotleeathpe Jan 6, 2023
3c7533d
Updated dws-test-driver to main branch HEAD
nathandotleeathpe Jan 6, 2023
8f31567
Merge branch 'main' into more-tests
nathandotleeathpe Jan 6, 2023
80d9cd7
Code review
nathandotleeathpe Jan 6, 2023
8418bf5
Merge branch 'main' into more-tests
nathandotleeathpe Jan 9, 2023
65ce686
Remove option to run in real slurm or k8s
nathandotleeathpe Jan 30, 2023
6d57479
Refactored unit test before adding error scenarios
nathandotleeathpe Jan 30, 2023
dab85b2
Added driver error observability
nathandotleeathpe Feb 1, 2023
e14e08d
Using dws-test-driver for DWS state progression integration tests (#28)
nathandotleeathpe Feb 2, 2023
69fba72
Merge remote-tracking branch 'upstream/main'
nathandotleeathpe Feb 3, 2023
c2ea3ce
Pre-code review fixes
nathandotleeathpe Feb 6, 2023
3fdce67
Merge remote-tracking branch 'upstream/main'
nathandotleeathpe Feb 6, 2023
bdb437d
PR change
nathandotleeathpe Feb 6, 2023
5e8029a
Update submodules
nathandotleeathpe Feb 6, 2023
b2522a3
Updating dws submodules
nathandotleeathpe Feb 9, 2023
3a93e3f
Removed unused test function
nathandotleeathpe Feb 9, 2023
ed5d138
Added sleep before cancelling job
nathandotleeathpe Feb 10, 2023
161c4f0
Added documentation about test error transformation
nathandotleeathpe Feb 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@ COPY testsuite/submodules/dws /dws
FROM testbase AS testrun

WORKDIR /
COPY testsuite/unit/bin /bin
COPY testsuite/unit/luacov.lua /.luacov
COPY testsuite/unit/output.lua /output.lua
COPY src /
COPY testsuite/unit/src/burst_buffer/dws-test.lua /

ENV MOCK_SLURM yes
RUN busted -o output.lua -Xoutput junit.xml --verbose --coverage *test.lua || \
touch testsFailed.indicator

Expand Down
11 changes: 2 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,9 @@ test: $(find src -type f) $(find testsuite/unit/src -type f) Dockerfile
docker buildx build $(NOCACHE) $(PROGRESS) --target test -t test .

OUTPUT_HANDLER = --output TAP

TAG ?= # specify a string like TAG="-t mytag"

test-mocks: VALIDATOR ?= testsuite/unit/bin/validate
test-mocks: CRDFILE=testsuite/submodules/dws/config/crd/bases/dws.cray.hpe.com_workflows.yaml
test-mocks:
MOCK_SLURM=yes CRDFILE=$(CRDFILE) VALIDATOR=$(VALIDATOR) busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua

test-realk8s:
MOCK_SLURM=yes REAL_K8S=yes busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua
test-no-docker:
busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua

integration-test: $(find testsuite/integration/src -type f) testsuite/integration/Dockerfile
cd testsuite/integration && make setup test clean
69 changes: 68 additions & 1 deletion src/burst_buffer/burst_buffer.lua
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,69 @@ function DWS:get_current_state()
return ret, status
end

-- _parse_driver_statuses parses the output of the driver-status jsonpath
-- query (see DWS:get_driver_errors) into a list of status tables.
--
-- The expected input is a series of records, each introduced by a line
-- containing only "===", followed by one status line, one driverID line,
-- and zero or more lines of error text:
--   ===
--   <status>
--   <driverID>
--   <error text ...>
-- Blank lines are skipped by the line iterator, so an empty error field
-- contributes no lines.
--
-- Callers pass only `text`; the remaining arguments are internal recursion
-- state. Returns an array of tables, each with "status", "driverID" and
-- "error" keys; each captured error line keeps a trailing "\n".
--
-- NOTE: declared `local` — this helper was previously an accidental global.
local function _parse_driver_statuses(text, iterator, status_list, status_info)
	if iterator == nil then
		-- First call: build the line iterator over non-empty lines.
		iterator = text:gmatch("[^\n]+")
	end

	if status_list == nil then
		status_list = {}
	end

	local line = iterator()
	if line == nil then
		-- End of input: flush the record in progress, if any.
		if status_info ~= nil then
			table.insert(status_list, status_info)
		end
		return status_list
	end

	if line == "===" then
		-- Start of a new record; flush the previous one first.
		if status_info ~= nil then
			table.insert(status_list, status_info)
		end

		status_info = {}
		status_info["status"] = iterator()
		status_info["driverID"] = iterator()
		status_info["error"] = ""
		return _parse_driver_statuses(text, iterator, status_list, status_info)
	end

	if status_info ~= nil then
		-- Any other line is part of the current record's error text.
		status_info["error"] = status_info["error"] .. line .. "\n"
		return _parse_driver_statuses(text, iterator, status_list, status_info)
	end

	-- Lines before the first "===" are ignored.
	return status_list
end

-- DWS:get_driver_errors will collect driver errors from the Workflow resource
-- with respect to the given state.
-- On success this returns true and a newline-separated string of
-- "<driverID>: <error>" entries for every driver reporting status "Error"
-- (the empty string when there are none).
-- On failure this returns false, an empty string, and an error message.
function DWS:get_driver_errors(state)
	-- Emit one "===" record per driver entry watching the given state; the
	-- record layout is consumed by _parse_driver_statuses.
	local jsonpath = [[{range .status.drivers[?(@.watchState=="]].. state ..[[")]}==={"\n"}{@.status}{"\n"}{@.driverID}{"\n"}{@.error}{"\n"}{end}]]
	local ret, output = self:get_jsonpath(jsonpath)
	if ret == false then
		return ret, "", "could not get driver errors: " .. output
	end

	-- `local` added here and below: these were accidental globals.
	local driver_statuses = _parse_driver_statuses(output)

	local errors = ""
	for _, driver_status in ipairs(driver_statuses) do
		if driver_status["status"] == "Error" then
			-- Separate multiple driver errors with a newline.
			if errors ~= "" then
				errors = errors .. "\n"
			end

			errors = errors .. driver_status["driverID"] .. ": " .. driver_status["error"]
		end
	end

	return true, errors
end

-- DWS:get_hurry will get the hurry flag of the Workflow resource.
-- On success this returns true and a boolean for the value of the hurry flag.
-- On failure this returns false and the output of the kubectl command.
Expand Down Expand Up @@ -273,7 +336,11 @@ function DWS:wait_for_status_complete(max_passes)
if status["desiredState"] == status["currentState"] and status["status"] == "Completed" then
return true, status
elseif status["status"] == "Error" then
return false, empty, string.format("Error in Workflow %s", self.name)
local ret, driver_errors, err = self:get_driver_errors(status["desiredState"])
if ret ~= true then
return ret, empty, err
end
return false, empty, "DWS driver error(s):\n" .. driver_errors
end
os.execute("sleep 1")
if max_passes > 0 then
Expand Down
83 changes: 37 additions & 46 deletions testsuite/integration/src/features/test_dws_states.feature
Original file line number Diff line number Diff line change
Expand Up @@ -36,82 +36,73 @@ Feature: Data Workflow Services State Progression

When the job is run
And a Workflow is created for the job
#Then the job's temporary Workflow is not found
Then the Workflow and job progress to the Proposal state
And the Workflow and job progress to the Setup state
And the Workflow and job progress to the DataIn state
And the Workflow and job progress to the PreRun state
And the Workflow and job progress to the PostRun state
And the Workflow and job progress to the DataOut state
And the Workflow and job progress to the Teardown state
And the job is COMPLETED
And the job state is COMPLETED

@todo
Scenario: The DWS-BB Plugin can handle DWS driver errors
Scenario Outline: The DWS-BB Plugin can handle DWS driver errors before being canceled
Given a job script:
#!/bin/bash

#DW <state> action=error message=TEST_ERROR
#DW <workflowState> action=error message=TEST_ERROR
#DW Teardown action=wait
/bin/hostname

When the job is run
And a Workflow is created for the job
And the Workflow and job progress to the <workflowState> state
And the job is canceled
Then the Workflow and job progress to the Teardown state
And the job shows an error with message "TEST ERROR"
And the job's system comment contains the following:
TEST ERROR
nathandotleeathpe marked this conversation as resolved.
Show resolved Hide resolved

Examples:
# *** HEADER ***
| state |
| workflowState |
# *** VALUES ***
| Proposal |
| Setup |
| DataIn |
| PreRun |
| PostRun |
| DataOut |
| Proposal |
| Setup |
| DataIn |
| PostRun |
| DataOut |

@todo
Scenario: The DWS-BB Plugin can handle a DWS driver error during Teardown
# With the exception of PreRun, states will need to be cancelled with the
# "--hurry" flag to transition to the Teardown state. If
# "Flags=TeardownFailure" is set in burst_buffer.conf, then all states will
# transition to Teardown without needing to be canceled
Scenario Outline: The DWS-BB Plugin can handle DWS driver errors
Given a job script:
#!/bin/bash

#DW Teardown action=error message=TEST_ERROR

#DW <workflowState> action=error message=TEST_ERROR
#DW Teardown action=wait
/bin/hostname

When the job is run
And a Workflow is created for the job
Then the job shows an error with message "TEST ERROR"
Then the Workflow and job progress to the Teardown state
And the job's system comment contains the following:
TEST ERROR
nathandotleeathpe marked this conversation as resolved.
Show resolved Hide resolved

Examples:
# *** HEADER ***
| workflowState |
# *** VALUES ***
| PreRun |

@todo
Scenario: The DWS-BB Plugin can cancel jobs
Scenario: The DWS-BB Plugin can handle DWS driver errors during Teardown
Given a job script:
#!/bin/bash

#DW <state> action=wait
#DW Teardown action=wait

#DW Teardown action=error message=TEST_ERROR
/bin/hostname

When the job is run
And a Workflow is created for the job
And the Workflow and job progress to the <state> state
And the job is canceled with the hurry flag set to <hurry_flag>
Then the Workflow and job progress to the Teardown state
And the Workflow's hurry flag is set to <hurry_flag>

Examples:
# *** HEADER ***
| state | hurry_flag |
# *** VALUES ***
| Proposal | false |
| Setup | false |
| DataIn | false |
| PreRun | false |
| PostRun | false |
| DataOut | false |
| Proposal | true |
| Setup | true |
| DataIn | true |
| PreRun | true |
| PostRun | true |
| DataOut | true |
Then the job's system comment contains the following:
TEST ERROR
nathandotleeathpe marked this conversation as resolved.
Show resolved Hide resolved
And the workflow still exists
4 changes: 2 additions & 2 deletions testsuite/integration/src/features/test_environment.feature
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ Feature: Integration test environment
srun -l /bin/hostname
srun -l /bin/pwd
When the job is run
Then the job is COMPLETED
Then the job state is COMPLETED

Scenario: Kubernetes and slurm are connected
Given the kubernetes cluster kube-system UID
When the kube-system UID is queried from slurmctld
Then the UIDs match and the cluster is the same
Then the UIDs match and the cluster is the same
14 changes: 3 additions & 11 deletions testsuite/integration/src/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def pytest_bdd_apply_tag(tag, function):

@given(parsers.parse('a job script:\n{script}'), target_fixture="script_path")
def _(script):
"""a simple job script: <script>"""
"""a job script: <script>"""
path = "/jobs/" + secrets.token_hex(5) + "-job.sh"
with open(path, "w") as file:
file.write(script)
Expand All @@ -64,14 +64,6 @@ def _(script):
@when('the job is run', target_fixture="jobId")
def _(slurmctld, script_path):
"""the job is run."""
_,out = slurmctld.exec_run("sinfo -lNe")
print(out)

_,out = slurmctld.exec_run("scontrol show node")
print(out)

_,out = slurmctld.exec_run("kubectl describe deployment -n dws-operator-system dws-operator-controller-manager")
print(out)

jobId, outputFilePath, errorFilePath = slurmctld.submit_job(script_path)
print("submitted job: " + str(jobId))
Expand All @@ -81,9 +73,9 @@ def _(slurmctld, script_path):
# remove the slurm output from the jobs folder
slurmctld.remove_job_output(jobId, outputFilePath, errorFilePath)

@then(parsers.parse('the job is {expectedJobState}'))
@then(parsers.parse('the job state is {expectedJobState}'))
def _(slurmctld, jobId, expectedJobState):
"""the job completes successfully."""
"""the job state is <expectedJobState>"""
jobState, out = slurmctld.get_final_job_state(jobId)

if expectedJobState == "COMPLETED" and jobState == "FAILED":
Expand Down
53 changes: 32 additions & 21 deletions testsuite/integration/src/tests/dws_bb_plugin/test_dws_states.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,49 +30,60 @@
scenarios("test_dws_states.feature")

@when('a Workflow is created for the job')
@then('the workflow still exists')
def _(k8s, jobId):
"""a Workflow is created for the job."""
workflow = Workflow(k8s, jobId)
assert workflow.data != None, "Workflow for Job: " + str(jobId) + " not found"

yield

@when('the job is canceled with the hurry flag set to <hurry_flag>')
def _():
"""the job is canceled with the hurry flag set to <hurry_flag>."""
raise NotImplementedError
# attempt to delete workflow if it still exists
try:
workflow.delete()
except:
pass

@when('the job\'s temporary Workflow is not found')
def _():
"""the job's temporary Workflow is not found."""
raise NotImplementedError
@when(parsers.parse('the Workflow status becomes {status:l}'))
def _(slurmctld, jobId, status):
    """the Workflow status becomes <status>"""
    # Single-shot check of the Workflow status reported via slurmctld.
    assert slurmctld.get_workflow_status(jobId)["status"] == status

@when('the job is canceled')
def _(slurmctld, jobId):
    """the job is canceled"""
    # NOTE(review): the second argument is presumably the hurry flag
    # (False = no hurry) — confirm against cancel_job's definition.
    slurmctld.cancel_job(jobId, False)

@when(parsers.parse('the Workflow and job progress to the {state:l} state'))
@then(parsers.parse('the Workflow and job progress to the {state:l} state'))
def _(k8s, slurmctld, jobId, state):
"""the Workflow and job progress to the <state> state."""
workflow = Workflow(k8s, jobId)
workflow.wait_until(
"the state the workflow is transitioning to",
lambda wf: wf.data["status"]["state"], state
lambda wf: wf.data["status"]["state"] == state and wf.data["status"]["status"] in ["Error", "DriverWait"]
)
print("job %s progressed to state %s" % (str(jobId),state))

jobStatus = slurmctld.get_workflow_status(jobId)
assert jobStatus["desiredState"] == state, "Incorrect desired state: " + str(jobStatus)
assert jobStatus["currentState"] == state, "Incorrect current state: " + str(jobStatus)
assert jobStatus["status"] == "DriverWait", "Incorrect status: " + str(jobStatus)
assert jobStatus["status"] == workflow.data["status"]["status"], "Incorrect status: " + str(jobStatus)

# Set driver status to completed so the workflow can progress to the next state
updateRequired = False
for driverStatus in workflow.data["status"]["drivers"]:
if driverStatus["driverID"] == "tester" and state in driverStatus["watchState"]:
if driverStatus["driverID"] == "tester" and state in driverStatus["watchState"] and driverStatus["status"] == "Pending":
print("updating job %s to complete state %s" % (str(jobId), state))
driverStatus["completed"] = True
driverStatus["status"] = "Completed"
updateRequired = True

workflow.save_driver_statuses()

@then('the Workflow\'s hurry flag is set to <hurry_flag>')
def _():
"""the Workflow's hurry flag is set to <hurry_flag>."""
raise NotImplementedError
if updateRequired:
workflow.save_driver_statuses()

@then('the job shows an error with message "{message}"')
def _():
"""the job shows an error with message "{message}"}"."""
raise NotImplementedError
@then(parsers.parse("the job's system comment contains the following:\n{message}"))
def _(slurmctld, jobId, message):
    """the job's system comment contains <message>."""
    # NOTE(review): `out` is the second value from get_final_job_state;
    # presumably it includes the job's system comment text — verify against
    # that helper's implementation.
    _,out = slurmctld.get_final_job_state(jobId)
    assert message in out
Loading