Added integration tests for error scenarios and added error observabi…

…lity to plugin (#30) * Using dws-test-driver for DWS state progression integration tests Signed-off-by: Nathan Lee <[email protected]> * Fixed integration test errors Signed-off-by: Nathan Lee <[email protected]> * code review changes Signed-off-by: Nathan Lee <[email protected]> * Added integration tests for error scenarios Signed-off-by: Nathan Lee <[email protected]> * Updated dws-test-driver to main branch HEAD Signed-off-by: Nathan Lee <[email protected]> * Code review Signed-off-by: Nathan Lee <[email protected]> * Remove option to run in real slurm or k8s Signed-off-by: Nathan Lee <[email protected]> * Refactored unit test before adding error scenarios Signed-off-by: Nathan Lee <[email protected]> * Added driver error observability Signed-off-by: Nathan Lee <[email protected]> * Using dws-test-driver for DWS state progression integration tests (#28) * Using dws-test-driver for DWS state progression integration tests Signed-off-by: Nathan Lee <[email protected]> * Fixed integration test errors Signed-off-by: Nathan Lee <[email protected]> * code review changes Signed-off-by: Nathan Lee <[email protected]> * Updated dws-test-driver to main branch HEAD Signed-off-by: Nathan Lee <[email protected]> * Code review Signed-off-by: Nathan Lee <[email protected]> --------- Signed-off-by: Nathan Lee <[email protected]> * Pre-code review fixes Signed-off-by: Nathan Lee <[email protected]> * PR change Signed-off-by: Nathan Lee <[email protected]> * Update submodules Signed-off-by: Nathan Lee <[email protected]> * Updating dws submodules Signed-off-by: Nathan Lee <[email protected]> * Removed unused test function Signed-off-by: Nathan Lee <[email protected]> * Added sleep before cancelling job Signed-off-by: Nathan Lee <[email protected]> * Added documentation about test error transformation Signed-off-by: Nathan Lee <[email protected]> --------- Signed-off-by: Nathan Lee <[email protected]>
DataWorkflowServices · Feb 10, 2023 · 0dae837 · 0dae837
1 parent 9031a8c
commit 0dae837
Show file tree

Hide file tree

Showing 14 changed files with 373 additions and 433 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -1,11 +1,11 @@
 [submodule "testsuite/submodules/dws"]
 	path = testsuite/submodules/dws
 	url = https://github.com/HewlettPackard/dws.git
-	branch = master
 [submodule "testsuite/submodules/slurm-docker-cluster"]
 	path = testsuite/submodules/slurm-docker-cluster
 	url = [email protected]:DataWorkflowServices/slurm-docker-cluster.git
         branch = master
 [submodule "testsuite/submodules/dws-test-driver"]
 	path = testsuite/submodules/dws-test-driver
 	url = [email protected]:DataWorkflowServices/dws-test-driver.git
+	branch = main
diff --git a/Dockerfile b/Dockerfile
@@ -10,13 +10,11 @@ COPY testsuite/submodules/dws /dws
 FROM testbase AS testrun
 
 WORKDIR /
-COPY testsuite/unit/bin /bin
 COPY testsuite/unit/luacov.lua /.luacov
 COPY testsuite/unit/output.lua /output.lua
 COPY src /
 COPY testsuite/unit/src/burst_buffer/dws-test.lua /
 
-ENV MOCK_SLURM yes
 RUN busted -o output.lua -Xoutput junit.xml --verbose --coverage *test.lua || \
     touch testsFailed.indicator
 

diff --git a/Makefile b/Makefile
@@ -24,16 +24,9 @@ test: $(find src -type f) $(find testsuite/unit/src -type f) Dockerfile
 	docker buildx build $(NOCACHE) $(PROGRESS) --target test -t test .
 
 OUTPUT_HANDLER = --output TAP
-
 TAG ?=  # specify a string like TAG="-t mytag"
-
-test-mocks: VALIDATOR ?= testsuite/unit/bin/validate 
-test-mocks: CRDFILE=testsuite/submodules/dws/config/crd/bases/dws.cray.hpe.com_workflows.yaml
-test-mocks:
-	MOCK_SLURM=yes CRDFILE=$(CRDFILE) VALIDATOR=$(VALIDATOR) busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua
-
-test-realk8s:
-	MOCK_SLURM=yes REAL_K8S=yes busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua
+test-no-docker:
+	busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua
 
 integration-test: $(find testsuite/integration/src -type f) testsuite/integration/Dockerfile
 	cd testsuite/integration && make setup test clean
diff --git a/src/burst_buffer/burst_buffer.lua b/src/burst_buffer/burst_buffer.lua
@@ -241,6 +241,69 @@ function DWS:get_current_state()
 	return ret, status
 end
 
+function _parse_driver_statuses(text, iterator, status_list, status_info)
+	if iterator == nil then
+		iterator = text:gmatch("[^\n]+")
+	end
+
+	if status_list == nil then
+		status_list = {}
+	end
+
+	local line = iterator()
+	if line == nil  and status_info ~= nil then
+		table.insert(status_list, status_info)
+		return status_list
+	elseif line == nil then
+		return status_list
+	end
+
+	if line == "===" then
+		if status_info ~= nil then
+			table.insert(status_list, status_info)
+		end
+
+		status_info = {}
+		status_info["status"] = iterator()
+		status_info["driverID"] = iterator()
+		status_info["error"] = ""
+		return _parse_driver_statuses(text, iterator, status_list, status_info)
+	end
+
+	if status_info ~= nil then
+		status_info["error"] = status_info["error"] .. line .. "\n"
+		return _parse_driver_statuses(text, iterator, status_list, status_info)
+	end
+
+	return status_list
+end
+
+-- DWS:get_driver_errors will collect driver errors from the Workflow resource
+-- with respect to the given state.
+function DWS:get_driver_errors(state)
+	local error_list = {}
+	local jsonpath = [[{range .status.drivers[?(@.watchState=="]].. state ..[[")]}==={"\n"}{@.status}{"\n"}{@.driverID}{"\n"}{@.error}{"\n"}{end}]]
+	local ret, output = self:get_jsonpath(jsonpath)
+	if ret == false then
+		return ret, "", "could not get driver errors: " .. output
+	end
+
+	driver_statuses = _parse_driver_statuses(output)
+
+	errors = ""
+	for _, driver_status in ipairs(driver_statuses) do
+		if driver_status["status"] == "Error" then
+			if errors ~= "" then
+				errors = errors .. "\n"
+			end
+
+			errors = errors .. driver_status["driverID"] .. ": " .. driver_status["error"]
+		end
+	end
+
+	return true, errors
+end
+
 -- DWS:get_hurry will get the hurry flag of the Workflow resource.
 -- On success this returns true and a boolean for the value of the hurry flag.
 -- On failure this returns false and the output of the kubectl command.
@@ -273,7 +336,11 @@ function DWS:wait_for_status_complete(max_passes)
 		if status["desiredState"] == status["currentState"] and status["status"] == "Completed" then
 			return true, status
 		elseif status["status"] == "Error" then
-			return false, empty, string.format("Error in Workflow %s", self.name)
+			local ret, driver_errors, err = self:get_driver_errors(status["desiredState"])
+			if ret ~= true then
+				return ret, empty, err
+			end
+			return false, empty, "DWS driver error(s):\n" .. driver_errors
 		end
 		os.execute("sleep 1")
 		if max_passes > 0 then

diff --git a/testsuite/integration/src/features/test_dws_states.feature b/testsuite/integration/src/features/test_dws_states.feature
@@ -36,82 +36,77 @@ Feature: Data Workflow Services State Progression
 
         When the job is run
         And a Workflow is created for the job
-        #Then the job's temporary Workflow is not found
         Then the Workflow and job progress to the Proposal state
         And the Workflow and job progress to the Setup state
         And the Workflow and job progress to the DataIn state
         And the Workflow and job progress to the PreRun state
         And the Workflow and job progress to the PostRun state
         And the Workflow and job progress to the DataOut state
         And the Workflow and job progress to the Teardown state
-        And the job is COMPLETED
+        And the job state is COMPLETED
 
-    @todo
-    Scenario: The DWS-BB Plugin can handle DWS driver errors
+    # DWS does not allow spaces in key/value pairs in directives. To skirt around this
+    # constraint, the dws-test-driver replaces underscores ("_") in the message value with
+    # spaces. This ensures that the dws-slurm-plugin can handle whitespace in error messages
+    # It also makes it easier to check that the error is included in scontrol output.
+    Scenario Outline: The DWS-BB Plugin can handle DWS driver errors before being canceled
         Given a job script:
             #!/bin/bash
-
-            #DW <state> action=error message=TEST_ERROR
+            
+            #DW <workflowState> action=error message=TEST_ERROR
             #DW Teardown action=wait
             /bin/hostname
 
         When the job is run
         And a Workflow is created for the job
+        And the Workflow and job report errors at the <workflowState> state
+        And the job is canceled
         Then the Workflow and job progress to the Teardown state
-        And the job shows an error with message "TEST ERROR"
+        And the job's system comment contains the following:
+            TEST ERROR
 
         Examples:
             # *** HEADER ***
-            | state    |
+            | workflowState |
             # *** VALUES ***
-            | Proposal |
-            | Setup    |
-            | DataIn   |
-            | PreRun   |
-            | PostRun  |
-            | DataOut  |
+            | Proposal      |
+            | Setup         |
+            | DataIn        |
+            | PostRun       | 
+            | DataOut       | 
 
-    @todo
-    Scenario: The DWS-BB Plugin can handle a DWS driver error during Teardown
+    # With the exception of PreRun, states will need to be canceled with the
+    # "--hurry" flag to transition to the Teardown state. If 
+    # "Flags=TeardownFailure" is set in burst_buffer.conf, then all states will
+    # transition to Teardown without needing to be canceled
+    Scenario Outline: The DWS-BB Plugin can handle DWS driver errors
         Given a job script:
             #!/bin/bash
-
-            #DW Teardown action=error message=TEST_ERROR
+
+            #DW <workflowState> action=error message=TEST_ERROR
+            #DW Teardown action=wait
             /bin/hostname
 
         When the job is run
         And a Workflow is created for the job
-        Then the job shows an error with message "TEST ERROR"
+        Then the Workflow and job progress to the Teardown state
+        And the job's system comment contains the following:
+            TEST ERROR
+
+        Examples:
+            # *** HEADER ***
+            | workflowState |
+            # *** VALUES ***
+            | PreRun        |
 
-    @todo
-    Scenario: The DWS-BB Plugin can cancel jobs
+    Scenario: The DWS-BB Plugin can handle DWS driver errors during Teardown
         Given a job script:
             #!/bin/bash
-
-            #DW <state> action=wait
-            #DW Teardown action=wait
+
+            #DW Teardown action=error message=TEST_ERROR
             /bin/hostname
 
         When the job is run
-        And a Workflow is created for the job
-        And the Workflow and job progress to the <state> state
-        And the job is canceled with the hurry flag set to <hurry_flag>
-        Then the Workflow and job progress to the Teardown state
-        And the Workflow's hurry flag is set to <hurry_flag>
-
-        Examples:
-            # *** HEADER ***
-            | state    | hurry_flag |
-            # *** VALUES ***
-            | Proposal | false      |
-            | Setup    | false      |
-            | DataIn   | false      |
-            | PreRun   | false      |
-            | PostRun  | false      |
-            | DataOut  | false      |
-            | Proposal | true       |
-            | Setup    | true       |
-            | DataIn   | true       |
-            | PreRun   | true       |
-            | PostRun  | true       |
-            | DataOut  | true       |
+        Then the job's system comment contains the following:
+            TEST ERROR
+        And the workflow still exists
diff --git a/testsuite/integration/src/features/test_environment.feature b/testsuite/integration/src/features/test_environment.feature
@@ -16,7 +16,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-
 Feature: Integration test environment
     Verify the integration test environment has been setup correctly
 
@@ -35,9 +34,9 @@ Feature: Integration test environment
             srun -l /bin/hostname
             srun -l /bin/pwd
         When the job is run
-        Then the job is COMPLETED
+        Then the job state is COMPLETED
 
     Scenario: Kubernetes and slurm are connected
         Given the kubernetes cluster kube-system UID
         When the kube-system UID is queried from slurmctld
-        Then the UIDs match and the cluster is the same
+        Then the UIDs match and the cluster is the same
diff --git a/testsuite/integration/src/tests/conftest.py b/testsuite/integration/src/tests/conftest.py
@@ -52,7 +52,7 @@ def pytest_bdd_apply_tag(tag, function):
 
 @given(parsers.parse('a job script:\n{script}'), target_fixture="script_path")
 def _(script):
-    """a simple job script: <script>"""
+    """a job script: <script>"""
     path = "/jobs/" + secrets.token_hex(5) + "-job.sh"
     with open(path, "w") as file:
         file.write(script)
@@ -64,14 +64,6 @@ def _(script):
 @when('the job is run', target_fixture="jobId")
 def _(slurmctld, script_path):
     """the job is run."""
-    _,out = slurmctld.exec_run("sinfo -lNe")
-    print(out)
-
-    _,out = slurmctld.exec_run("scontrol show node")
-    print(out)
-
-    _,out = slurmctld.exec_run("kubectl describe deployment -n dws-operator-system dws-operator-controller-manager")
-    print(out)
 
     jobId, outputFilePath, errorFilePath = slurmctld.submit_job(script_path)
     print("submitted job: " + str(jobId))
@@ -81,9 +73,9 @@ def _(slurmctld, script_path):
     # remove the slurm output from the jobs folder
     slurmctld.remove_job_output(jobId, outputFilePath, errorFilePath)
 
-@then(parsers.parse('the job is {expectedJobState}'))
+@then(parsers.parse('the job state is {expectedJobState}'))
 def _(slurmctld, jobId, expectedJobState):
-    """the job completes successfully."""
+    """the job state is <expectedJobState>"""
     jobState, out = slurmctld.get_final_job_state(jobId)
 
     if expectedJobState == "COMPLETED" and jobState == "FAILED":