From 0dae83753ee3f4e363d82eff662bf929be714a79 Mon Sep 17 00:00:00 2001 From: Nathan Lee <113044550+nathandotleeathpe@users.noreply.github.com> Date: Fri, 10 Feb 2023 13:17:03 -0600 Subject: [PATCH] Added integration tests for error scenarios and added error observability to plugin (#30) * Using dws-test-driver for DWS state progression integration tests Signed-off-by: Nathan Lee * Fixed integration test errors Signed-off-by: Nathan Lee * code review changes Signed-off-by: Nathan Lee * Added integration tests for error scenarios Signed-off-by: Nathan Lee * Updated dws-test-driver to main branch HEAD Signed-off-by: Nathan Lee * Code review Signed-off-by: Nathan Lee * Remove option to run in real slurm or k8s Signed-off-by: Nathan Lee * Refactored unit test before adding error scenarios Signed-off-by: Nathan Lee * Added driver error observability Signed-off-by: Nathan Lee * Using dws-test-driver for DWS state progression integration tests (#28) * Using dws-test-driver for DWS state progression integration tests Signed-off-by: Nathan Lee * Fixed integration test errors Signed-off-by: Nathan Lee * code review changes Signed-off-by: Nathan Lee * Updated dws-test-driver to main branch HEAD Signed-off-by: Nathan Lee * Code review Signed-off-by: Nathan Lee --------- Signed-off-by: Nathan Lee * Pre-code review fixes Signed-off-by: Nathan Lee * PR change Signed-off-by: Nathan Lee * Update submodules Signed-off-by: Nathan Lee * Updating dws submodules Signed-off-by: Nathan Lee * Removed unused test function Signed-off-by: Nathan Lee * Added sleep before cancelling job Signed-off-by: Nathan Lee * Added documentation about test error transformation Signed-off-by: Nathan Lee --------- Signed-off-by: Nathan Lee --- .gitmodules | 2 +- Dockerfile | 2 - Makefile | 11 +- src/burst_buffer/burst_buffer.lua | 69 ++- .../src/features/test_dws_states.feature | 87 ++-- .../src/features/test_environment.feature | 5 +- testsuite/integration/src/tests/conftest.py | 14 +- .../tests/dws_bb_plugin/test_dws_states.py | 77 +++- .../src/tests/dws_bb_plugin/workflow.py | 28 +- testsuite/integration/src/tests/slurmctld.py | 37 +- testsuite/submodules/dws-test-driver | 2 +- testsuite/submodules/slurm-docker-cluster | 2 +- testsuite/unit/bin/validate | 37 -- testsuite/unit/src/burst_buffer/dws-test.lua | 433 +++++++----------- 14 files changed, 373 insertions(+), 433 deletions(-) delete mode 100755 testsuite/unit/bin/validate diff --git a/.gitmodules b/.gitmodules index 36f39db..1cff8c4 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,7 +1,6 @@ [submodule "testsuite/submodules/dws"] path = testsuite/submodules/dws url = https://github.com/HewlettPackard/dws.git - branch = master [submodule "testsuite/submodules/slurm-docker-cluster"] path = testsuite/submodules/slurm-docker-cluster url = git@github.com:DataWorkflowServices/slurm-docker-cluster.git @@ -9,3 +8,4 @@ [submodule "testsuite/submodules/dws-test-driver"] path = testsuite/submodules/dws-test-driver url = git@github.com:DataWorkflowServices/dws-test-driver.git + branch = main diff --git a/Dockerfile b/Dockerfile index f5ba6ac..174dce3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,13 +10,11 @@ COPY testsuite/submodules/dws /dws FROM testbase AS testrun WORKDIR / -COPY testsuite/unit/bin /bin COPY testsuite/unit/luacov.lua /.luacov COPY testsuite/unit/output.lua /output.lua COPY src / COPY testsuite/unit/src/burst_buffer/dws-test.lua / -ENV MOCK_SLURM yes RUN busted -o output.lua -Xoutput junit.xml --verbose --coverage *test.lua || \ touch testsFailed.indicator diff --git a/Makefile b/Makefile index 1d331bc..8e4fe37 100644 --- a/Makefile +++ b/Makefile @@ -24,16 +24,9 @@ test: $(find src -type f) $(find testsuite/unit/src -type f) Dockerfile docker buildx build $(NOCACHE) $(PROGRESS) --target test -t test . OUTPUT_HANDLER = --output TAP - TAG ?= # specify a string like TAG="-t mytag" - -test-mocks: VALIDATOR ?= testsuite/unit/bin/validate -test-mocks: CRDFILE=testsuite/submodules/dws/config/crd/bases/dws.cray.hpe.com_workflows.yaml -test-mocks: - MOCK_SLURM=yes CRDFILE=$(CRDFILE) VALIDATOR=$(VALIDATOR) busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua - -test-realk8s: - MOCK_SLURM=yes REAL_K8S=yes busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua +test-no-docker: + busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua integration-test: $(find testsuite/integration/src -type f) testsuite/integration/Dockerfile cd testsuite/integration && make setup test clean diff --git a/src/burst_buffer/burst_buffer.lua b/src/burst_buffer/burst_buffer.lua index 31db12e..0a77f59 100644 --- a/src/burst_buffer/burst_buffer.lua +++ b/src/burst_buffer/burst_buffer.lua @@ -241,6 +241,69 @@ function DWS:get_current_state() return ret, status end +function _parse_driver_statuses(text, iterator, status_list, status_info) + if iterator == nil then + iterator = text:gmatch("[^\n]+") + end + + if status_list == nil then + status_list = {} + end + + local line = iterator() + if line == nil and status_info ~= nil then + table.insert(status_list, status_info) + return status_list + elseif line == nil then + return status_list + end + + if line == "===" then + if status_info ~= nil then + table.insert(status_list, status_info) + end + + status_info = {} + status_info["status"] = iterator() + status_info["driverID"] = iterator() + status_info["error"] = "" + return _parse_driver_statuses(text, iterator, status_list, status_info) + end + + if status_info ~= nil then + status_info["error"] = status_info["error"] .. line .. "\n" + return _parse_driver_statuses(text, iterator, status_list, status_info) + end + + return status_list +end + +-- DWS:get_driver_errors will collect driver errors from the Workflow resource +-- with respect to the given state. +function DWS:get_driver_errors(state) + local error_list = {} + local jsonpath = [[{range .status.drivers[?(@.watchState=="]].. state ..[[")]}==={"\n"}{@.status}{"\n"}{@.driverID}{"\n"}{@.error}{"\n"}{end}]] + local ret, output = self:get_jsonpath(jsonpath) + if ret == false then + return ret, "", "could not get driver errors: " .. output + end + + driver_statuses = _parse_driver_statuses(output) + + errors = "" + for _, driver_status in ipairs(driver_statuses) do + if driver_status["status"] == "Error" then + if errors ~= "" then + errors = errors .. "\n" + end + + errors = errors .. driver_status["driverID"] .. ": " .. driver_status["error"] + end + end + + return true, errors +end + -- DWS:get_hurry will get the hurry flag of the Workflow resource. -- On success this returns true and a boolean for the value of the hurry flag. -- On failure this returns false and the output of the kubectl command. @@ -273,7 +336,11 @@ function DWS:wait_for_status_complete(max_passes) if status["desiredState"] == status["currentState"] and status["status"] == "Completed" then return true, status elseif status["status"] == "Error" then - return false, empty, string.format("Error in Workflow %s", self.name) + local ret, driver_errors, err = self:get_driver_errors(status["desiredState"]) + if ret ~= true then + return ret, empty, err + end + return false, empty, "DWS driver error(s):\n" .. driver_errors end os.execute("sleep 1") if max_passes > 0 then diff --git a/testsuite/integration/src/features/test_dws_states.feature b/testsuite/integration/src/features/test_dws_states.feature index 1bb26e4..20439c1 100644 --- a/testsuite/integration/src/features/test_dws_states.feature +++ b/testsuite/integration/src/features/test_dws_states.feature @@ -36,7 +36,6 @@ Feature: Data Workflow Services State Progression When the job is run And a Workflow is created for the job - #Then the job's temporary Workflow is not found Then the Workflow and job progress to the Proposal state And the Workflow and job progress to the Setup state And the Workflow and job progress to the DataIn state @@ -44,74 +43,70 @@ Feature: Data Workflow Services State Progression And the Workflow and job progress to the PostRun state And the Workflow and job progress to the DataOut state And the Workflow and job progress to the Teardown state - And the job is COMPLETED + And the job state is COMPLETED - @todo - Scenario: The DWS-BB Plugin can handle DWS driver errors + # DWS does not allow spaces in key/value pairs in directives. To skirt around this + # constraint, the dws-test-driver replaces underscores ("_") in the message value with + # spaces. This ensures that the dws-slurm-plugin can handle whitespace in error messages + # It also makes it easier to check that the error is included in scontrol output. + Scenario Outline: The DWS-BB Plugin can handle DWS driver errors before being canceled Given a job script: #!/bin/bash - - #DW action=error message=TEST_ERROR + + #DW action=error message=TEST_ERROR #DW Teardown action=wait /bin/hostname When the job is run And a Workflow is created for the job + And the Workflow and job report errors at the state + And the job is canceled Then the Workflow and job progress to the Teardown state - And the job shows an error with message "TEST ERROR" + And the job's system comment contains the following: + TEST ERROR Examples: # *** HEADER *** - | state | + | workflowState | # *** VALUES *** - | Proposal | - | Setup | - | DataIn | - | PreRun | - | PostRun | - | DataOut | + | Proposal | + | Setup | + | DataIn | + | PostRun | + | DataOut | - @todo - Scenario: The DWS-BB Plugin can handle a DWS driver error during Teardown + # With the exception of PreRun, states will need to be canceled with the + # "--hurry" flag to transition to the Teardown state. If + # "Flags=TeardownFailure" is set in burst_buffer.conf, then all states will + # transition to Teardown without needing to be canceled + Scenario Outline: The DWS-BB Plugin can handle DWS driver errors Given a job script: #!/bin/bash - - #DW Teardown action=error message=TEST_ERROR + + #DW action=error message=TEST_ERROR + #DW Teardown action=wait /bin/hostname When the job is run And a Workflow is created for the job - Then the job shows an error with message "TEST ERROR" + Then the Workflow and job progress to the Teardown state + And the job's system comment contains the following: + TEST ERROR + + Examples: + # *** HEADER *** + | workflowState | + # *** VALUES *** + | PreRun | - @todo - Scenario: The DWS-BB Plugin can cancel jobs + Scenario: The DWS-BB Plugin can handle DWS driver errors during Teardown Given a job script: #!/bin/bash - - #DW action=wait - #DW Teardown action=wait + + #DW Teardown action=error message=TEST_ERROR /bin/hostname When the job is run - And a Workflow is created for the job - And the Workflow and job progress to the state - And the job is canceled with the hurry flag set to - Then the Workflow and job progress to the Teardown state - And the Workflow's hurry flag is set to - - Examples: - # *** HEADER *** - | state | hurry_flag | - # *** VALUES *** - | Proposal | false | - | Setup | false | - | DataIn | false | - | PreRun | false | - | PostRun | false | - | DataOut | false | - | Proposal | true | - | Setup | true | - | DataIn | true | - | PreRun | true | - | PostRun | true | - | DataOut | true | \ No newline at end of file + Then the job's system comment contains the following: + TEST ERROR + And the workflow still exists diff --git a/testsuite/integration/src/features/test_environment.feature b/testsuite/integration/src/features/test_environment.feature index af09b04..d12e9e4 100644 --- a/testsuite/integration/src/features/test_environment.feature +++ b/testsuite/integration/src/features/test_environment.feature @@ -16,7 +16,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # - Feature: Integration test environment Verify the integration test environment has been setup correctly @@ -35,9 +34,9 @@ Feature: Integration test environment srun -l /bin/hostname srun -l /bin/pwd When the job is run - Then the job is COMPLETED + Then the job state is COMPLETED Scenario: Kubernetes and slurm are connected Given the kubernetes cluster kube-system UID When the kube-system UID is queried from slurmctld - Then the UIDs match and the cluster is the same + Then the UIDs match and the cluster is the same \ No newline at end of file diff --git a/testsuite/integration/src/tests/conftest.py b/testsuite/integration/src/tests/conftest.py index 689c247..894ac3c 100644 --- a/testsuite/integration/src/tests/conftest.py +++ b/testsuite/integration/src/tests/conftest.py @@ -52,7 +52,7 @@ def pytest_bdd_apply_tag(tag, function): @given(parsers.parse('a job script:\n{script}'), target_fixture="script_path") def _(script): - """a simple job script: