Update tests to support fatal errors #40

Merged · 2 commits · Aug 29, 2023
2 changes: 1 addition & 1 deletion testsuite/integration/README.md
@@ -15,7 +15,7 @@ make setup
```

Build a version of the integration test container using only the `testbase`
stage, so it has the tests but doesn't automatically run the tests:
stage, so it has the test environment but doesn't automatically run the tests:

```console
docker build -t local/integration-test:test --target testbase --no-cache .
53 changes: 32 additions & 21 deletions testsuite/integration/src/features/test_dws_states.feature
@@ -1,5 +1,5 @@
#
# Copyright 2022 Hewlett Packard Enterprise Development LP
# Copyright 2022-2023 Hewlett Packard Enterprise Development LP
# Other additional copyright holders may be indicated within.
#
# The entirety of this work is licensed under the Apache License,
@@ -17,6 +17,7 @@
# limitations under the License.
#

@dws_states
Feature: Data Workflow Services State Progression
Verify that the DWS-Slurm Burst Buffer Plugin progresses through Data
Workflow Services states
@@ -35,8 +36,8 @@ Feature: Data Workflow Services State Progression
/bin/hostname

When the job is run
And a Workflow is created for the job
Then the Workflow and job progress to the Proposal state
Then a Workflow has been created for the job

Review comment: What's the difference between Then/And in this case?

Contributor Author:
As I was reading the list of statements, I thought it felt like an arbitrary thing, as far as when to switch from the "When" list to the "Then" list. And so I pulled "Then" up a little higher in the list because that's how I felt that day.
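
For context, pytest-bdd lets a single step function be registered under more than one keyword: the step implementation for this line later in this diff carries both `@when` and `@then` decorators, so the reworded feature line still resolves to the same code. A minimal sketch of that pattern (not the PR's exact code):

```python
# Sketch only: one step function registered under both a When phrasing and a
# Then phrasing, mirroring the decorator stacking used in test_dws_states.py.
from pytest_bdd import then, when

@when("a Workflow is created for the job")
@then("a Workflow has been created for the job")
def _workflow_exists():
    # Look up the Workflow for the job and assert that it exists
    # (the real step also yields and cleans up the Workflow afterwards).
    ...
```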

And the Workflow and job progress to the Proposal state
And the Workflow and job progress to the Setup state
And the Workflow and job progress to the DataIn state
And the Workflow and job progress to the PreRun state
@@ -49,21 +50,21 @@
# constraint, the dws-test-driver replaces underscores ("_") in the message value with
# spaces. This ensures that the dws-slurm-plugin can handle whitespace in error messages
# It also makes it easier to check that the error is included in scontrol output.
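
To illustrate the substitution described in the comment above (an illustration only; the dws-test-driver's actual implementation is not part of this PR), the message used by these scenarios would be reported roughly as follows:

```python
# Hypothetical illustration of the underscore-to-space substitution described
# above; not the dws-test-driver's actual code.
message = "TEST_FATAL_ERROR"           # value passed via "#DW ... message=..."
reported = message.replace("_", " ")   # what the driver reports for the error
assert reported == "TEST FATAL ERROR"  # the text the scenarios expect in scontrol output
```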
Scenario Outline: The DWS-BB Plugin can handle DWS driver errors before being canceled
Scenario Outline: The DWS-BB Plugin can handle fatal driver errors before being canceled
Given a job script:
#!/bin/bash

#DW <workflowState> action=error message=TEST_ERROR
#DW <workflowState> action=error message=TEST_FATAL_ERROR severity=Fatal
#DW Teardown action=wait
/bin/hostname

When the job is run
And a Workflow is created for the job
And the Workflow and job report errors at the <workflowState> state
Then a Workflow has been created for the job
And the Workflow and job report fatal errors at the <workflowState> state
And the job is canceled
Then the Workflow and job progress to the Teardown state
And the job's system comment contains the following:
TEST ERROR
And the Workflow and job progress to the Teardown state
And the job's final system comment contains the following:
TEST FATAL ERROR

Examples:
# *** HEADER ***
@@ -79,34 +80,44 @@
# "--hurry" flag to transition to the Teardown state. If
# "Flags=TeardownFailure" is set in burst_buffer.conf, then all states will
# transition to Teardown without needing to be canceled
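
For reference, the burst_buffer.conf behavior referred to in the comment above is enabled with a single setting; an illustrative fragment (the rest of the configuration file is omitted):

```
# burst_buffer.conf (fragment, illustrative only)
Flags=TeardownFailure
```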
Scenario Outline: The DWS-BB Plugin can handle DWS driver errors
Scenario Outline: The DWS-BB Plugin can handle fatal driver errors for PreRun
Given a job script:
#!/bin/bash

#DW <workflowState> action=error message=TEST_ERROR
#DW <workflowState> action=error message=TEST_FATAL_ERROR severity=Fatal
#DW Teardown action=wait
/bin/hostname

When the job is run
And a Workflow is created for the job
Then the Workflow and job progress to the Teardown state
And the job's system comment contains the following:
TEST ERROR
Then a Workflow has been created for the job
And the Workflow reports a fatal error in the <workflowState> state
And the Workflow and job progress to the Teardown state
# Slurm moved it from PreRun/Error to Teardown without canceling
# the job. So the driver (this test) must cancel it.
And the job is canceled
And the job's final system comment contains the following:
TEST FATAL ERROR

Examples:
# *** HEADER ***
| workflowState |
# *** VALUES ***
| PreRun |

Scenario: The DWS-BB Plugin can handle DWS driver errors during Teardown
Scenario: The DWS-BB Plugin can handle fatal driver errors during Teardown
Given a job script:
#!/bin/bash

#DW Teardown action=error message=TEST_ERROR
#DW Teardown action=error message=TEST_FATAL_ERROR severity=Fatal
/bin/hostname

When the job is run
Then the job's system comment contains the following:
TEST ERROR
And the workflow still exists
Then a Workflow has been created for the job
And the Workflow reports a fatal error in the Teardown state
And the job's intermediate system comment contains the following:
TEST FATAL ERROR
# Eventually the driver (this test) must work through the Teardown
# issues and complete that step. Slurm has already marked the job
# as completed and is now looping over slurm_bb_job_teardown() in
# burst_buffer.lua.
And the Workflow error is cleared from the Teardown state
6 changes: 4 additions & 2 deletions testsuite/integration/src/features/test_environment.feature
@@ -1,5 +1,5 @@
#
# Copyright 2022 Hewlett Packard Enterprise Development LP
# Copyright 2022-2023 Hewlett Packard Enterprise Development LP
# Other additional copyright holders may be indicated within.
#
# The entirety of this work is licensed under the Apache License,
@@ -16,6 +16,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

@environment
Feature: Integration test environment
Verify the integration test environment has been setup correctly

@@ -39,4 +41,4 @@ Feature: Integration test environment
Scenario: Kubernetes and slurm are connected
Given the kubernetes cluster kube-system UID
When the kube-system UID is queried from slurmctld
Then the UIDs match and the cluster is the same
Then the UIDs match and the cluster is the same
8 changes: 6 additions & 2 deletions testsuite/integration/src/pytest.ini
@@ -1,5 +1,5 @@
#
# Copyright 2022 Hewlett Packard Enterprise Development LP
# Copyright 2022-2023 Hewlett Packard Enterprise Development LP
# Other additional copyright holders may be indicated within.
#
# The entirety of this work is licensed under the Apache License,
@@ -18,4 +18,8 @@
#

[pytest]
bdd_features_base_dir = features
bdd_features_base_dir = features
markers =
environment
dws_states
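
These registered markers line up with the `@environment` and `@dws_states` tags added to the feature files above, so a subset of the suite can be selected at run time (for example `pytest -m dws_states`), and pytest will not warn about unknown marks. pytest-bdd applies feature tags as marks by default; the `pytest_bdd_apply_tag` hook in conftest.py below can override that mapping.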

8 changes: 3 additions & 5 deletions testsuite/integration/src/tests/conftest.py
@@ -18,19 +18,18 @@
#

import os
import pytest
import secrets
import warnings
import pytest

from .slurmctld import Slurmctld
from kubernetes import client, config
from pytest_bdd import (
given,
scenarios,
parsers,
then,
when,
)
from .slurmctld import Slurmctld

@pytest.fixture
def k8s():
@@ -54,7 +53,7 @@ def pytest_bdd_apply_tag(tag, function):
def _(script):
"""a job script: <script>"""
path = "/jobs/inttest-" + secrets.token_hex(5) + "-job.sh"
with open(path, "w") as file:
with open(path, "w", encoding="utf-8") as file:
file.write(script)

yield path
@@ -87,4 +86,3 @@ def _(slurmctld, jobId, expectedJobState):
return

assert jobState == expectedJobState, "Unexpected Job State: " + jobState + "\n" + out

108 changes: 84 additions & 24 deletions testsuite/integration/src/tests/dws_bb_plugin/test_dws_states.py
@@ -1,5 +1,5 @@
#
# Copyright 2022 Hewlett Packard Enterprise Development LP
# Copyright 2022-2023 Hewlett Packard Enterprise Development LP
# Other additional copyright holders may be indicated within.
#
# The entirety of this work is licensed under the Apache License,
@@ -18,31 +18,32 @@
#

import time
from .workflow import Workflow
import re
from pytest_bdd import (
given,
parsers,
scenarios,
then,
when,
)
from kubernetes import client as k8sclient
from .workflow import Workflow

"""Data Workflow Services State Progression feature tests."""
# Data Workflow Services State Progression feature tests.
scenarios("test_dws_states.feature")

@when('a Workflow is created for the job')
@then('a Workflow has been created for the job')
@then('the workflow still exists')
def _(k8s, jobId):
"""a Workflow is created for the job."""
"""a Workflow has been created for the job."""
workflow = Workflow(k8s, jobId)
assert workflow.data != None, "Workflow for Job: " + str(jobId) + " not found"
assert workflow.data is not None, "Workflow for Job: " + str(jobId) + " not found"

yield

# attempt to delete workflow if it still exists
try:
workflow.delete()
except:
except k8sclient.exceptions.ApiException:
pass

@when(parsers.parse('the Workflow status becomes {status:l}'))
@@ -51,11 +52,12 @@ def _(slurmctld, jobId, status):
workflowStatus = slurmctld.get_workflow_status(jobId)
assert workflowStatus["status"] == status

@when('the job is canceled')
@then('the job is canceled')
def _(slurmctld, jobId):
"""the job is canceled"""
time.sleep(2) # Sleep long enough for bb plugin to poll workflow once or twice
slurmctld.cancel_job(jobId, False)
time.sleep(2) # Sleep long enough for the workflow to be deleted

def verify_job_status(slurmctld, jobId, state, status):
jobStatus = slurmctld.get_workflow_status(jobId)
@@ -71,7 +73,7 @@ def _(k8s, slurmctld, jobId, state):

workflow = Workflow(k8s, jobId)
workflow.wait_until(
"the state the workflow is transitioning to",
f"the workflow transitions to {state}/{expectedStatus}",
lambda wf: wf.data["status"]["state"] == state and wf.data["status"]["status"] == expectedStatus
)
print("job %s progressed to state %s" % (str(jobId),state))
@@ -81,31 +83,89 @@ def _(k8s, slurmctld, jobId, state):
# Set driver status to completed so the workflow can progress to the next state
foundPendingDriverStatus = False
for driverStatus in workflow.data["status"]["drivers"]:
if driverStatus["driverID"] == "tester" and state in driverStatus["watchState"] and driverStatus["status"] == "Pending":
print("updating job %s to complete state %s" % (str(jobId), state))
if driverStatus["driverID"] == "tester" and driverStatus["watchState"] == state and driverStatus["status"] == "Pending":
print("updating workflow %s to complete state %s" % (str(jobId), state))
driverStatus["completed"] = True
driverStatus["status"] = "Completed"
foundPendingDriverStatus = True
break

assert foundPendingDriverStatus, "Driver not found with \"Pending\" status"
workflow.save_driver_statuses()

@then(parsers.parse('the Workflow error is cleared from the {state:l} state'))
def _(k8s, slurmctld, jobId, state):
"""the Workflow error is cleared from the <state> state."""

workflow = Workflow(k8s, jobId)

# Set driver status to completed so the workflow can progress to the next state
foundPendingDriverStatus = False
for driverStatus in workflow.data["status"]["drivers"]:
if driverStatus["driverID"] == "tester" and driverStatus["watchState"] == state and driverStatus["status"] == "Error":
print(f"updating workflow %s to complete state %s" % (str(jobId), state))
driverStatus["completed"] = True
driverStatus["status"] = "Completed"
# The DWS webhook requires that the error message be cleared as well.
del driverStatus["error"]
foundPendingDriverStatus = True
break

assert foundPendingDriverStatus, "Driver not found with \"Pending\" status"
assert foundPendingDriverStatus, "Driver not found with \"Error\" status"
workflow.save_driver_statuses()

@when(parsers.parse('the Workflow and job report errors at the {state:l} state'))
def driver_state_check(workflow, state, expected_status):
found_it = False
print(f"check drivers for state {state} with status {expected_status}")
for driver in workflow.data["status"]["drivers"]:
if driver["driverID"] == "tester" and driver["watchState"] == state:
if driver["status"] == expected_status:
print(f"found driver state {state} with {expected_status}")
found_it = True
else:
print(f"found driver state {state}/{driver['status']}")
break
return found_it

@then(parsers.parse('the Workflow and job report fatal errors at the {state:l} state'))
def _(k8s, slurmctld, jobId, state):
"""the Workflow and job report errors at the <state> state."""

expectedStatus = "Error"
expected_status = "Error"

def driver_check(workflow):
return driver_state_check(workflow, state, expected_status)

workflow = Workflow(k8s, jobId)
workflow.wait_until(
"the state the workflow is transitioning to",
lambda wf: wf.data["status"]["state"] == state and wf.data["status"]["status"] == expectedStatus
f"the workflow {state} state shows a status of {expected_status}",
lambda wf: driver_check(wf) is True
)
print("job %s progressed to state %s" % (str(jobId),state))

verify_job_status(slurmctld, jobId, state, expectedStatus)
verify_job_status(slurmctld, jobId, state, expected_status)

@then(parsers.parse('the Workflow reports a fatal error in the {state:l} state'))
def _(k8s, slurmctld, jobId, state):
"""the Workflow reports a fatal error in the <state> state."""

expected_status = "Error"

def driver_check(workflow):
return driver_state_check(workflow, state, expected_status)

workflow = Workflow(k8s, jobId)
workflow.wait_until(
f"the workflow {state} state shows a status of {expected_status}",
lambda wf: driver_check(wf) is True
)

@then(parsers.parse("the job's system comment contains the following:\n{message}"))
def _(slurmctld, jobId, message):
_,out = slurmctld.get_final_job_state(jobId)
assert message in out
@then(parsers.parse("the job's {disposition:l} system comment contains the following:\n{message}"))
def _(slurmctld, jobId, disposition, message):
assert disposition in ["final", "intermediate"], f"unknown disposition: {disposition}"
must_be_gone = True if disposition == "final" else False
_,out = slurmctld.get_final_job_state(jobId, must_be_gone)
m = re.search(r'\n\s+SystemComment=(.*)\n\s+StdErr=', out, re.DOTALL)
assert m is not None, f"Could not find SystemComment in job state from Slurm\n{out}"
if message in m.group(1):
print(f"Found \"{message}\" in SystemComment")
assert message in m.group(1)
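
The system-comment step above pulls the SystemComment field out of Slurm's `scontrol show job` style output with the regex shown. A small self-contained sketch of that extraction; the sample text here is invented for illustration and is not real scontrol output:

```python
import re

# Invented sample shaped like "scontrol show job" output (illustrative only).
sample = (
    "JobId=42 JobName=inttest\n"
    "   SystemComment=TEST FATAL ERROR\n"
    "   StdErr=/dev/null\n"
)

# Same pattern as the step implementation above: capture everything between
# the SystemComment= field and the StdErr= field.
m = re.search(r'\n\s+SystemComment=(.*)\n\s+StdErr=', sample, re.DOTALL)
assert m is not None, "SystemComment not found"
assert "TEST FATAL ERROR" in m.group(1)
```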