Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added integration tests for error scenarios and added error observability to plugin #30

Merged
merged 21 commits into from
Feb 10, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
c5a29c8
Using dws-test-driver for DWS state progression integration tests
nathandotleeathpe Dec 14, 2022
6cb1711
Fixed integration test errors
nathandotleeathpe Dec 15, 2022
16c9136
code review changes
nathandotleeathpe Dec 17, 2022
37a35e5
Added integration tests for error scenarios
nathandotleeathpe Jan 6, 2023
3c7533d
Updated dws-test-driver to main branch HEAD
nathandotleeathpe Jan 6, 2023
8f31567
Merge branch 'main' into more-tests
nathandotleeathpe Jan 6, 2023
80d9cd7
Code review
nathandotleeathpe Jan 6, 2023
8418bf5
Merge branch 'main' into more-tests
nathandotleeathpe Jan 9, 2023
65ce686
Remove option to run in real slurm or k8s
nathandotleeathpe Jan 30, 2023
6d57479
Refactored unit test before adding error scenarios
nathandotleeathpe Jan 30, 2023
dab85b2
Added driver error observability
nathandotleeathpe Feb 1, 2023
e14e08d
Using dws-test-driver for DWS state progression integration tests (#28)
nathandotleeathpe Feb 2, 2023
69fba72
Merge remote-tracking branch 'upstream/main'
nathandotleeathpe Feb 3, 2023
c2ea3ce
Pre-code review fixes
nathandotleeathpe Feb 6, 2023
3fdce67
Merge remote-tracking branch 'upstream/main'
nathandotleeathpe Feb 6, 2023
bdb437d
PR change
nathandotleeathpe Feb 6, 2023
5e8029a
Update submodules
nathandotleeathpe Feb 6, 2023
b2522a3
Updating dws submodules
nathandotleeathpe Feb 9, 2023
3a93e3f
Removed unused test function
nathandotleeathpe Feb 9, 2023
ed5d138
Added sleep before cancelling job
nathandotleeathpe Feb 10, 2023
161c4f0
Added documentation about test error transformation
nathandotleeathpe Feb 10, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,11 @@ COPY testsuite/submodules/dws /dws
FROM testbase AS testrun

WORKDIR /
COPY testsuite/unit/bin /bin
COPY testsuite/unit/luacov.lua /.luacov
COPY testsuite/unit/output.lua /output.lua
COPY src /
COPY testsuite/unit/src/burst_buffer/dws-test.lua /

ENV MOCK_SLURM yes
RUN busted -o output.lua -Xoutput junit.xml --verbose --coverage *test.lua || \
touch testsFailed.indicator

Expand Down
11 changes: 2 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,9 @@ test: $(find src -type f) $(find testsuite/unit/src -type f) Dockerfile
docker buildx build $(NOCACHE) $(PROGRESS) --target test -t test .

OUTPUT_HANDLER = --output TAP

TAG ?= # specify a string like TAG="-t mytag"

test-mocks: VALIDATOR ?= testsuite/unit/bin/validate
test-mocks: CRDFILE=testsuite/submodules/dws/config/crd/bases/dws.cray.hpe.com_workflows.yaml
test-mocks:
MOCK_SLURM=yes CRDFILE=$(CRDFILE) VALIDATOR=$(VALIDATOR) busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua

test-realk8s:
MOCK_SLURM=yes REAL_K8S=yes busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua
test-no-docker:
busted $(TAG) $(OUTPUT_HANDLER) testsuite/unit/src/burst_buffer/dws-test.lua

integration-test: $(find testsuite/integration/src -type f) testsuite/integration/Dockerfile
cd testsuite/integration && make setup test clean
69 changes: 68 additions & 1 deletion src/burst_buffer/burst_buffer.lua
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,69 @@ function DWS:get_current_state()
return ret, status
end

-- _parse_driver_statuses parses the output of the driver-status jsonpath
-- query (see DWS:get_driver_errors) into a list of status tables.
--
-- The expected input is a series of records, each introduced by a line
-- containing only "===", followed by one status line, one driverID line,
-- and zero or more lines of error text:
--   ===
--   <status>
--   <driverID>
--   <error text ...>
-- Blank lines are skipped by the line iterator, so an empty error field
-- contributes no lines.
--
-- Callers pass only `text`; the remaining arguments are internal recursion
-- state. Returns an array of tables, each with "status", "driverID" and
-- "error" keys; each captured error line keeps a trailing "\n".
--
-- NOTE: declared `local` — this helper was previously an accidental global.
local function _parse_driver_statuses(text, iterator, status_list, status_info)
	if iterator == nil then
		-- First call: build the line iterator over non-empty lines.
		iterator = text:gmatch("[^\n]+")
	end

	if status_list == nil then
		status_list = {}
	end

	local line = iterator()
	if line == nil then
		-- End of input: flush the record in progress, if any.
		if status_info ~= nil then
			table.insert(status_list, status_info)
		end
		return status_list
	end

	if line == "===" then
		-- Start of a new record; flush the previous one first.
		if status_info ~= nil then
			table.insert(status_list, status_info)
		end

		status_info = {}
		status_info["status"] = iterator()
		status_info["driverID"] = iterator()
		status_info["error"] = ""
		return _parse_driver_statuses(text, iterator, status_list, status_info)
	end

	if status_info ~= nil then
		-- Any other line is part of the current record's error text.
		status_info["error"] = status_info["error"] .. line .. "\n"
		return _parse_driver_statuses(text, iterator, status_list, status_info)
	end

	-- Lines before the first "===" are ignored.
	return status_list
end

-- DWS:get_driver_errors will collect driver errors from the Workflow resource
-- with respect to the given state.
-- On success this returns true and a newline-separated string of
-- "<driverID>: <error>" entries for every driver reporting status "Error"
-- (the empty string when there are none).
-- On failure this returns false, an empty string, and an error message.
function DWS:get_driver_errors(state)
	-- Emit one "===" record per driver entry watching the given state; the
	-- record layout is consumed by _parse_driver_statuses.
	local jsonpath = [[{range .status.drivers[?(@.watchState=="]].. state ..[[")]}==={"\n"}{@.status}{"\n"}{@.driverID}{"\n"}{@.error}{"\n"}{end}]]
	local ret, output = self:get_jsonpath(jsonpath)
	if ret == false then
		return ret, "", "could not get driver errors: " .. output
	end

	-- `local` added here and below: these were accidental globals.
	local driver_statuses = _parse_driver_statuses(output)

	local errors = ""
	for _, driver_status in ipairs(driver_statuses) do
		if driver_status["status"] == "Error" then
			-- Separate multiple driver errors with a newline.
			if errors ~= "" then
				errors = errors .. "\n"
			end

			errors = errors .. driver_status["driverID"] .. ": " .. driver_status["error"]
		end
	end

	return true, errors
end

-- DWS:get_hurry will get the hurry flag of the Workflow resource.
-- On success this returns true and a boolean for the value of the hurry flag.
-- On failure this returns false and the output of the kubectl command.
Expand Down Expand Up @@ -273,7 +336,11 @@ function DWS:wait_for_status_complete(max_passes)
if status["desiredState"] == status["currentState"] and status["status"] == "Completed" then
return true, status
elseif status["status"] == "Error" then
return false, empty, string.format("Error in Workflow %s", self.name)
local ret, driver_errors, err = self:get_driver_errors(status["desiredState"])
if ret ~= true then
return ret, empty, err
end
return false, empty, "DWS driver error(s):\n" .. driver_errors
end
os.execute("sleep 1")
if max_passes > 0 then
Expand Down
83 changes: 37 additions & 46 deletions testsuite/integration/src/features/test_dws_states.feature
Original file line number Diff line number Diff line change
Expand Up @@ -36,82 +36,73 @@ Feature: Data Workflow Services State Progression

When the job is run
And a Workflow is created for the job
#Then the job's temporary Workflow is not found
Then the Workflow and job progress to the Proposal state
And the Workflow and job progress to the Setup state
And the Workflow and job progress to the DataIn state
And the Workflow and job progress to the PreRun state
And the Workflow and job progress to the PostRun state
And the Workflow and job progress to the DataOut state
And the Workflow and job progress to the Teardown state
And the job is COMPLETED
And the job state is COMPLETED

@todo
Scenario: The DWS-BB Plugin can handle DWS driver errors
Scenario Outline: The DWS-BB Plugin can handle DWS driver errors before being canceled
Given a job script:
#!/bin/bash

#DW <state> action=error message=TEST_ERROR
#DW <workflowState> action=error message=TEST_ERROR
#DW Teardown action=wait
/bin/hostname

When the job is run
And a Workflow is created for the job
And the Workflow and job progress to the <workflowState> state
And the job is canceled
Then the Workflow and job progress to the Teardown state
And the job shows an error with message "TEST ERROR"
And the job's system comment contains the following:
TEST ERROR
nathandotleeathpe marked this conversation as resolved.
Show resolved Hide resolved

Examples:
# *** HEADER ***
| state |
| workflowState |
# *** VALUES ***
| Proposal |
| Setup |
| DataIn |
| PreRun |
| PostRun |
| DataOut |
| Proposal |
| Setup |
| DataIn |
| PostRun |
| DataOut |

@todo
Scenario: The DWS-BB Plugin can handle a DWS driver error during Teardown
# With the exception of PreRun, states will need to be cancelled with the
# "--hurry" flag to transition to the Teardown state. If
# "Flags=TeardownFailure" is set in burst_buffer.conf, then all states will
# transition to Teardown without needing to be canceled
Scenario Outline: The DWS-BB Plugin can handle DWS driver errors
Given a job script:
#!/bin/bash

#DW Teardown action=error message=TEST_ERROR

#DW <workflowState> action=error message=TEST_ERROR
#DW Teardown action=wait
/bin/hostname

When the job is run
And a Workflow is created for the job
Then the job shows an error with message "TEST ERROR"
Then the Workflow and job progress to the Teardown state
And the job's system comment contains the following:
TEST ERROR
nathandotleeathpe marked this conversation as resolved.
Show resolved Hide resolved

Examples:
# *** HEADER ***
| workflowState |
# *** VALUES ***
| PreRun |

@todo
Scenario: The DWS-BB Plugin can cancel jobs
Scenario: The DWS-BB Plugin can handle DWS driver errors during Teardown
Given a job script:
#!/bin/bash

#DW <state> action=wait
#DW Teardown action=wait

#DW Teardown action=error message=TEST_ERROR
/bin/hostname

When the job is run
And a Workflow is created for the job
And the Workflow and job progress to the <state> state
And the job is canceled with the hurry flag set to <hurry_flag>
Then the Workflow and job progress to the Teardown state
And the Workflow's hurry flag is set to <hurry_flag>

Examples:
# *** HEADER ***
| state | hurry_flag |
# *** VALUES ***
| Proposal | false |
| Setup | false |
| DataIn | false |
| PreRun | false |
| PostRun | false |
| DataOut | false |
| Proposal | true |
| Setup | true |
| DataIn | true |
| PreRun | true |
| PostRun | true |
| DataOut | true |
Then the job's system comment contains the following:
TEST ERROR
nathandotleeathpe marked this conversation as resolved.
Show resolved Hide resolved
And the workflow still exists
4 changes: 2 additions & 2 deletions testsuite/integration/src/features/test_environment.feature
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ Feature: Integration test environment
srun -l /bin/hostname
srun -l /bin/pwd
When the job is run
Then the job is COMPLETED
Then the job state is COMPLETED

Scenario: Kubernetes and slurm are connected
Given the kubernetes cluster kube-system UID
When the kube-system UID is queried from slurmctld
Then the UIDs match and the cluster is the same
Then the UIDs match and the cluster is the same
14 changes: 3 additions & 11 deletions testsuite/integration/src/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def pytest_bdd_apply_tag(tag, function):

@given(parsers.parse('a job script:\n{script}'), target_fixture="script_path")
def _(script):
"""a simple job script: <script>"""
"""a job script: <script>"""
path = "/jobs/" + secrets.token_hex(5) + "-job.sh"
with open(path, "w") as file:
file.write(script)
Expand All @@ -64,14 +64,6 @@ def _(script):
@when('the job is run', target_fixture="jobId")
def _(slurmctld, script_path):
"""the job is run."""
_,out = slurmctld.exec_run("sinfo -lNe")
print(out)

_,out = slurmctld.exec_run("scontrol show node")
print(out)

_,out = slurmctld.exec_run("kubectl describe deployment -n dws-operator-system dws-operator-controller-manager")
print(out)

jobId, outputFilePath, errorFilePath = slurmctld.submit_job(script_path)
print("submitted job: " + str(jobId))
Expand All @@ -81,9 +73,9 @@ def _(slurmctld, script_path):
# remove the slurm output from the jobs folder
slurmctld.remove_job_output(jobId, outputFilePath, errorFilePath)

@then(parsers.parse('the job is {expectedJobState}'))
@then(parsers.parse('the job state is {expectedJobState}'))
def _(slurmctld, jobId, expectedJobState):
"""the job completes successfully."""
"""the job state is <expectedJobState>"""
jobState, out = slurmctld.get_final_job_state(jobId)

if expectedJobState == "COMPLETED" and jobState == "FAILED":
Expand Down
53 changes: 32 additions & 21 deletions testsuite/integration/src/tests/dws_bb_plugin/test_dws_states.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,49 +30,60 @@
scenarios("test_dws_states.feature")

@when('a Workflow is created for the job')
@then('the workflow still exists')
def _(k8s, jobId):
"""a Workflow is created for the job."""
workflow = Workflow(k8s, jobId)
assert workflow.data != None, "Workflow for Job: " + str(jobId) + " not found"

yield

@when('the job is canceled with the hurry flag set to <hurry_flag>')
def _():
"""the job is canceled with the hurry flag set to <hurry_flag>."""
raise NotImplementedError
# attempt to delete workflow if it still exists
try:
workflow.delete()
except:
pass

@when('the job\'s temporary Workflow is not found')
def _():
"""the job's temporary Workflow is not found."""
raise NotImplementedError
@when(parsers.parse('the Workflow status becomes {status:l}'))
def _(slurmctld, jobId, status):
    """the Workflow status becomes <status>"""
    # Single-shot check of the Workflow status reported via slurmctld.
    assert slurmctld.get_workflow_status(jobId)["status"] == status

@when('the job is canceled')
def _(slurmctld, jobId):
    """the job is canceled"""
    # NOTE(review): the second argument is presumably the hurry flag
    # (False = no hurry) — confirm against cancel_job's definition.
    slurmctld.cancel_job(jobId, False)

@when(parsers.parse('the Workflow and job progress to the {state:l} state'))
@then(parsers.parse('the Workflow and job progress to the {state:l} state'))
def _(k8s, slurmctld, jobId, state):
"""the Workflow and job progress to the <state> state."""
workflow = Workflow(k8s, jobId)
workflow.wait_until(
"the state the workflow is transitioning to",
lambda wf: wf.data["status"]["state"], state
lambda wf: wf.data["status"]["state"] == state and wf.data["status"]["status"] in ["Error", "DriverWait"]
)
print("job %s progressed to state %s" % (str(jobId),state))

jobStatus = slurmctld.get_workflow_status(jobId)
assert jobStatus["desiredState"] == state, "Incorrect desired state: " + str(jobStatus)
assert jobStatus["currentState"] == state, "Incorrect current state: " + str(jobStatus)
assert jobStatus["status"] == "DriverWait", "Incorrect status: " + str(jobStatus)
assert jobStatus["status"] == workflow.data["status"]["status"], "Incorrect status: " + str(jobStatus)

# Set driver status to completed so the workflow can progress to the next state
updateRequired = False
for driverStatus in workflow.data["status"]["drivers"]:
if driverStatus["driverID"] == "tester" and state in driverStatus["watchState"]:
if driverStatus["driverID"] == "tester" and state in driverStatus["watchState"] and driverStatus["status"] == "Pending":
print("updating job %s to complete state %s" % (str(jobId), state))
driverStatus["completed"] = True
driverStatus["status"] = "Completed"
updateRequired = True

workflow.save_driver_statuses()

@then('the Workflow\'s hurry flag is set to <hurry_flag>')
def _():
"""the Workflow's hurry flag is set to <hurry_flag>."""
raise NotImplementedError
if updateRequired:
workflow.save_driver_statuses()

@then('the job shows an error with message "{message}"')
def _():
"""the job shows an error with message "{message}"}"."""
raise NotImplementedError
@then(parsers.parse("the job's system comment contains the following:\n{message}"))
def _(slurmctld, jobId, message):
    """the job's system comment contains <message>."""
    # NOTE(review): `out` is the second value from get_final_job_state;
    # presumably it includes the job's system comment text — verify against
    # that helper's implementation.
    _,out = slurmctld.get_final_job_state(jobId)
    assert message in out
Loading