Release v0.0.2 #48

Merged
8 changes: 3 additions & 5 deletions .gitmodules
@@ -1,11 +1,9 @@
[submodule "testsuite/submodules/dws"]
path = testsuite/submodules/dws
url = https://github.com/HewlettPackard/dws.git
branch = releases/v0
[submodule "testsuite/submodules/slurm-docker-cluster"]
path = testsuite/submodules/slurm-docker-cluster
url = git@github.com:DataWorkflowServices/slurm-docker-cluster.git
[submodule "testsuite/submodules/dws-test-driver"]
path = testsuite/submodules/dws-test-driver
url = git@github.com:DataWorkflowServices/dws-test-driver.git
branch = releases/v0
[submodule "testsuite/submodules/dws"]
path = testsuite/submodules/dws
url = git@github.com:DataWorkflowServices/dws.git
6 changes: 6 additions & 0 deletions src/burst_buffer/burst_buffer.conf
@@ -3,3 +3,9 @@
# See https://slurm.schedmd.com/burst_buffer.conf.html
Directive=DW

# If set, then tear down the burst buffer after a file staging error.
# Otherwise, preserve the burst buffer for analysis and manual teardown.
# See https://slurm.schedmd.com/burst_buffer.conf.html
# and https://slurm.schedmd.com/burst_buffer.html#states
Flags=TeardownFailure

78 changes: 67 additions & 11 deletions src/burst_buffer/burst_buffer.lua
@@ -35,7 +35,7 @@ DEFAULT_LABEL_KEY = "origin"
DEFAULT_LABEL_VAL = lua_script_name

-- The fully-qualified name of the DWS Workflow CRD.
local WORKFLOW_CRD = "workflows.dws.cray.hpe.com"
local WORKFLOW_CRD = "workflows.dataworkflowservices.github.io"

KUBECTL_CACHE_DIR = "/tmp/burst_buffer_kubectl_cache"

@@ -118,7 +118,7 @@ end
-- resource with keywords that must be replaced by the caller.
function DWS:template()
return [[
apiVersion: dws.cray.hpe.com/v1alpha2
apiVersion: dataworkflowservices.github.io/v1alpha2
kind: Workflow
metadata:
name: WF_NAME
@@ -280,9 +280,16 @@ end

-- DWS:get_driver_errors will collect driver errors from the Workflow resource
-- with respect to the given state.
function DWS:get_driver_errors(state)
local error_list = {}
local jsonpath = [[{range .status.drivers[?(@.watchState=="]].. state ..[[")]}==={"\n"}{@.status}{"\n"}{@.driverID}{"\n"}{@.error}{"\n"}{end}]]
-- If all_errors=true then collect all errors from all states in all drivers.
-- On success this returns true and a string with all of the errors.
-- On failure this returns false, an empty string for the errors, and a string
-- explaining why it couldn't collect the errors.
function DWS:get_driver_errors(state, all_errors)
local driver_index = [[?(@.watchState=="]].. state ..[[")]]
if all_errors == true then
driver_index = "*"
end
local jsonpath = [[{range .status.drivers[]] .. driver_index .. [[]}==={"\n"}{@.status}{"\n"}{@.driverID}{"\n"}{@.error}{"\n"}{end}]]
local ret, output = self:get_jsonpath(jsonpath)
if ret == false then
return ret, "", "could not get driver errors: " .. output
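
For illustration, a minimal usage sketch of the two modes of DWS:get_driver_errors. This is a hypothetical caller, not code from the PR; DWS(), make_workflow_name(), and slurm.log_error() are assumed from the surrounding burst_buffer.lua:

    local workflow = DWS(make_workflow_name(job_id))

    -- Per-state mode: the jsonpath range selects
    -- .status.drivers[?(@.watchState=="PreRun")]
    local done, errors, err = workflow:get_driver_errors("PreRun")

    -- New all_errors mode: the state argument is ignored and the range
    -- widens to .status.drivers[*], as used by slurm_bb_job_teardown() below.
    local done_all, all_errors, err_all = workflow:get_driver_errors("", true)
    if done_all == false then
        slurm.log_error("%s: %s", lua_script_name, err_all)
    end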
@@ -442,6 +449,18 @@ function DWS:kubectl(cmd)
return self:io_popen(kcmd)
end

-- DWS:scancel will run the Slurm scancel command and collect its output.
-- On success this returns true and the output of the command.
-- On failure this returns false and the output of the command.
function DWS:scancel(jobId, hurry)
local hurry_opt = ""
if hurry == true then
hurry_opt = "--hurry "
end
local scmd = "scancel " .. hurry_opt .. jobId
return self:io_popen(scmd)
end

-- DWS:io_popen will run the given command and collect its output.
-- On success this returns true and the output of the command.
-- On failure this returns false and the output of the command.
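
A short usage sketch for the new DWS:scancel helper above (the job ID is illustrative):

    -- hurry=true assembles "scancel --hurry 1234"; hurry=false or nil
    -- assembles plain "scancel 1234".
    local ok, output = workflow:scancel("1234", true)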
@@ -627,24 +646,51 @@ function slurm_bb_job_teardown(job_id, job_script, hurry)
hurry_flag = true
end
local workflow = DWS(make_workflow_name(job_id))
local done, err = workflow:set_workflow_state_and_wait("Teardown", hurry_flag)

local ret = slurm.SUCCESS
-- Does the workflow have a fatal error in it?
-- If so, we'll call scancel as well.
local done, state_errors, err = workflow:get_driver_errors("", true)
if done == false then
if string.find(err, [["]] .. workflow.name .. [[" not found]]) then
-- It's already gone, and that's what we wanted anyway.
return slurm.SUCCESS
else
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: %s", lua_script_name, workflow.name, err)
return slurm.ERROR, err
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: unable to check driver errors: %s", lua_script_name, workflow.name, err)
ret = slurm.ERROR
-- fall-through, let the Workflow delete happen.
end
end

done, err = workflow:set_workflow_state_and_wait("Teardown", hurry_flag)
if done == false then
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s: %s", lua_script_name, workflow.name, err)
ret = slurm.ERROR
-- fall-through, let the Workflow delete happen.
end

done, err = workflow:delete()
if done == false then
slurm.log_error("%s: slurm_bb_job_teardown(), workflow=%s, delete: %s", lua_script_name, workflow.name, err)
return slurm.ERROR, err
ret = slurm.ERROR
-- fall-through, let any necessary scancel happen.
end

if state_errors ~= "" then
-- Now do the scancel. This will terminate this Lua script and will
-- trigger slurm to call our teardown again, but that'll be a no-op
-- when it comes back here.
slurm.log_info("%s: slurm_bb_job_teardown(), workflow=%s: executing scancel --hurry %s, found driver errors: %s", lua_script_name, workflow.name, job_id, state_errors)
_, err = workflow:scancel(job_id, true)
if err == "" then
err = "(no output)"
end
end

return slurm.SUCCESS
if ret == slurm.SUCCESS then
err = ""
end
return ret, err
end

--[[
@@ -844,10 +890,20 @@ function slurm_bb_get_status(...)
local args = {...}
args.n = select("#", ...)

local found_jid = false
local jid = 0
if args.n == 2 and args[1] == "workflow" then
-- Slurm 22.05
jid = args[2]
found_jid = true
elseif args.n == 4 and args[3] == "workflow" then
-- Slurm 23.02
jid = args[4]
found_jid = true
end
if found_jid == true then
local done = false
local status = {}
local jid = args[2]
if string.find(jid, "^%d+$") == nil then
msg = "A job ID must contain only digits."
else
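
A condensed sketch of the version dispatch above: Slurm 22.05 hands the script only the user arguments ("workflow", job_id), while Slurm 23.02 prepends the UID and GID, per the comments in the diff. The helper name is hypothetical:

    -- Returns the job ID, or nil if the arguments match neither call shape.
    local function find_workflow_jid(...)
        local args = {...}
        args.n = select("#", ...)
        if args.n == 2 and args[1] == "workflow" then
            return args[2]                        -- Slurm 22.05
        elseif args.n == 4 and args[3] == "workflow" then
            return args[4]                        -- Slurm 23.02
        end
        return nil
    end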
18 changes: 9 additions & 9 deletions testsuite/integration/Makefile
@@ -39,20 +39,20 @@ setup-dws:
@{\
set -e ; \
cd ../submodules/dws ; \
docker buildx build -t local/dws-operator:test --load . ; \
IMAGE_TAG_BASE=local/dws-operator VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=120s -n dws-operator-system dws-operator-controller-manager --for condition=Available=True ; \
kubectl wait deployment --timeout=120s -n dws-operator-system dws-operator-webhook --for condition=Available=True ; \
docker buildx build -t local/dws:test --load . ; \
IMAGE_TAG_BASE=local/dws VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=120s -n dws-system dws-controller-manager --for condition=Available=True ; \
kubectl wait deployment --timeout=120s -n dws-system dws-webhook --for condition=Available=True ; \
}

.PHONY: setup-dws-test-driver
setup-dws-test-driver:
@{\
set -e ; \
cd ../submodules/dws-test-driver ; \
docker buildx build -t local/dws-test-driver-operator:test --load . ; \
IMAGE_TAG_BASE=local/dws-test-driver-operator VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=60s -n dws-test-operator-system dws-test-driver-controller-manager --for condition=Available=True ; \
docker buildx build -t local/dws-test-driver:test --load . ; \
IMAGE_TAG_BASE=local/dws-test-driver VERSION=test KIND_CLUSTER=dws make kind-push deploy ; \
kubectl wait deployment --timeout=60s -n dws-test-system dws-test-driver-controller-manager --for condition=Available=True ; \
}

.PHONY: setup
@@ -75,10 +75,10 @@ debug:
kubectl describe node dws-control-plane dws-worker
echo
echo "***** DWS DEPLOYMENT *****"
kubectl describe deployment -n dws-operator-system dws-operator-controller-manager
kubectl describe deployment -n dws-system dws-controller-manager
echo
echo "***** DWS LOGS *****"
kubectl logs -n dws-operator-system deployment/dws-operator-controller-manager
kubectl logs -n dws-system deployment/dws-controller-manager

.PHONY: reports
reports:
67 changes: 27 additions & 40 deletions testsuite/integration/src/features/test_dws_states.feature
@@ -22,6 +22,7 @@ Feature: Data Workflow Services State Progression
Verify that the DWS-Slurm Burst Buffer Plugin progresses through Data
Workflow Services states

@happy_one
Scenario: The DWS-BB Plugin progresses through DWS states
Given a job script:
#!/bin/bash
@@ -44,13 +45,15 @@ Feature: Data Workflow Services State Progression
And the Workflow and job progress to the PostRun state
And the Workflow and job progress to the DataOut state
And the Workflow and job progress to the Teardown state
And the job state is COMPLETED
And the job has eventually been COMPLETED

# DWS does not allow spaces in key/value pairs in directives. To skirt around this
# constraint, the dws-test-driver replaces underscores ("_") in the message value with
# spaces. This ensures that the dws-slurm-plugin can handle whitespace in error messages.
# It also makes it easier to check that the error is included in scontrol output.
Scenario Outline: The DWS-BB Plugin can handle fatal driver errors before being canceled
# This scenario assumes that "Flags=TeardownFailure" is set in burst_buffer.conf.
@fatal_one
Scenario Outline: Report fatal errors from Proposal, Setup, DataIn, PreRun
Given a job script:
#!/bin/bash

@@ -59,12 +62,13 @@ Feature: Data Workflow Services State Progression
/bin/hostname

When the job is run
Then a Workflow has been created for the job
And the Workflow and job report fatal errors at the <workflowState> state
And the job is canceled
And the Workflow and job progress to the Teardown state
And the job's final system comment contains the following:
And some Workflow has been created for the job
And the Workflow reports fatal errors at the <workflowState> state
Then the job's system comment eventually contains the following:
TEST FATAL ERROR
And the Workflow and job progress to the Teardown state
And the Workflow has eventually been deleted
And the job has eventually been CANCELLED

Examples:
# *** HEADER ***
@@ -73,14 +77,15 @@ Feature: Data Workflow Services State Progression
| Proposal |
| Setup |
| DataIn |
| PostRun |
| DataOut |
| PreRun |

# With the exception of PreRun, states will need to be canceled with the
# "--hurry" flag to transition to the Teardown state. If
# "Flags=TeardownFailure" is set in burst_buffer.conf, then all states will
# transition to Teardown without needing to be canceled.
Scenario Outline: The DWS-BB Plugin can handle fatal driver errors for PreRun
# DWS does not allow spaces in key/value pairs in directives. To skirt around this
# constraint, the dws-test-driver replaces underscores ("_") in the message value with
# spaces. This ensures that the dws-slurm-plugin can handle whitespace in error messages.
# It also makes it easier to check that the error is included in scontrol output.
# This scenario assumes that "Flags=TeardownFailure" is set in burst_buffer.conf.
@fatal_two
Scenario Outline: Report fatal errors from PostRun and DataOut
Given a job script:
#!/bin/bash

@@ -89,35 +94,17 @@ Feature: Data Workflow Services State Progression
/bin/hostname

When the job is run
Then a Workflow has been created for the job
And the Workflow reports a fatal error in the <workflowState> state
And the Workflow and job progress to the Teardown state
# Slurm moved it from PreRun/Error to Teardown without canceling
# the job. So the driver (this test) must cancel it.
And the job is canceled
And the job's final system comment contains the following:
And some Workflow has been created for the job
And the Workflow reports fatal errors at the <workflowState> state
Then the job's system comment eventually contains the following:
TEST FATAL ERROR
And the Workflow and job progress to the Teardown state
And the Workflow has eventually been deleted
And the job has eventually been COMPLETED

Examples:
# *** HEADER ***
| workflowState |
# *** VALUES ***
| PreRun |

Scenario: The DWS-BB Plugin can handle fatal driver errors during Teardown
Given a job script:
#!/bin/bash

#DW Teardown action=error message=TEST_FATAL_ERROR severity=Fatal
/bin/hostname

When the job is run
Then a Workflow has been created for the job
And the Workflow reports a fatal error in the Teardown state
And the job's intermediate system comment contains the following:
TEST FATAL ERROR
# Eventually the driver (this test) must work through the Teardown
# issues and complete that step. Slurm has already marked the job
# as completed and is now looping over slurm_bb_job_teardown() in
# burst_buffer.lua.
And the Workflow error is cleared from the Teardown state
| PostRun |
| DataOut |
@@ -36,7 +36,7 @@ Feature: Integration test environment
srun -l /bin/hostname
srun -l /bin/pwd
When the job is run
Then the job state is COMPLETED
Then the job has eventually been COMPLETED

Scenario: Kubernetes and slurm are connected
Given the kubernetes cluster kube-system UID
4 changes: 3 additions & 1 deletion testsuite/integration/src/pytest.ini
@@ -22,4 +22,6 @@ bdd_features_base_dir = features
markers =
environment
dws_states

happy_one
fatal_one
fatal_two
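
With these markers registered, a single scenario family can be selected at run time via pytest's standard marker expression, e.g. pytest -m fatal_one. pytest-bdd maps the @happy_one, @fatal_one, and @fatal_two tags on the scenarios above onto these markers, which is why they are registered here.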
18 changes: 4 additions & 14 deletions testsuite/integration/src/tests/conftest.py
@@ -72,17 +72,7 @@ def _(slurmctld, script_path):
# remove the slurm output from the jobs folder
slurmctld.remove_job_output(jobId, outputFilePath, errorFilePath)

@then(parsers.parse('the job state is {expectedJobState}'))
def _(slurmctld, jobId, expectedJobState):
"""the job state is <expectedJobState>"""
jobState, out = slurmctld.get_final_job_state(jobId)

if expectedJobState == "COMPLETED" and jobState == "FAILED":
warnings.warn(ResourceWarning((f"Job {jobId} failed unexpectedly.\n") + \
"This may happen if Slurm doesn't have enough resources to schedule the job.\n" + \
"This is not considered a test failure, in this context, since DWS isn't\n" + \
"dependent on the job's failure or success."
))
return

assert jobState == expectedJobState, "Unexpected Job State: " + jobState + "\n" + out
@then(parsers.parse('the job has eventually been {job_state:l}'))
def _(slurmctld, jobId, job_state):
"""the job has eventually been <job_state>"""
slurmctld.wait_until_job_has_been_x(jobId, job_state)