Skip to content

Commit

Permalink
Verify or add a kubectl cache dir (#24)
Browse files Browse the repository at this point in the history
Kubectl wants to place its discovery cache and http cache into $HOME/.kube;
however, the PE slurmctld container may not have provided a home dir for the
user.  This detects the absence of a home dir and instead creates a dir in /tmp and sets a HOME env variable for the kubectl command to follow.

This can reduce the runtime of a typical kubectl command, run in the PE
slurmctld pod, from 25 seconds down to 0.4 seconds.

Also loop without limit while waiting for workflow state changes, instead of giving up after 60 passes.

Signed-off-by: Dean Roehrich <[email protected]>
  • Loading branch information
roehrich-hpe authored Dec 6, 2022
1 parent 725f8b7 commit 08f4ebd
Show file tree
Hide file tree
Showing 2 changed files with 191 additions and 33 deletions.
64 changes: 56 additions & 8 deletions src/burst_buffer/burst_buffer.lua
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ WLMID_PLACEHOLDER = "slurm"
-- The fully-qualified name of the DWS Workflow CRD.
local WORKFLOW_CRD = "workflows.dws.cray.hpe.com"

-- Fallback directory that DWS:kubectl_cache_home creates and hands to kubectl
-- as HOME when the user's own HOME directory does not exist.
KUBECTL_CACHE_DIR = "/tmp/burst_buffer_kubectl_cache"

lua_script_name="burst_buffer.lua"

math.randomseed(os.time())
Expand Down Expand Up @@ -237,12 +239,13 @@ function DWS:get_hurry()
end

-- DWS:wait_for_status_complete will loop until the workflow reports that
-- its status is completed.
-- its status is completed. If the max_passes == -1 then this will loop
-- without limit until the state is complete or an error is encountered.
-- On success this returns true and a table containing the status.
-- On failure this returns false, an empty table, and an error message.
function DWS:wait_for_status_complete(max_passes)
local empty = {}
while max_passes > 0 do
while max_passes > 0 or max_passes == -1 do
local done, status, err = self:get_current_state()
if done == false then
return false, empty, err
Expand All @@ -255,7 +258,9 @@ function DWS:wait_for_status_complete(max_passes)
return false, empty, string.format("Error in Workflow %s", self.name)
end
os.execute("sleep 1")
max_passes = max_passes - 1
if max_passes > 0 then
max_passes = max_passes - 1
end
end
return false, empty, "exceeded max wait time"
end
Expand Down Expand Up @@ -288,14 +293,46 @@ function DWS:set_workflow_state_and_wait(new_state, hurry)
return done, "set_desired_state: " .. err
end

local done, status, err = self:wait_for_status_complete(60)
local done, status, err = self:wait_for_status_complete(-1)
if done == false then
return done, "wait_for_status_complete: " .. err
end

return true
end

-- DWS:kubectl_cache_home will determine where the kubectl discovery cache
-- and http cache may be located.
-- If the user's HOME dir exists then kubectl expects to use that and this does
-- nothing and returns true and an empty string.
-- Otherwise, including when HOME is unset or empty, this will create a dir in
-- /tmp (unless it is already there) and will return true and a string that
-- defines the HOME variable with the new dir.
-- If there is an error creating the dir this will return false and a string
-- containing an error message.
function DWS:kubectl_cache_home()

	-- Reports whether dname names an existing directory, using the result
	-- of "test -d" as returned by self:io_popen.
	-- A nil or empty name is never treated as a directory: concatenating nil
	-- would raise a Lua error, and a bare "test -d" with no operand exits 0,
	-- which would be misread as "the directory exists".
	local dir_exists = function(dname)
		if dname == nil or dname == "" then
			return false
		end
		local done, _ = self:io_popen("test -d " .. dname)
		return done
	end

	-- The user's HOME is usable; kubectl needs no override.
	if dir_exists(os.getenv("HOME")) ~= false then
		return true, ""
	end

	-- No usable HOME: fall back to the shared cache dir, creating it on
	-- first use.
	if dir_exists(KUBECTL_CACHE_DIR) == false then
		local done, result = self:io_popen("mkdir " .. KUBECTL_CACHE_DIR)
		if done == false then
			-- tostring() keeps the message intact even if no output came back.
			return false, "Unable to create " .. KUBECTL_CACHE_DIR .. " cache dir for kubectl: " .. tostring(result)
		end
	end
	return true, "HOME=" .. KUBECTL_CACHE_DIR
end

-- DWS:token will find a ServiceAccount token and return a --token argument
-- for the kubectl command. If we're not inside the slurm pod, maybe in a test
-- env, then this will return an empty string.
Expand All @@ -314,8 +351,19 @@ end
-- On success this returns true and the output of the command.
-- On failure this returns false and the output of the command.
function DWS:kubectl(cmd)
	-- Make sure kubectl has somewhere to keep its discovery and http caches
	-- before it is run. On success homedir_msg is either an empty string or
	-- a "HOME=<dir>" assignment to prefix onto the command; on failure it is
	-- the error message.
	local done, homedir_msg = self:kubectl_cache_home()
	if done ~= true then
		return false, homedir_msg
	end

	-- Run the command exactly once, through self:io_popen, so that its
	-- output is collected and returned to the caller. No separate io.popen
	-- call is made here, which would execute kubectl a second time and leave
	-- an unread, unclosed handle behind.
	local kcmd = homedir_msg .. " kubectl " .. self:token() .. " " .. cmd
	return self:io_popen(kcmd)
end

-- DWS:io_popen will run the given command and collect its output.
-- On success this returns true and the output of the command.
-- On failure this returns false and the output of the command.
function DWS:io_popen(cmd)
local handle = io.popen(cmd)
if handle == nil then
-- The io.popen was stubbed by a test. Use the provided
-- return values.
Expand Down Expand Up @@ -534,7 +582,7 @@ function slurm_bb_setup(job_id, uid, gid, pool, bb_size, job_script)

-- Wait for proposal state to complete, or pick up any error that may
-- be waiting in the Workflow.
local done, status, err = workflow:wait_for_status_complete(60)
local done, status, err = workflow:wait_for_status_complete(-1)
if done == false then
slurm.log_error("%s: slurm_bb_setup(), workflow=%s, waiting for Proposal state to complete: %s", lua_script_name, workflow_name, err)
return slurm.ERROR, err
Expand All @@ -546,7 +594,7 @@ function slurm_bb_setup(job_id, uid, gid, pool, bb_size, job_script)
return done, err
end

done, status, err = workflow:wait_for_status_complete(60)
done, status, err = workflow:wait_for_status_complete(-1)
if done == err then
slurm.log_error("%s: slurm_bb_setup(), workflow=%s, waiting for Setup state to complete: %s", lua_script_name, workflow_name, err)
return done, err
Expand Down
Loading

0 comments on commit 08f4ebd

Please sign in to comment.