From 1b95fc421a674517959b79511301f76427302f31 Mon Sep 17 00:00:00 2001 From: Grant Griffiths Date: Fri, 10 Jun 2022 20:00:22 +0000 Subject: [PATCH] CSI: make plugin health_timeout configurable in csi_plugin stanza Signed-off-by: Grant Griffiths --- .changelog/13340.txt | 3 +++ api/tasks.go | 4 ++++ client/allocrunner/taskrunner/plugin_supervisor_hook.go | 4 ++-- command/agent/job_endpoint.go | 1 + jobspec/parse_test.go | 7 ++++--- nomad/structs/csi.go | 4 ++++ nomad/structs/structs.go | 4 ++++ 7 files changed, 22 insertions(+), 5 deletions(-) create mode 100644 .changelog/13340.txt diff --git a/.changelog/13340.txt b/.changelog/13340.txt new file mode 100644 index 00000000000..1f8e40f21b9 --- /dev/null +++ b/.changelog/13340.txt @@ -0,0 +1,3 @@ +```release-note:improvement +csi: Made the CSI plugin supervisor health check timeout configurable with a new `health_timeout` field in the `csi_plugin` stanza +``` diff --git a/api/tasks.go b/api/tasks.go index d03a4cb4a9a..8433f222c29 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -1039,6 +1039,10 @@ type TaskCSIPluginConfig struct { // // Default is /csi. MountDir string `mapstructure:"mount_dir" hcl:"mount_dir,optional"` + + // HealthTimeout is the time after which the CSI plugin tasks will be killed + // if the CSI Plugin is not healthy. 
+ HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"` } func (t *TaskCSIPluginConfig) Canonicalize() { diff --git a/client/allocrunner/taskrunner/plugin_supervisor_hook.go b/client/allocrunner/taskrunner/plugin_supervisor_hook.go index 3983d001d1c..50713fc075c 100644 --- a/client/allocrunner/taskrunner/plugin_supervisor_hook.go +++ b/client/allocrunner/taskrunner/plugin_supervisor_hook.go @@ -253,7 +253,7 @@ func (h *csiPluginSupervisorHook) ensureSupervisorLoop(ctx context.Context) { // We're in Poststart at this point, so if we can't connect within // this deadline, assume it's broken so we can restart the task - startCtx, startCancelFn := context.WithTimeout(ctx, 30*time.Second) + startCtx, startCancelFn := context.WithTimeout(ctx, h.task.CSIPluginConfig.HealthTimeout) defer startCancelFn() var err error @@ -441,7 +441,7 @@ func (h *csiPluginSupervisorHook) kill(ctx context.Context, reason error) { if err := h.lifecycle.Kill(ctx, structs.NewTaskEvent(structs.TaskKilling). SetFailsTask(). 
- SetDisplayMessage("CSI plugin did not become healthy before timeout"), + SetDisplayMessage(fmt.Sprintf("CSI plugin did not become healthy before configured %v health timeout", h.task.CSIPluginConfig.HealthTimeout.String())), ); err != nil { h.logger.Error("failed to kill task", "kill_reason", reason, "error", err) } diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index f88a58a2e0b..f7a74c4f8d0 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -1263,6 +1263,7 @@ func ApiCSIPluginConfigToStructsCSIPluginConfig(apiConfig *api.TaskCSIPluginConf sc.ID = apiConfig.ID sc.Type = structs.CSIPluginType(apiConfig.Type) sc.MountDir = apiConfig.MountDir + sc.HealthTimeout = apiConfig.HealthTimeout return sc } diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index 7c9ff243a10..45d624aa22f 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -626,9 +626,10 @@ func TestParse(t *testing.T) { Name: "binstore", Driver: "docker", CSIPluginConfig: &api.TaskCSIPluginConfig{ - ID: "org.hashicorp.csi", - Type: api.CSIPluginTypeMonolith, - MountDir: "/csi/test", + ID: "org.hashicorp.csi", + Type: api.CSIPluginTypeMonolith, + MountDir: "/csi/test", + HealthTimeout: 1 * time.Minute, }, }, }, diff --git a/nomad/structs/csi.go b/nomad/structs/csi.go index 2147cc08a2b..eea20b597d6 100644 --- a/nomad/structs/csi.go +++ b/nomad/structs/csi.go @@ -67,6 +67,10 @@ type TaskCSIPluginConfig struct { // to be created by the plugin, and will provide references into // "MountDir/CSIIntermediaryDirname/{VolumeName}/{AllocID} for mounts. MountDir string + + // HealthTimeout is the time after which the CSI plugin tasks will be killed + // if the CSI Plugin is not healthy. 
+ HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"` } func (t *TaskCSIPluginConfig) Copy() *TaskCSIPluginConfig { diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index b376ebfaa14..cc82abc380f 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -7255,6 +7255,10 @@ func (t *Task) Validate(ephemeralDisk *EphemeralDisk, jobType string, tgServices mErr.Errors = append(mErr.Errors, fmt.Errorf("CSIPluginConfig PluginType must be one of 'node', 'controller', or 'monolith', got: \"%s\"", t.CSIPluginConfig.Type)) } + if t.CSIPluginConfig.HealthTimeout == 0 { + t.CSIPluginConfig.HealthTimeout = 30 * time.Second + } + // TODO: Investigate validation of the PluginMountDir. Not much we can do apart from check IsAbs until after we understand its execution environment though :( }