From d4efa26a89818b6058fc72cabe8535caa9b2e4c3 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Wed, 11 Mar 2020 15:54:53 -0700 Subject: [PATCH 01/39] add resource manager --- go/tasks/plugins/array/k8s/config.go | 6 ++++++ go/tasks/plugins/array/k8s/executor.go | 9 +++++++++ go/tasks/plugins/array/k8s/launcher.go | 12 ++++++++++++ go/tasks/plugins/array/k8s/monitor.go | 13 ++++++++++++- 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index a1d94d300..739e9df8c 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -32,6 +32,11 @@ var ( configSection = config.MustRegisterSection(configSectionKey, defaultConfig) ) +type TokenConfig struct { + primaryLabel string + limit int +} + // Defines custom config for K8s Array plugin type Config struct { DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` @@ -39,6 +44,7 @@ type Config struct { MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` OutputAssembler workqueue.Config ErrorAssembler workqueue.Config + TokenConfigs TokenConfig } func GetConfig() *Config { diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index 125ee09a4..1eebb2846 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -7,6 +7,7 @@ import ( "github.com/lyft/flyteplugins/go/tasks/plugins/array" arrayCore "github.com/lyft/flyteplugins/go/tasks/plugins/array/core" + "github.com/lyft/flytestdlib/logger" "github.com/lyft/flytestdlib/promutils" "github.com/lyft/flyteplugins/go/tasks/pluginmachinery" @@ -164,5 +165,13 @@ func GetNewExecutorPlugin(ctx context.Context, iCtx core.SetupContext) (core.Plu return nil, err } + primaryLabel := GetConfig().TokenConfigs.primaryLabel + tokenLimit := GetConfig().TokenConfigs.limit + + if err := iCtx.ResourceRegistrar().RegisterResourceQuota(ctx, core.ResourceNamespace(primaryLabel), tokenLimit); err != nil { + logger.Errorf(ctx, "Token Resource registration for [%v] failed due to error [%v]", primaryLabel, err) + return nil, err + } + return exec, nil } diff --git a/go/tasks/plugins/array/k8s/launcher.go b/go/tasks/plugins/array/k8s/launcher.go index c58c673f8..2b07f6580 100644 --- a/go/tasks/plugins/array/k8s/launcher.go +++ b/go/tasks/plugins/array/k8s/launcher.go @@ -6,6 +6,7 @@ import ( "strconv" "strings" + "github.com/lyft/flyteplugins/go/tasks/errors" "github.com/lyft/flyteplugins/go/tasks/plugins/array/errorcollector" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -31,6 +32,7 @@ const ( ErrSubmitJob errors2.ErrorCode = "SUBMIT_JOB_FAILED" JobIndexVarName string = "BATCH_JOB_ARRAY_INDEX_VAR_NAME" FlyteK8sArrayIndexVarName string = "FLYTE_K8S_ARRAY_INDEX" + TokenPrimaryLabel string = "token" ) var arrayJobEnvVars = []corev1.EnvVar{ @@ -94,6 +96,16 @@ func LaunchSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeCli pod = ApplyPodPolicies(ctx, config, pod) + // Allocate Token + resourceNamespace := core.ResourceNamespace(pod.Namespace) + allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, pod.Name, core.ResourceConstraintsSpec{}) + if err != nil { + logger.Errorf(ctx, "Resource manager failed for TaskExecId [%s] token [%s]. 
error %s", + tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), pod.Name, err) + return newState, errors.Wrapf(errors.ResourceManagerFailure, err, "Error requesting allocation token %s", pod.Name) + } + logger.Infof(ctx, "Allocation result for [%s] is [%s]", pod.Name, allocationStatus) + err = kubeClient.GetClient().Create(ctx, pod) if err != nil && !k8serrors.IsAlreadyExists(err) { if k8serrors.IsForbidden(err) { diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 58ccd852c..e409d5c32 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -6,6 +6,7 @@ import ( "strconv" "time" + "github.com/lyft/flytestdlib/logger" "github.com/lyft/flytestdlib/storage" "github.com/lyft/flyteplugins/go/tasks/plugins/array" @@ -59,9 +60,10 @@ func CheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kub continue } + generatedName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), strconv.Itoa(childIdx)) phaseInfo, err := CheckPodStatus(ctx, kubeClient, k8sTypes.NamespacedName{ - Name: formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), strconv.Itoa(childIdx)), + Name: generatedName, Namespace: tCtx.TaskExecutionMetadata().GetNamespace(), }) if err != nil { @@ -78,6 +80,15 @@ func CheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kub actualPhase := phaseInfo.Phase() if phaseInfo.Phase().IsSuccess() { + + // Release token + resourceNamespace := core.ResourceNamespace(tCtx.TaskExecutionMetadata().GetOwnerID().Namespace) + err = tCtx.ResourceManager().ReleaseResource(ctx, resourceNamespace, generatedName) + if err != nil { + logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", uniqueID, err) + return currentState, logLinks, errors.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") + } + originalIdx := arrayCore.CalculateOriginalIndex(childIdx, currentState.GetIndexesToCache()) actualPhase, err = array.CheckTaskOutput(ctx, dataStore, outputPrefix, childIdx, originalIdx) if err != nil { From 74523da50ab4472ddac63b5a93e72cdd47384505 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Fri, 20 Mar 2020 11:11:21 -0700 Subject: [PATCH 02/39] launchandmonitor func --- go/tasks/plugins/array/core/state.go | 1 + go/tasks/plugins/array/k8s/config.go | 8 +-- go/tasks/plugins/array/k8s/executor.go | 15 ++--- go/tasks/plugins/array/k8s/launcher.go | 15 +++-- go/tasks/plugins/array/k8s/monitor.go | 87 +++++++++++++++++++++----- 5 files changed, 94 insertions(+), 32 deletions(-) diff --git a/go/tasks/plugins/array/core/state.go b/go/tasks/plugins/array/core/state.go index fc371ca00..726cd1d68 100644 --- a/go/tasks/plugins/array/core/state.go +++ b/go/tasks/plugins/array/core/state.go @@ -27,6 +27,7 @@ const ( PhaseStart Phase = iota PhasePreLaunch PhaseLaunch + PhaseLaunchAndMonitor PhaseWaitingForResources PhaseCheckingSubTaskExecutions PhaseAssembleFinalOutput diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index 739e9df8c..7cbb42fd7 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -32,9 +32,9 @@ var ( configSection = config.MustRegisterSection(configSectionKey, defaultConfig) ) -type TokenConfig struct { - primaryLabel string - limit int +type ResourceConfig struct { + PrimaryLabel string `json:"primaryLabel" pflag:",PrimaryLabel of a given service cluster"` + Limit int 
`json:"limit" pflag:",Resource quota (in the number of outstanding requests) of the service cluster"` } // Defines custom config for K8s Array plugin @@ -44,7 +44,7 @@ type Config struct { MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` OutputAssembler workqueue.Config ErrorAssembler workqueue.Config - TokenConfigs TokenConfig + ResourcesConfig ResourceConfig } func GetConfig() *Config { diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index 1eebb2846..9d74edc4b 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -77,11 +77,8 @@ func (e Executor) Handle(ctx context.Context, tCtx core.TaskExecutionContext) (c case arrayCore.PhaseWaitingForResources: fallthrough - case arrayCore.PhaseLaunch: - nextState, err = LaunchSubTasks(ctx, tCtx, e.kubeClient, pluginConfig, pluginState) - - case arrayCore.PhaseCheckingSubTaskExecutions: - nextState, logLinks, err = CheckSubTasksState(ctx, tCtx, e.kubeClient, tCtx.DataStore(), + case arrayCore.PhaseLaunchAndMonitor: + nextState, logLinks, err = LaunchAndCheckSubTasksState(ctx, tCtx, e.kubeClient, tCtx.DataStore(), tCtx.OutputWriter().GetOutputPrefixPath(), pluginState) case arrayCore.PhaseAssembleFinalOutput: @@ -129,7 +126,7 @@ func (e Executor) Finalize(ctx context.Context, tCtx core.TaskExecutionContext) return errors.Wrapf(errors.CorruptedPluginState, err, "Failed to read unmarshal custom state") } - return TerminateSubTasks(ctx, tCtx.TaskExecutionMetadata(), e.kubeClient, pluginConfig.MaxErrorStringLength, + return TerminateSubTasks(ctx, tCtx, e.kubeClient, pluginConfig.MaxErrorStringLength, pluginState) } @@ -165,10 +162,10 @@ func GetNewExecutorPlugin(ctx context.Context, iCtx core.SetupContext) (core.Plu return nil, err } - primaryLabel := GetConfig().TokenConfigs.primaryLabel - tokenLimit := GetConfig().TokenConfigs.limit + primaryLabel := GetConfig().ResourcesConfig.PrimaryLabel + resourceLimit := GetConfig().ResourcesConfig.Limit - if err := iCtx.ResourceRegistrar().RegisterResourceQuota(ctx, core.ResourceNamespace(primaryLabel), tokenLimit); err != nil { + if err := iCtx.ResourceRegistrar().RegisterResourceQuota(ctx, core.ResourceNamespace(primaryLabel), resourceLimit); err != nil { logger.Errorf(ctx, "Token Resource registration for [%v] failed due to error [%v]", primaryLabel, err) return nil, err } diff --git a/go/tasks/plugins/array/k8s/launcher.go b/go/tasks/plugins/array/k8s/launcher.go index 2b07f6580..36b7d6b6e 100644 --- a/go/tasks/plugins/array/k8s/launcher.go +++ b/go/tasks/plugins/array/k8s/launcher.go @@ -32,7 +32,7 @@ const ( ErrSubmitJob errors2.ErrorCode = "SUBMIT_JOB_FAILED" JobIndexVarName string = "BATCH_JOB_ARRAY_INDEX_VAR_NAME" FlyteK8sArrayIndexVarName string = "FLYTE_K8S_ARRAY_INDEX" - TokenPrimaryLabel string = "token" + ResourcesPrimaryLabel string = "token" ) var arrayJobEnvVars = []corev1.EnvVar{ @@ -138,14 +138,14 @@ func LaunchSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeCli return currentState, nil } -func TerminateSubTasks(ctx context.Context, tMeta core.TaskExecutionMetadata, kubeClient core.KubeClient, +func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, errsMaxLength int, currentState *arrayCore.State) error { size := currentState.GetExecutionArraySize() errs := errorcollector.NewErrorMessageCollector() for i := 0; i < size; i++ { indexStr := strconv.Itoa(i) - podName := formatSubTaskName(ctx, 
tMeta.GetTaskExecutionID().GetGeneratedName(), indexStr) + podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) pod := &corev1.Pod{ TypeMeta: metav1.TypeMeta{ Kind: PodKind, @@ -153,7 +153,7 @@ func TerminateSubTasks(ctx context.Context, tMeta core.TaskExecutionMetadata, ku }, ObjectMeta: metav1.ObjectMeta{ Name: podName, - Namespace: tMeta.GetNamespace(), + Namespace: tCtx.TaskExecutionMetadata().GetNamespace(), }, } @@ -165,6 +165,13 @@ func TerminateSubTasks(ctx context.Context, tMeta core.TaskExecutionMetadata, ku errs.Collect(i, err.Error()) } + + // Deallocate Resource + err = tCtx.ResourceManager().ReleaseResource(ctx, tCtx.TaskExecutionMetadata().GetNamespace(), podName) + if err != nil { + logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) + errs.Collect(i, err.Error()) + } } if errs.Length() > 0 { diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index e409d5c32..4a546b663 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -3,6 +3,8 @@ package k8s import ( "context" "fmt" + "github.com/lyft/flyteplugins/go/tasks/errors" + "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/utils" "strconv" "time" @@ -24,7 +26,7 @@ import ( "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/flytek8s" idlCore "github.com/lyft/flyteidl/gen/pb-go/flyteidl/core" - "github.com/lyft/flytestdlib/errors" + errors2 "github.com/lyft/flytestdlib/errors" k8serrors "k8s.io/apimachinery/pkg/api/errors" "github.com/lyft/flyteplugins/go/tasks/logs" @@ -35,7 +37,7 @@ const ( ErrCheckPodStatus errors.ErrorCode = "CHECK_POD_FAILED" ) -func CheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, +func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, dataStore *storage.DataStore, outputPrefix storage.DataReference, currentState *arrayCore.State) ( newState *arrayCore.State, logLinks []*idlCore.TaskLog, err error) { @@ -48,11 +50,29 @@ func CheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kub Detailed: arrayCore.NewPhasesCompactArray(uint(currentState.GetExecutionArraySize())), } + podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx) + if err != nil { + return newState, logLinks, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for task") + } + + var args []string + if len(podTemplate.Spec.Containers) > 0 { + args = append(podTemplate.Spec.Containers[0].Command, podTemplate.Spec.Containers[0].Args...) + podTemplate.Spec.Containers[0].Command = []string{} + } + for childIdx, existingPhaseIdx := range currentState.GetArrayStatus().Detailed.GetItems() { existingPhase := core.Phases[existingPhaseIdx] if existingPhase.IsTerminal() { // If we get here it means we have already "processed" this terminal phase since we will only persist // the phase after all processing is done (e.g. check outputs/errors file, record events... etc.). 
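The hunk that follows releases the allocation token as soon as a sub-task is seen in a terminal phase. Taken together with the allocation added in the launcher, the intended per-sub-task token lifecycle looks roughly like the sketch below; it is a condensed illustration using only calls that appear in this series, and the wrapper function itself is not part of the patch.

	// Condensed sketch (not part of the patch): acquire a token keyed by the
	// pod name before creating the pod, and give it back once the pod reaches
	// a terminal phase or when the task is finalized.
	func tokenLifecycleSketch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, pod *corev1.Pod) error {
		ns := core.ResourceNamespace(pod.Namespace)
		status, err := tCtx.ResourceManager().AllocateResource(ctx, ns, pod.Name, core.ResourceConstraintsSpec{})
		if err != nil {
			return err
		}
		if status != core.AllocationStatusGranted {
			return nil // no capacity yet; the sub-task stays in a waiting state
		}
		if err := kubeClient.GetClient().Create(ctx, pod); err != nil {
			return err
		}
		// ... later, once the pod is terminal (or in Finalize):
		return tCtx.ResourceManager().ReleaseResource(ctx, ns, pod.Name)
	}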
+ + // Deallocate Resource + err = tCtx.ResourceManager().ReleaseResource(ctx, resourceNamespace, podName) + if err != nil { + logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) + return currentState, logLinks, errors2.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") + } newArrayStatus.Summary.Inc(existingPhase) newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(existingPhase)) @@ -60,14 +80,58 @@ func CheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kub continue } - generatedName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), strconv.Itoa(childIdx)) + pod := podTemplate.DeepCopy() + indexStr := strconv.Itoa(childIdx) + podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) + pod.Name = podName + pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, corev1.EnvVar{ + Name: FlyteK8sArrayIndexVarName, + Value: indexStr, + }) + + pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].env, arrayJobEnvVars...) + + pod.Spec.Containers[0].Args, err = utils.ReplaceTemplateCommandArgs(ctx, args, arrayJobInputReader{tCtx.InputReader()}, tCtx.OutputWriter()) + if err != nil { + return newState, logLinks, error2.Wrapf(ErrReplaceCmdTemplate, err, "Failed to replace cmd args") + } + + pod = ApplyPodPolicies(ctx, config, pod) + + resourceNamespace := core.ResourceNamespace(pod.Namespace) + allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, core.ResourceConstraintsSpec{}) + if err != nil { + logger.Errorf(ctx, "Resource manager failed for TaskExecId [%s] token [%s]. error %s", + tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) + return newState, logLinks, errors2.Wrapf(errors.ResourceManagerFailure, err, "Error requesting allocation token %s", podName) + } + logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) + + err = kubeClient.GetClient().Create(ctx, pod) + if err != nil && !k8serrors.IsAlreadyExists(err) { + if k8serrors.IsForbidden(err) { + if strings.Contains(err.Error(), "exceeded quota") { + // TODO: Quota errors are retried forever, it would be good to have support for backoff strategy. + logger.Infof(ctx, "Failed to launch job, resource quota exceeded. 
Err: %v", err) + newState = newState.SetPhase(arrayCore.PhaseWaitingForResources, 0).SetReason("Not enough resources to launch job.") + } else { + newState = newState.SetPhase(arrayCore.PhaseRetryableFailure, 0).SetReason("Failed to launch job.") + } + + newState = newState.SetReason(err.Error()) + return newState, logLinks, nil + } + + return newState, logLinks, errors2.Wrapf(ErrSubmitJob, err, "Failed to submit job") + } + phaseInfo, err := CheckPodStatus(ctx, kubeClient, k8sTypes.NamespacedName{ - Name: generatedName, + Name: podName, Namespace: tCtx.TaskExecutionMetadata().GetNamespace(), }) if err != nil { - return currentState, logLinks, errors.Wrapf(ErrCheckPodStatus, err, "Failed to check pod status") + return currentState, logLinks, errors2.Wrapf(ErrCheckPodStatus, err, "Failed to check pod status") } if phaseInfo.Info() != nil { @@ -80,15 +144,6 @@ func CheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kub actualPhase := phaseInfo.Phase() if phaseInfo.Phase().IsSuccess() { - - // Release token - resourceNamespace := core.ResourceNamespace(tCtx.TaskExecutionMetadata().GetOwnerID().Namespace) - err = tCtx.ResourceManager().ReleaseResource(ctx, resourceNamespace, generatedName) - if err != nil { - logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", uniqueID, err) - return currentState, logLinks, errors.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") - } - originalIdx := arrayCore.CalculateOriginalIndex(childIdx, currentState.GetIndexesToCache()) actualPhase, err = array.CheckTaskOutput(ctx, dataStore, outputPrefix, childIdx, originalIdx) if err != nil { @@ -98,6 +153,7 @@ func CheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kub newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(actualPhase)) newArrayStatus.Summary.Inc(actualPhase) + } newState = newState.SetArrayStatus(newArrayStatus) @@ -116,8 +172,9 @@ func CheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kub newState = newState.SetReason(errorMsg) } - if phase == arrayCore.PhaseCheckingSubTaskExecutions { + if phase == arrayCore.PhaseLaunchAndMonitor { newPhaseVersion := uint32(0) + // For now, the only changes to PhaseVersion and PreviousSummary occur for running array jobs. 
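The loop right after this comment derives a new phase version from the per-phase counts. A small worked illustration of that derivation follows; the summary values are made up for the example.

	// Illustration only: with 3 sub-tasks in a phase whose value is, say, 5 and
	// 1 sub-task in a phase whose value is 8, the derived version is 5*3 + 8*1 = 23.
	// A sub-task moving between phases changes the weighted sum in most cases,
	// which bumps the reported version and triggers a fresh event.
	newPhaseVersion := uint32(0)
	for phase, count := range summary { // summary: the array's arraystatus.ArraySummary
		newPhaseVersion += uint32(phase) * uint32(count)
	}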
for phase, count := range newState.GetArrayStatus().Summary { newPhaseVersion += uint32(phase) * uint32(count) From e2c3aae051c720fa8e88efe8f16b49119c5e3920 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Fri, 20 Mar 2020 11:25:05 -0700 Subject: [PATCH 03/39] check allocation status --- go/tasks/plugins/array/k8s/monitor.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 4a546b663..54ad3adde 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -105,6 +105,9 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) return newState, logLinks, errors2.Wrapf(errors.ResourceManagerFailure, err, "Error requesting allocation token %s", podName) } + if allocationStatus != core.AllocationStatusGranted { + continue + } logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) err = kubeClient.GetClient().Create(ctx, pod) From e12e28767517f87ed221fc1896490471b6d9878d Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Fri, 20 Mar 2020 11:28:08 -0700 Subject: [PATCH 04/39] fmt --- boilerplate/lyft/golang_support_tools/tools.go | 2 +- .../core/mocks/resource_negotiator.go | 9 ++++++--- go/tasks/plugins/array/k8s/monitor.go | 13 ++++++++----- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/boilerplate/lyft/golang_support_tools/tools.go b/boilerplate/lyft/golang_support_tools/tools.go index 4310b39d7..88ff64523 100644 --- a/boilerplate/lyft/golang_support_tools/tools.go +++ b/boilerplate/lyft/golang_support_tools/tools.go @@ -3,8 +3,8 @@ package tools import ( + _ "github.com/alvaroloes/enumer" _ "github.com/golangci/golangci-lint/cmd/golangci-lint" _ "github.com/lyft/flytestdlib/cli/pflags" _ "github.com/vektra/mockery/cmd/mockery" - _ "github.com/alvaroloes/enumer" ) diff --git a/go/tasks/pluginmachinery/core/mocks/resource_negotiator.go b/go/tasks/pluginmachinery/core/mocks/resource_negotiator.go index b0d5ad227..9ead9b529 100644 --- a/go/tasks/pluginmachinery/core/mocks/resource_negotiator.go +++ b/go/tasks/pluginmachinery/core/mocks/resource_negotiator.go @@ -2,9 +2,12 @@ package mocks -import context "context" -import core "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/core" -import mock "github.com/stretchr/testify/mock" +import ( + context "context" + + core "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/core" + mock "github.com/stretchr/testify/mock" +) // ResourceRegistrar is an autogenerated mock type for the ResourceRegistrar type type ResourceNegotiator struct { diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 54ad3adde..f51bce1f6 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -3,11 +3,13 @@ package k8s import ( "context" "fmt" - "github.com/lyft/flyteplugins/go/tasks/errors" - "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/utils" "strconv" + "strings" "time" + "github.com/lyft/flyteplugins/go/tasks/errors" + "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/utils" + "github.com/lyft/flytestdlib/logger" "github.com/lyft/flytestdlib/storage" @@ -19,6 +21,7 @@ import ( "github.com/lyft/flyteplugins/go/tasks/plugins/array/errorcollector" "github.com/lyft/flytestdlib/bitarray" + corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8sTypes 
"k8s.io/apimachinery/pkg/types" @@ -85,7 +88,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) pod.Name = podName pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, corev1.EnvVar{ - Name: FlyteK8sArrayIndexVarName, + Name: FlyteK8sArrayIndexVarName, Value: indexStr, }) @@ -105,7 +108,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) return newState, logLinks, errors2.Wrapf(errors.ResourceManagerFailure, err, "Error requesting allocation token %s", podName) } - if allocationStatus != core.AllocationStatusGranted { + if allocationStatus != core.AllocationStatusGranted { continue } logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) @@ -130,7 +133,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon phaseInfo, err := CheckPodStatus(ctx, kubeClient, k8sTypes.NamespacedName{ - Name: podName, + Name: podName, Namespace: tCtx.TaskExecutionMetadata().GetNamespace(), }) if err != nil { From e7c226d58e0d3e5ffc4f90a7c0044ac46c7b3555 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Fri, 20 Mar 2020 13:33:10 -0700 Subject: [PATCH 05/39] fix build --- go/tasks/plugins/array/k8s/executor.go | 4 ++-- go/tasks/plugins/array/k8s/launcher.go | 2 +- go/tasks/plugins/array/k8s/monitor.go | 12 ++++++------ 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index 9d74edc4b..86a5952a7 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -78,8 +78,8 @@ func (e Executor) Handle(ctx context.Context, tCtx core.TaskExecutionContext) (c fallthrough case arrayCore.PhaseLaunchAndMonitor: - nextState, logLinks, err = LaunchAndCheckSubTasksState(ctx, tCtx, e.kubeClient, tCtx.DataStore(), - tCtx.OutputWriter().GetOutputPrefixPath(), pluginState) + nextState, logLinks, err = LaunchAndCheckSubTasksState(ctx, tCtx, e.kubeClient, pluginConfig, + tCtx.DataStore(), tCtx.OutputWriter().GetOutputPrefixPath(), pluginState) case arrayCore.PhaseAssembleFinalOutput: nextState, err = array.AssembleFinalOutputs(ctx, e.outputsAssembler, tCtx, arrayCore.PhaseSuccess, pluginState) diff --git a/go/tasks/plugins/array/k8s/launcher.go b/go/tasks/plugins/array/k8s/launcher.go index 36b7d6b6e..1264a479f 100644 --- a/go/tasks/plugins/array/k8s/launcher.go +++ b/go/tasks/plugins/array/k8s/launcher.go @@ -167,7 +167,7 @@ func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kube } // Deallocate Resource - err = tCtx.ResourceManager().ReleaseResource(ctx, tCtx.TaskExecutionMetadata().GetNamespace(), podName) + err = tCtx.ResourceManager().ReleaseResource(ctx, core.ResourceNamespace(ResourcesPrimaryLabel), podName) if err != nil { logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) errs.Collect(i, err.Error()) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index f51bce1f6..1c746901d 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -41,7 +41,7 @@ const ( ) func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, - dataStore *storage.DataStore, outputPrefix 
storage.DataReference, currentState *arrayCore.State) ( + config *Config, dataStore *storage.DataStore, outputPrefix storage.DataReference, currentState *arrayCore.State) ( newState *arrayCore.State, logLinks []*idlCore.TaskLog, err error) { logLinks = make([]*idlCore.TaskLog, 0, 4) @@ -66,12 +66,14 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon for childIdx, existingPhaseIdx := range currentState.GetArrayStatus().Detailed.GetItems() { existingPhase := core.Phases[existingPhaseIdx] + indexStr := strconv.Itoa(childIdx) + podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) if existingPhase.IsTerminal() { // If we get here it means we have already "processed" this terminal phase since we will only persist // the phase after all processing is done (e.g. check outputs/errors file, record events... etc.). // Deallocate Resource - err = tCtx.ResourceManager().ReleaseResource(ctx, resourceNamespace, podName) + err = tCtx.ResourceManager().ReleaseResource(ctx, core.ResourceNamespace(ResourcesPrimaryLabel), podName) if err != nil { logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) return currentState, logLinks, errors2.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") @@ -84,19 +86,17 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon } pod := podTemplate.DeepCopy() - indexStr := strconv.Itoa(childIdx) - podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) pod.Name = podName pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, corev1.EnvVar{ Name: FlyteK8sArrayIndexVarName, Value: indexStr, }) - pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].env, arrayJobEnvVars...) + pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, arrayJobEnvVars...) 
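Each sub-task pod receives its index through FLYTE_K8S_ARRAY_INDEX (plus the shared arrayJobEnvVars appended above). A minimal, self-contained sketch of how a container-side entrypoint could consume that contract; the snippet is hypothetical and only illustrates what the injected variable is for.

	package main

	import (
		"log"
		"os"
		"strconv"
	)

	func main() {
		// FLYTE_K8S_ARRAY_INDEX is the value of FlyteK8sArrayIndexVarName set above.
		raw := os.Getenv("FLYTE_K8S_ARRAY_INDEX")
		idx, err := strconv.Atoi(raw)
		if err != nil {
			log.Fatalf("invalid array index %q: %v", raw, err)
		}
		log.Printf("processing input shard %d", idx)
		// ... select the idx-th input and run the sub-task work (hypothetical).
	}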
pod.Spec.Containers[0].Args, err = utils.ReplaceTemplateCommandArgs(ctx, args, arrayJobInputReader{tCtx.InputReader()}, tCtx.OutputWriter()) if err != nil { - return newState, logLinks, error2.Wrapf(ErrReplaceCmdTemplate, err, "Failed to replace cmd args") + return newState, logLinks, errors2.Wrapf(ErrReplaceCmdTemplate, err, "Failed to replace cmd args") } pod = ApplyPodPolicies(ctx, config, pod) From 8ed48d447650620815d5dc3bc5496451deb08a29 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Fri, 20 Mar 2020 13:38:51 -0700 Subject: [PATCH 06/39] errors2 --- go/tasks/plugins/array/k8s/monitor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 1c746901d..66728d313 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -37,7 +37,7 @@ import ( ) const ( - ErrCheckPodStatus errors.ErrorCode = "CHECK_POD_FAILED" + ErrCheckPodStatus errors2.ErrorCode = "CHECK_POD_FAILED" ) func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, From 6090a7b5461a8f02ddaed583670ab1334e7c97f7 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Mon, 23 Mar 2020 16:07:19 -0700 Subject: [PATCH 07/39] update state machine --- go/tasks/plugins/array/core/state.go | 4 ++++ go/tasks/plugins/array/k8s/executor.go | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/go/tasks/plugins/array/core/state.go b/go/tasks/plugins/array/core/state.go index 726cd1d68..f8dbf3023 100644 --- a/go/tasks/plugins/array/core/state.go +++ b/go/tasks/plugins/array/core/state.go @@ -178,6 +178,10 @@ func MapArrayStateToPluginPhase(_ context.Context, state *State, logLinks []*idl // The first time we return a Running core.Phase, we can just use the version inside the state object itself. 
phaseInfo = core.PhaseInfoRunning(version, nowTaskInfo) + case PhaseLaunchAndMonitor: + version := GetPhaseVersionOffset(p, 1) + version + phaseInfo = core.PhaseInfoRunning(version, nowTaskInfo) + case PhaseWaitingForResources: phaseInfo = core.PhaseInfoWaitingForResources(t, version, state.GetReason()) diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index 86a5952a7..c06564589 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -71,7 +71,7 @@ func (e Executor) Handle(ctx context.Context, tCtx core.TaskExecutionContext) (c nextState, err = array.DetermineDiscoverability(ctx, tCtx, pluginState) case arrayCore.PhasePreLaunch: - nextState = pluginState.SetPhase(arrayCore.PhaseLaunch, core.DefaultPhaseVersion).SetReason("Nothing to do in PreLaunch phase.") + nextState = pluginState.SetPhase(arrayCore.PhaseLaunchAndMonitor, core.DefaultPhaseVersion).SetReason("Nothing to do in PreLaunch phase.") err = nil case arrayCore.PhaseWaitingForResources: From 3bd1ef424ffeda9d87cf1bd403dd37359be19e51 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Tue, 24 Mar 2020 10:17:57 -0700 Subject: [PATCH 08/39] upd state machine --- go/tasks/plugins/array/k8s/executor.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index c06564589..562139336 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -71,16 +71,20 @@ func (e Executor) Handle(ctx context.Context, tCtx core.TaskExecutionContext) (c nextState, err = array.DetermineDiscoverability(ctx, tCtx, pluginState) case arrayCore.PhasePreLaunch: - nextState = pluginState.SetPhase(arrayCore.PhaseLaunchAndMonitor, core.DefaultPhaseVersion).SetReason("Nothing to do in PreLaunch phase.") + nextState = pluginState.SetPhase(arrayCore.PhaseLaunch, core.DefaultPhaseVersion).SetReason("Nothing to do in PreLaunch phase.") err = nil - case arrayCore.PhaseWaitingForResources: - fallthrough + case arrayCore.PhaseLaunch: + nextState = pluginState.SetPhase(arrayCore.PhaseLaunchAndMonitor, core.DefaultPhaseVersion).SetReason("Nothing to do in Launch phase.") + err = nil case arrayCore.PhaseLaunchAndMonitor: nextState, logLinks, err = LaunchAndCheckSubTasksState(ctx, tCtx, e.kubeClient, pluginConfig, tCtx.DataStore(), tCtx.OutputWriter().GetOutputPrefixPath(), pluginState) + case arrayCore.PhaseWaitingForResources: + fallthrough + case arrayCore.PhaseAssembleFinalOutput: nextState, err = array.AssembleFinalOutputs(ctx, e.outputsAssembler, tCtx, arrayCore.PhaseSuccess, pluginState) From b91380621fface3f16a91c3d998d63edd8e6ea38 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Tue, 24 Mar 2020 12:40:04 -0700 Subject: [PATCH 09/39] remove phaseLaunchAndMonitor state --- go/tasks/pluginmachinery/core/phase.go | 4 ++++ go/tasks/plugins/array/core/state.go | 13 ++++++++----- go/tasks/plugins/array/k8s/executor.go | 7 +++++-- go/tasks/plugins/array/k8s/monitor.go | 4 +++- 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/go/tasks/pluginmachinery/core/phase.go b/go/tasks/pluginmachinery/core/phase.go index 470cf1368..de88c7538 100644 --- a/go/tasks/pluginmachinery/core/phase.go +++ b/go/tasks/pluginmachinery/core/phase.go @@ -60,6 +60,10 @@ func (p Phase) IsSuccess() bool { return p == PhaseSuccess } +func (p Phase) IsWaitingForResources() bool { + return p == PhaseWaitingForResources +} + type TaskInfo struct { // log information 
for the task execution Logs []*core.TaskLog diff --git a/go/tasks/plugins/array/core/state.go b/go/tasks/plugins/array/core/state.go index f8dbf3023..1453c0e79 100644 --- a/go/tasks/plugins/array/core/state.go +++ b/go/tasks/plugins/array/core/state.go @@ -27,7 +27,6 @@ const ( PhaseStart Phase = iota PhasePreLaunch PhaseLaunch - PhaseLaunchAndMonitor PhaseWaitingForResources PhaseCheckingSubTaskExecutions PhaseAssembleFinalOutput @@ -178,10 +177,6 @@ func MapArrayStateToPluginPhase(_ context.Context, state *State, logLinks []*idl // The first time we return a Running core.Phase, we can just use the version inside the state object itself. phaseInfo = core.PhaseInfoRunning(version, nowTaskInfo) - case PhaseLaunchAndMonitor: - version := GetPhaseVersionOffset(p, 1) + version - phaseInfo = core.PhaseInfoRunning(version, nowTaskInfo) - case PhaseWaitingForResources: phaseInfo = core.PhaseInfoWaitingForResources(t, version, state.GetReason()) @@ -231,6 +226,7 @@ func SummaryToPhase(ctx context.Context, minSuccesses int64, summary arraystatus totalSuccesses := int64(0) totalFailures := int64(0) totalRunning := int64(0) + totalWaitingForResources := int64(0) for phase, count := range summary { totalCount += count if phase.IsTerminal() { @@ -243,11 +239,18 @@ func SummaryToPhase(ctx context.Context, minSuccesses int64, summary arraystatus // TODO: preferable to auto-combine to array tasks for now. totalFailures += count } + } else if phase.IsWaitingForResources() { + totalWaitingForResources += count } else { totalRunning += count } } + if totalWaitingForResources > 0 { + logger.Infof(ctx, "Array is still running and waiting for resources totalWaitingForResources[%v]", totalWaitingForResources) + return PhaseCheckingSubTaskExecutions + } + if totalCount < minSuccesses { logger.Infof(ctx, "Array failed because totalCount[%v] < minSuccesses[%v]", totalCount, minSuccesses) return PhaseWriteToDiscoveryThenFail diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index 562139336..235cb38d4 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -75,10 +75,13 @@ func (e Executor) Handle(ctx context.Context, tCtx core.TaskExecutionContext) (c err = nil case arrayCore.PhaseLaunch: - nextState = pluginState.SetPhase(arrayCore.PhaseLaunchAndMonitor, core.DefaultPhaseVersion).SetReason("Nothing to do in Launch phase.") + // This is a hack. In order to maintain backwards compatibility with the state transitions + // in the aws batch plugin. Forward to PhaseCheckingSubTasksExecutions where the launching + // is actually occurring. 
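The comment above describes the reshaped state machine: launching no longer happens in PhaseLaunch; both launching and monitoring happen in PhaseCheckingSubTaskExecutions. A condensed view of the resulting Handle dispatch, as a sketch that elides error handling, SetReason calls, and the assembly/write-to-discovery phases.

	// currentPhase is the array plugin's internal phase (selector elided).
	switch currentPhase {
	case arrayCore.PhasePreLaunch:
		nextState = pluginState.SetPhase(arrayCore.PhaseLaunch, core.DefaultPhaseVersion)

	case arrayCore.PhaseWaitingForResources, arrayCore.PhaseLaunch:
		// Forwarded straight to the checking phase, where launching now occurs.
		nextState = pluginState.SetPhase(arrayCore.PhaseCheckingSubTaskExecutions, core.DefaultPhaseVersion)

	case arrayCore.PhaseCheckingSubTaskExecutions:
		nextState, logLinks, err = LaunchAndCheckSubTasksState(ctx, tCtx, e.kubeClient, pluginConfig,
			tCtx.DataStore(), tCtx.OutputWriter().GetOutputPrefixPath(), pluginState)
	}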
+ nextState = pluginState.SetPhase(arrayCore.PhaseCheckingSubTaskExecutions, core.DefaultPhaseVersion).SetReason("Nothing to do in Launch phase.") err = nil - case arrayCore.PhaseLaunchAndMonitor: + case arrayCore.PhaseCheckingSubTaskExecutions: nextState, logLinks, err = LaunchAndCheckSubTasksState(ctx, tCtx, e.kubeClient, pluginConfig, tCtx.DataStore(), tCtx.OutputWriter().GetOutputPrefixPath(), pluginState) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 66728d313..9a8dd8d05 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -109,6 +109,8 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon return newState, logLinks, errors2.Wrapf(errors.ResourceManagerFailure, err, "Error requesting allocation token %s", podName) } if allocationStatus != core.AllocationStatusGranted { + newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseWaitingForResources)) + newArrayStatus.Summary.Inc(core.PhaseWaitingForResources) continue } logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) @@ -178,7 +180,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon newState = newState.SetReason(errorMsg) } - if phase == arrayCore.PhaseLaunchAndMonitor { + if phase == arrayCore.PhaseCheckingSubTaskExecutions { newPhaseVersion := uint32(0) // For now, the only changes to PhaseVersion and PreviousSummary occur for running array jobs. From 61f475118d8b14d2e5b65f6d2b01099c7f6f13a5 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Thu, 26 Mar 2020 09:27:31 -0700 Subject: [PATCH 10/39] generate --- .../core/mocks/resource_registrar.go | 5 --- go/tasks/plugins/array/k8s/config_flags.go | 2 + .../plugins/array/k8s/config_flags_test.go | 44 +++++++++++++++++++ go/tasks/plugins/array/k8s/executor.go | 6 +-- 4 files changed, 49 insertions(+), 8 deletions(-) diff --git a/go/tasks/pluginmachinery/core/mocks/resource_registrar.go b/go/tasks/pluginmachinery/core/mocks/resource_registrar.go index fdfaf11a9..c2707c49d 100644 --- a/go/tasks/pluginmachinery/core/mocks/resource_registrar.go +++ b/go/tasks/pluginmachinery/core/mocks/resource_registrar.go @@ -14,11 +14,6 @@ type ResourceRegistrar struct { mock.Mock } -// RegisterResourceNamespaceQuotaProportionCap provides a mock function with given fields: ctx, proportionCap -func (_m *ResourceRegistrar) RegisterResourceNamespaceQuotaProportionCap(ctx context.Context, proportionCap float64) { - _m.Called(ctx, proportionCap) -} - type ResourceRegistrar_RegisterResourceQuota struct { *mock.Call } diff --git a/go/tasks/plugins/array/k8s/config_flags.go b/go/tasks/plugins/array/k8s/config_flags.go index 4a03aefc9..92f05aca7 100755 --- a/go/tasks/plugins/array/k8s/config_flags.go +++ b/go/tasks/plugins/array/k8s/config_flags.go @@ -50,5 +50,7 @@ func (cfg Config) GetPFlagSet(prefix string) *pflag.FlagSet { cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "ErrorAssembler.workers"), defaultConfig.ErrorAssembler.Workers, "Number of concurrent workers to start processing the queue.") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "ErrorAssembler.maxRetries"), defaultConfig.ErrorAssembler.MaxRetries, "Maximum number of retries per item.") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "ErrorAssembler.maxItems"), defaultConfig.ErrorAssembler.IndexCacheMaxItems, "Maximum number of entries to keep in the index.") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "ResourcesConfig.primaryLabel"), 
defaultConfig.ResourcesConfig.PrimaryLabel, "PrimaryLabel of a given service cluster") + cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "ResourcesConfig.limit"), defaultConfig.ResourcesConfig.Limit, "Resource quota (in the number of outstanding requests) of the service cluster") return cmdFlags } diff --git a/go/tasks/plugins/array/k8s/config_flags_test.go b/go/tasks/plugins/array/k8s/config_flags_test.go index df7b41909..4329d54f6 100755 --- a/go/tasks/plugins/array/k8s/config_flags_test.go +++ b/go/tasks/plugins/array/k8s/config_flags_test.go @@ -297,4 +297,48 @@ func TestConfig_SetFlags(t *testing.T) { } }) }) + t.Run("Test_ResourcesConfig.primaryLabel", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("ResourcesConfig.primaryLabel"); err == nil { + assert.Equal(t, string(defaultConfig.ResourcesConfig.PrimaryLabel), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("ResourcesConfig.primaryLabel", testValue) + if vString, err := cmdFlags.GetString("ResourcesConfig.primaryLabel"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.ResourcesConfig.PrimaryLabel) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_ResourcesConfig.limit", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vInt, err := cmdFlags.GetInt("ResourcesConfig.limit"); err == nil { + assert.Equal(t, int(defaultConfig.ResourcesConfig.Limit), vInt) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("ResourcesConfig.limit", testValue) + if vInt, err := cmdFlags.GetInt("ResourcesConfig.limit"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vInt), &actual.ResourcesConfig.Limit) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) } diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index 235cb38d4..3ab053c0d 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -74,6 +74,9 @@ func (e Executor) Handle(ctx context.Context, tCtx core.TaskExecutionContext) (c nextState = pluginState.SetPhase(arrayCore.PhaseLaunch, core.DefaultPhaseVersion).SetReason("Nothing to do in PreLaunch phase.") err = nil + case arrayCore.PhaseWaitingForResources: + fallthrough + case arrayCore.PhaseLaunch: // This is a hack. In order to maintain backwards compatibility with the state transitions // in the aws batch plugin. 
Forward to PhaseCheckingSubTasksExecutions where the launching @@ -85,9 +88,6 @@ func (e Executor) Handle(ctx context.Context, tCtx core.TaskExecutionContext) (c nextState, logLinks, err = LaunchAndCheckSubTasksState(ctx, tCtx, e.kubeClient, pluginConfig, tCtx.DataStore(), tCtx.OutputWriter().GetOutputPrefixPath(), pluginState) - case arrayCore.PhaseWaitingForResources: - fallthrough - case arrayCore.PhaseAssembleFinalOutput: nextState, err = array.AssembleFinalOutputs(ctx, e.outputsAssembler, tCtx, arrayCore.PhaseSuccess, pluginState) From d39b4578fd93ea43de6090545322ceac059a3b02 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Fri, 27 Mar 2020 08:52:00 -0700 Subject: [PATCH 11/39] initialize array if empty --- go/tasks/plugins/array/k8s/monitor.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 9a8dd8d05..8cfb027d5 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -47,12 +47,24 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon logLinks = make([]*idlCore.TaskLog, 0, 4) newState = currentState + if int64(currentState.GetExecutionArraySize()) > config.MaxArrayJobSize { + ee := fmt.Errorf("array size > max allowed. Requested [%v]. Allowed [%v]", currentState.GetExecutionArraySize(), config.MaxArrayJobSize) + logger.Info(ctx, ee) + currentState = currentState.SetPhase(arrayCore.PhasePermanentFailure, 0).SetReason(ee.Error()) + return currentState, logLinks, nil + } + msg := errorcollector.NewErrorMessageCollector() + newArrayStatus := arraystatus.ArrayStatus{ Summary: arraystatus.ArraySummary{}, Detailed: arrayCore.NewPhasesCompactArray(uint(currentState.GetExecutionArraySize())), } + if len(currentState.GetArrayStatus().Detailed.GetItems()) == 0 { + currentState.ArrayStatus = newArrayStatus + } + podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx) if err != nil { return newState, logLinks, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for task") @@ -76,7 +88,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon err = tCtx.ResourceManager().ReleaseResource(ctx, core.ResourceNamespace(ResourcesPrimaryLabel), podName) if err != nil { logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) - return currentState, logLinks, errors2.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") + return newState, logLinks, errors2.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") } newArrayStatus.Summary.Inc(existingPhase) newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(existingPhase)) From e371f279e83110c8c024a23663bbe5b024d2d62b Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Mon, 30 Mar 2020 09:18:48 -0700 Subject: [PATCH 12/39] create resource contraint spec --- go/tasks/plugins/array/k8s/monitor.go | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index bd0fe5923..09ac34806 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -115,7 +115,8 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon pod = ApplyPodPolicies(ctx, config, pod) resourceNamespace := core.ResourceNamespace(pod.Namespace) - allocationStatus, err := 
tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, core.ResourceConstraintsSpec{}) + resourceConstrainSpec := createResourceConstraintsSpec(ctx, tCtx, config, core.ResourceNamespace(ResourcesPrimaryLabel)) + allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, resourceConstrainSpec) if err != nil { logger.Errorf(ctx, "Resource manager failed for TaskExecId [%s] token [%s]. error %s", tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) @@ -267,3 +268,21 @@ func CheckPodStatus(ctx context.Context, client core.KubeClient, name k8sTypes.N return core.PhaseInfoRunning(core.DefaultPhaseVersion, &taskInfo), nil } + +func createResourceConstraintsSpec(ctx context.Context, _ core.TaskExecutionContext, config *Config, primaryLabel core.ResourceNamespace) core.ResourceConstraintsSpec { + constraintsSpec := core.ResourceConstraintsSpec{ + ProjectScopeResourceConstraint: nil, + NamespaceScopeResourceConstraint: nil, + } + + if config.ResourcesConfig == (ResourceConfig{}) { + logger.Infof(ctx, "No Resource config is found. Returning an empty resource constraints spec") + return constraintsSpec + } + + constraintsSpec.ProjectScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} + constraintsSpec.NamespaceScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} + + logger.Infof(ctx, "Created a resource constraints spec: [%v]", constraintsSpec) + return constraintsSpec +} From f2a16bfb02a1e21c81ff7790de83751bf285a403 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Mon, 30 Mar 2020 11:10:47 -0700 Subject: [PATCH 13/39] add json doc for pflag --- go/tasks/plugins/array/k8s/config.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index 7cbb42fd7..b8858e012 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -39,12 +39,12 @@ type ResourceConfig struct { // Defines custom config for K8s Array plugin type Config struct { - DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` - MaxErrorStringLength int `json:"maxErrLength" pflag:",Determines the maximum length of the error string returned for the array."` - MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` + DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` + MaxErrorStringLength int `json:"maxErrLength" pflag:",Determines the maximum length of the error string returned for the array."` + MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` + ResourcesConfig ResourceConfig `json:"resourceConfig" pflag:"-,ResourceConfiguration to be used with Resource Manager.` OutputAssembler workqueue.Config ErrorAssembler workqueue.Config - ResourcesConfig ResourceConfig } func GetConfig() *Config { From ae4f1464bc81e915d480b13873bd6ce09de5b5b4 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Wed, 1 Apr 2020 08:42:07 -0700 Subject: [PATCH 14/39] use primary label --- go/tasks/plugins/array/k8s/config.go | 2 +- go/tasks/plugins/array/k8s/monitor.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index b8858e012..43c1304d8 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -42,7 +42,7 @@ type Config 
struct { DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` MaxErrorStringLength int `json:"maxErrLength" pflag:",Determines the maximum length of the error string returned for the array."` MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` - ResourcesConfig ResourceConfig `json:"resourceConfig" pflag:"-,ResourceConfiguration to be used with Resource Manager.` + ResourcesConfig ResourceConfig `json:"resourceConfig" pflag:"-,ResourceConfiguration to be used with Resource Manager."` OutputAssembler workqueue.Config ErrorAssembler workqueue.Config } diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 09ac34806..4c4825ba4 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -114,8 +114,8 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon pod = ApplyPodPolicies(ctx, config, pod) - resourceNamespace := core.ResourceNamespace(pod.Namespace) - resourceConstrainSpec := createResourceConstraintsSpec(ctx, tCtx, config, core.ResourceNamespace(ResourcesPrimaryLabel)) + resourceNamespace := core.ResourceNamespace(ResourcesPrimaryLabel) + resourceConstrainSpec := createResourceConstraintsSpec(ctx, tCtx, config) allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, resourceConstrainSpec) if err != nil { logger.Errorf(ctx, "Resource manager failed for TaskExecId [%s] token [%s]. error %s", @@ -269,7 +269,7 @@ func CheckPodStatus(ctx context.Context, client core.KubeClient, name k8sTypes.N } -func createResourceConstraintsSpec(ctx context.Context, _ core.TaskExecutionContext, config *Config, primaryLabel core.ResourceNamespace) core.ResourceConstraintsSpec { +func createResourceConstraintsSpec(ctx context.Context, _ core.TaskExecutionContext, config *Config) core.ResourceConstraintsSpec { constraintsSpec := core.ResourceConstraintsSpec{ ProjectScopeResourceConstraint: nil, NamespaceScopeResourceConstraint: nil, From 286425c108a70ceb74c87390f42074fa92f2ebf0 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Fri, 3 Apr 2020 14:37:36 -0700 Subject: [PATCH 15/39] upd config and task launch --- go/tasks/plugins/array/k8s/config.go | 8 ++- go/tasks/plugins/array/k8s/executor.go | 13 ++-- go/tasks/plugins/array/k8s/monitor.go | 57 ++++++++++++------ go/tasks/plugins/array/k8s/task.go | 83 ++++++++++++++++++++++++++ 4 files changed, 134 insertions(+), 27 deletions(-) create mode 100644 go/tasks/plugins/array/k8s/task.go diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index 43c1304d8..61248e48e 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -34,7 +34,7 @@ var ( type ResourceConfig struct { PrimaryLabel string `json:"primaryLabel" pflag:",PrimaryLabel of a given service cluster"` - Limit int `json:"limit" pflag:",Resource quota (in the number of outstanding requests) of the service cluster"` + Limit int `json:"limit" pflag:",Resource quota (in the number of outstanding requests) for the cluster"` } // Defines custom config for K8s Array plugin @@ -42,7 +42,7 @@ type Config struct { DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` MaxErrorStringLength int `json:"maxErrLength" pflag:",Determines the maximum length of the error string returned for the array."` MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum 
size of array job."` - ResourcesConfig ResourceConfig `json:"resourceConfig" pflag:"-,ResourceConfiguration to be used with Resource Manager."` + ResourceConfig ResourceConfig `json:"resourceConfig" pflag:"-,ResourceConfiguration to limit number of resources used by k8s-array."` OutputAssembler workqueue.Config ErrorAssembler workqueue.Config } @@ -50,3 +50,7 @@ type Config struct { func GetConfig() *Config { return configSection.GetConfig().(*Config) } + +func IsResourceConfigSet() bool { + return GetConfig().ResourceConfig != (ResourceConfig{}) +} diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index 850ee1250..a77680ed0 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -169,12 +169,13 @@ func GetNewExecutorPlugin(ctx context.Context, iCtx core.SetupContext) (core.Plu return nil, err } - primaryLabel := GetConfig().ResourcesConfig.PrimaryLabel - resourceLimit := GetConfig().ResourcesConfig.Limit - - if err := iCtx.ResourceRegistrar().RegisterResourceQuota(ctx, core.ResourceNamespace(primaryLabel), resourceLimit); err != nil { - logger.Errorf(ctx, "Token Resource registration for [%v] failed due to error [%v]", primaryLabel, err) - return nil, err + if IsResourceConfigSet() { + primaryLabel := GetConfig().ResourceConfig.PrimaryLabel + limit := GetConfig().ResourceConfig.Limit + if err := iCtx.ResourceRegistrar().RegisterResourceQuota(ctx, core.ResourceNamespace(primaryLabel), limit); err != nil { + logger.Errorf(ctx, "Token Resource registration for [%v] failed due to error [%v]", primaryLabel, err) + return nil, err + } } return exec, nil diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 4c4825ba4..5471a9537 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -47,6 +47,11 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon logLinks = make([]*idlCore.TaskLog, 0, 4) newState = currentState + msg := errorcollector.NewErrorMessageCollector() + newArrayStatus := arraystatus.ArrayStatus{ + Summary: arraystatus.ArraySummary{}, + Detailed: arrayCore.NewPhasesCompactArray(uint(currentState.GetExecutionArraySize())), + } if int64(currentState.GetExecutionArraySize()) > config.MaxArrayJobSize { ee := fmt.Errorf("array size > max allowed. Requested [%v]. Allowed [%v]", currentState.GetExecutionArraySize(), config.MaxArrayJobSize) @@ -55,13 +60,8 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon return currentState, logLinks, nil } - msg := errorcollector.NewErrorMessageCollector() - - newArrayStatus := arraystatus.ArrayStatus{ - Summary: arraystatus.ArraySummary{}, - Detailed: arrayCore.NewPhasesCompactArray(uint(currentState.GetExecutionArraySize())), - } - + // If we have arrived at this state for the first time then currentState has not been + // initialized with number of sub tasks. 
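Earlier in this patch the plugin config gained a ResourceConfig section and an IsResourceConfigSet() helper, and quota registration in the executor became conditional on it. An illustrative configuration follows; the values are made up, not defaults from the patch.

	// Illustrative plugin configuration for the new resource-manager knobs.
	// With an empty ResourceConfig, IsResourceConfigSet() returns false, no
	// quota is registered at setup, and sub-task launches are not gated on
	// allocation tokens.
	cfg := Config{
		MaxArrayJobSize: 5000,
		ResourceConfig: ResourceConfig{
			PrimaryLabel: "token", // resource namespace registered with the resource manager
			Limit:        100,     // max number of outstanding sub-task pods
		},
	}
	_ = cfg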
if len(currentState.GetArrayStatus().Detailed.GetItems()) == 0 { currentState.ArrayStatus = newArrayStatus } @@ -114,20 +114,14 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon pod = ApplyPodPolicies(ctx, config, pod) - resourceNamespace := core.ResourceNamespace(ResourcesPrimaryLabel) - resourceConstrainSpec := createResourceConstraintsSpec(ctx, tCtx, config) - allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, resourceConstrainSpec) + success, err := allocateResource(ctx, tCtx, config, podName, childIdx, &newArrayStatus) if err != nil { - logger.Errorf(ctx, "Resource manager failed for TaskExecId [%s] token [%s]. error %s", - tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) return newState, logLinks, errors2.Wrapf(errors.ResourceManagerFailure, err, "Error requesting allocation token %s", podName) } - if allocationStatus != core.AllocationStatusGranted { - newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseWaitingForResources)) - newArrayStatus.Summary.Inc(core.PhaseWaitingForResources) + + if !success { continue } - logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) err = kubeClient.GetClient().Create(ctx, pod) if err != nil && !k8serrors.IsAlreadyExists(err) { @@ -269,20 +263,45 @@ func CheckPodStatus(ctx context.Context, client core.KubeClient, name k8sTypes.N } -func createResourceConstraintsSpec(ctx context.Context, _ core.TaskExecutionContext, config *Config) core.ResourceConstraintsSpec { +func createResourceConstraintsSpec(ctx context.Context, _ core.TaskExecutionContext) core.ResourceConstraintsSpec { constraintsSpec := core.ResourceConstraintsSpec{ ProjectScopeResourceConstraint: nil, NamespaceScopeResourceConstraint: nil, } - if config.ResourcesConfig == (ResourceConfig{}) { + if !IsResourceConfigSet() { logger.Infof(ctx, "No Resource config is found. Returning an empty resource constraints spec") return constraintsSpec } constraintsSpec.ProjectScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} constraintsSpec.NamespaceScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} - logger.Infof(ctx, "Created a resource constraints spec: [%v]", constraintsSpec) + return constraintsSpec } + +func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string, childIdx int, arrayStatus *arraystatus.ArrayStatus) (bool, error) { + if !IsResourceConfigSet() { + return true, nil + } + + resourceNamespace := core.ResourceNamespace(config.ResourceConfig.PrimaryLabel) + resourceConstraintSpec := createResourceConstraintsSpec(ctx, tCtx) + + allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, resourceConstraintSpec) + if err != nil { + logger.Errorf(ctx, "Resource manager failed for TaskExecId [%s] token [%s]. 
error %s", + tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) + return false, err + } + + if allocationStatus != core.AllocationStatusGranted { + arrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseWaitingForResources)) + arrayStatus.Summary.Inc(core.PhaseWaitingForResources) + return false, nil + } + + logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) + return true, nil +} diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go new file mode 100644 index 000000000..e7f30499f --- /dev/null +++ b/go/tasks/plugins/array/k8s/task.go @@ -0,0 +1,83 @@ +package k8s + +import ( + "context" + "strconv" + "strings" + + idlCore "github.com/lyft/flyteidl/gen/pb-go/flyteidl/core" + "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/core" + "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/utils" + arrayCore "github.com/lyft/flyteplugins/go/tasks/plugins/array/core" + errors2 "github.com/lyft/flytestdlib/errors" + "github.com/lyft/flytestdlib/logger" + corev1 "k8s.io/api/core/v1" + k8serrors "k8s.io/apimachinery/pkg/api/errors" +) + +type Task struct { + LogLinks *[]idlCore.TaskLog + State *arrayCore.State + Config *Config + ChildIdx int +} + +func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (bool, error) { + newState := t.State + podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx) + if err != nil { + return false, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for a task") + } + + var args []string + if len(podTemplate.Spec.Containers) > 0 { + args = append(podTemplate.Spec.Containers[0].Command, podTemplate.Spec.Containers[0].Args...) + podTemplate.Spec.Containers[0].Command = []string{} + } + + indexStr := strconv.Itoa(t.ChildIdx) + podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) + + pod := podTemplate.DeepCopy() + pod.Name = podName + pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, corev1.EnvVar{ + Name: FlyteK8sArrayIndexVarName, + Value: indexStr, + }) + + pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, arrayJobEnvVars...) + pod.Spec.Containers[0].Args, err = utils.ReplaceTemplateCommandArgs(ctx, args, arrayJobInputReader{tCtx.InputReader()}, tCtx.OutputWriter()) + if err != nil { + return false, errors2.Wrapf(ErrReplaceCmdTemplate, err, "Failed to replace cmd args") + } + + pod = ApplyPodPolicies(ctx, t.Config, pod) + + succeeded, err := allocateResource(ctx, tCtx, t.Config, podName, t.ChildIdx, &newState.ArrayStatus) + if err != nil { + return succeeded, err + } + + if !succeeded { + return false, nil + } + + err = kubeClient.GetClient().Create(ctx, pod) + if err != nil && !k8serrors.IsAlreadyExists(err) { + if k8serrors.IsForbidden(err) { + if strings.Contains(err.Error(), "exceeded quota") { + // TODO: Quota errors are retried forever, it would be good to have support for backoff strategy. + logger.Infof(ctx, "Failed to launch job, resource quota exceeded. 
Err: %v", err) + newState = newState.SetPhase(arrayCore.PhaseWaitingForResources, 0).SetReason("Not enough resources to launch job") + } else { + newState = newState.SetPhase(arrayCore.PhaseRetryableFailure, 0).SetReason("Failed to launch job.") + } + + newState = newState.SetReason(err.Error()) + return false, nil + } + + return false, errors2.Wrapf(ErrSubmitJob, err, "Failed to submit job.") + } + +} From 9cd4e4e601085f5f7b049064ed55b7387c8b62e3 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Mon, 6 Apr 2020 11:12:32 -0700 Subject: [PATCH 16/39] switch to enum return status --- go/tasks/plugins/array/k8s/task.go | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index e7f30499f..67735d605 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -22,11 +22,20 @@ type Task struct { ChildIdx int } -func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (bool, error) { +type TaskLaunchStatus int8 + +const ( + Success TaskLaunchStatus = iota + Error + Skip + ReturnState +) + +func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (TaskLaunchStatus, error) { newState := t.State podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx) if err != nil { - return false, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for a task") + return Error, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for a task") } var args []string @@ -48,18 +57,18 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, arrayJobEnvVars...) 
pod.Spec.Containers[0].Args, err = utils.ReplaceTemplateCommandArgs(ctx, args, arrayJobInputReader{tCtx.InputReader()}, tCtx.OutputWriter()) if err != nil { - return false, errors2.Wrapf(ErrReplaceCmdTemplate, err, "Failed to replace cmd args") + return Error, errors2.Wrapf(ErrReplaceCmdTemplate, err, "Failed to replace cmd args") } pod = ApplyPodPolicies(ctx, t.Config, pod) succeeded, err := allocateResource(ctx, tCtx, t.Config, podName, t.ChildIdx, &newState.ArrayStatus) if err != nil { - return succeeded, err + return Error, err } if !succeeded { - return false, nil + return Skip, nil } err = kubeClient.GetClient().Create(ctx, pod) @@ -74,10 +83,11 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl } newState = newState.SetReason(err.Error()) - return false, nil + return ReturnState, nil } - return false, errors2.Wrapf(ErrSubmitJob, err, "Failed to submit job.") + return Error, errors2.Wrapf(ErrSubmitJob, err, "Failed to submit job.") } + return Success, nil } From 1a5c62cd9b6fc5f617bce44cfa4224ee89833762 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Mon, 6 Apr 2020 15:46:38 -0700 Subject: [PATCH 17/39] impl Monitor and Finalize in task --- go/tasks/plugins/array/k8s/config_flags.go | 2 - .../plugins/array/k8s/config_flags_test.go | 44 ----- go/tasks/plugins/array/k8s/monitor.go | 163 ++++-------------- go/tasks/plugins/array/k8s/task.go | 163 ++++++++++++++++-- 4 files changed, 184 insertions(+), 188 deletions(-) diff --git a/go/tasks/plugins/array/k8s/config_flags.go b/go/tasks/plugins/array/k8s/config_flags.go index 92f05aca7..4a03aefc9 100755 --- a/go/tasks/plugins/array/k8s/config_flags.go +++ b/go/tasks/plugins/array/k8s/config_flags.go @@ -50,7 +50,5 @@ func (cfg Config) GetPFlagSet(prefix string) *pflag.FlagSet { cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "ErrorAssembler.workers"), defaultConfig.ErrorAssembler.Workers, "Number of concurrent workers to start processing the queue.") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "ErrorAssembler.maxRetries"), defaultConfig.ErrorAssembler.MaxRetries, "Maximum number of retries per item.") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "ErrorAssembler.maxItems"), defaultConfig.ErrorAssembler.IndexCacheMaxItems, "Maximum number of entries to keep in the index.") - cmdFlags.String(fmt.Sprintf("%v%v", prefix, "ResourcesConfig.primaryLabel"), defaultConfig.ResourcesConfig.PrimaryLabel, "PrimaryLabel of a given service cluster") - cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "ResourcesConfig.limit"), defaultConfig.ResourcesConfig.Limit, "Resource quota (in the number of outstanding requests) of the service cluster") return cmdFlags } diff --git a/go/tasks/plugins/array/k8s/config_flags_test.go b/go/tasks/plugins/array/k8s/config_flags_test.go index 4329d54f6..df7b41909 100755 --- a/go/tasks/plugins/array/k8s/config_flags_test.go +++ b/go/tasks/plugins/array/k8s/config_flags_test.go @@ -297,48 +297,4 @@ func TestConfig_SetFlags(t *testing.T) { } }) }) - t.Run("Test_ResourcesConfig.primaryLabel", func(t *testing.T) { - t.Run("DefaultValue", func(t *testing.T) { - // Test that default value is set properly - if vString, err := cmdFlags.GetString("ResourcesConfig.primaryLabel"); err == nil { - assert.Equal(t, string(defaultConfig.ResourcesConfig.PrimaryLabel), vString) - } else { - assert.FailNow(t, err.Error()) - } - }) - - t.Run("Override", func(t *testing.T) { - testValue := "1" - - cmdFlags.Set("ResourcesConfig.primaryLabel", testValue) - if vString, err := cmdFlags.GetString("ResourcesConfig.primaryLabel"); err == nil 
{ - testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.ResourcesConfig.PrimaryLabel) - - } else { - assert.FailNow(t, err.Error()) - } - }) - }) - t.Run("Test_ResourcesConfig.limit", func(t *testing.T) { - t.Run("DefaultValue", func(t *testing.T) { - // Test that default value is set properly - if vInt, err := cmdFlags.GetInt("ResourcesConfig.limit"); err == nil { - assert.Equal(t, int(defaultConfig.ResourcesConfig.Limit), vInt) - } else { - assert.FailNow(t, err.Error()) - } - }) - - t.Run("Override", func(t *testing.T) { - testValue := "1" - - cmdFlags.Set("ResourcesConfig.limit", testValue) - if vInt, err := cmdFlags.GetInt("ResourcesConfig.limit"); err == nil { - testDecodeJson_Config(t, fmt.Sprintf("%v", vInt), &actual.ResourcesConfig.Limit) - - } else { - assert.FailNow(t, err.Error()) - } - }) - }) } diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 5471a9537..94971b7a6 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -4,17 +4,11 @@ import ( "context" "fmt" "strconv" - "strings" "time" - "github.com/lyft/flyteplugins/go/tasks/errors" - "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/utils" - "github.com/lyft/flytestdlib/logger" "github.com/lyft/flytestdlib/storage" - "github.com/lyft/flyteplugins/go/tasks/plugins/array" - arrayCore "github.com/lyft/flyteplugins/go/tasks/plugins/array/core" "github.com/lyft/flytestdlib/bitarray" @@ -22,7 +16,6 @@ import ( "github.com/lyft/flyteplugins/go/tasks/plugins/array/arraystatus" "github.com/lyft/flyteplugins/go/tasks/plugins/array/errorcollector" - corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8sTypes "k8s.io/apimachinery/pkg/types" @@ -44,11 +37,10 @@ const ( func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, dataStore *storage.DataStore, outputPrefix, baseOutputDataSandbox storage.DataReference, currentState *arrayCore.State) ( newState *arrayCore.State, logLinks []*idlCore.TaskLog, err error) { - logLinks = make([]*idlCore.TaskLog, 0, 4) newState = currentState msg := errorcollector.NewErrorMessageCollector() - newArrayStatus := arraystatus.ArrayStatus{ + newArrayStatus := &arraystatus.ArrayStatus{ Summary: arraystatus.ArraySummary{}, Detailed: arrayCore.NewPhasesCompactArray(uint(currentState.GetExecutionArraySize())), } @@ -63,18 +55,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon // If we have arrived at this state for the first time then currentState has not been // initialized with number of sub tasks. if len(currentState.GetArrayStatus().Detailed.GetItems()) == 0 { - currentState.ArrayStatus = newArrayStatus - } - - podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx) - if err != nil { - return newState, logLinks, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for task") - } - - var args []string - if len(podTemplate.Spec.Containers) > 0 { - args = append(podTemplate.Spec.Containers[0].Command, podTemplate.Spec.Containers[0].Args...) 
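
// (Removed here and in the hunks that follow: the inline pod construction, submission, and status
// checks move, largely unchanged, into Task.Launch and Task.Monitor in the new task.go, leaving
// LaunchAndCheckSubTasksState to orchestrate per-child results only.)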
- podTemplate.Spec.Containers[0].Command = []string{} + currentState.ArrayStatus = *newArrayStatus } for childIdx, existingPhaseIdx := range currentState.GetArrayStatus().Detailed.GetItems() { @@ -85,11 +66,11 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon // If we get here it means we have already "processed" this terminal phase since we will only persist // the phase after all processing is done (e.g. check outputs/errors file, record events... etc.). - // Deallocate Resource - err = tCtx.ResourceManager().ReleaseResource(ctx, core.ResourceNamespace(ResourcesPrimaryLabel), podName) + // Since we know we have already "processed" this terminal phase we can safely deallocate resource + err = deallocateResource(ctx, tCtx, config, childIdx) if err != nil { - logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) - return newState, logLinks, errors2.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") + logger.Errorf(ctx, "Error releasing allocation token [%s] in LaunchAndCheckSubTasks [%s]", podName, err) + return currentState, logLinks, errors2.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") } newArrayStatus.Summary.Inc(existingPhase) newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(existingPhase)) @@ -98,81 +79,44 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon continue } - pod := podTemplate.DeepCopy() - pod.Name = podName - pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, corev1.EnvVar{ - Name: FlyteK8sArrayIndexVarName, - Value: indexStr, - }) - - pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, arrayJobEnvVars...) - - pod.Spec.Containers[0].Args, err = utils.ReplaceTemplateCommandArgs(ctx, args, arrayJobInputReader{tCtx.InputReader()}, tCtx.OutputWriter()) - if err != nil { - return newState, logLinks, errors2.Wrapf(ErrReplaceCmdTemplate, err, "Failed to replace cmd args") + task := &Task{ + LogLinks: logLinks, + State: newState, + NewArrayStatus: newArrayStatus, + Config: config, + ChildIdx: childIdx, + MessageCollector: &msg, } - pod = ApplyPodPolicies(ctx, config, pod) - - success, err := allocateResource(ctx, tCtx, config, podName, childIdx, &newArrayStatus) + // The first time we enter this state we will launch every subtask. On subsequent rounds, the pod + // has already been created so we return a Success value and continue with the Monitor step. + var status TaskStatus + status, err = task.Launch(ctx, tCtx, kubeClient) if err != nil { - return newState, logLinks, errors2.Wrapf(errors.ResourceManagerFailure, err, "Error requesting allocation token %s", podName) + return currentState, logLinks, err } - if !success { + switch status { + case Success: + // Continue with execution if successful + case Error: + return currentState, logLinks, err + // If Resource manager is enabled and there are currently not enough resources we can skip this round + // for a subtask and wait until there are enough resources. + case Skip: continue + case ReturnState: + return currentState, logLinks, nil } - err = kubeClient.GetClient().Create(ctx, pod) - if err != nil && !k8serrors.IsAlreadyExists(err) { - if k8serrors.IsForbidden(err) { - if strings.Contains(err.Error(), "exceeded quota") { - // TODO: Quota errors are retried forever, it would be good to have support for backoff strategy. - logger.Infof(ctx, "Failed to launch job, resource quota exceeded. 
Err: %v", err) - newState = newState.SetPhase(arrayCore.PhaseWaitingForResources, 0).SetReason("Not enough resources to launch job.") - } else { - newState = newState.SetPhase(arrayCore.PhaseRetryableFailure, 0).SetReason("Failed to launch job.") - } - - newState = newState.SetReason(err.Error()) - return newState, logLinks, nil - } - - return newState, logLinks, errors2.Wrapf(ErrSubmitJob, err, "Failed to submit job") - } - - phaseInfo, err := CheckPodStatus(ctx, kubeClient, - k8sTypes.NamespacedName{ - Name: podName, - Namespace: tCtx.TaskExecutionMetadata().GetNamespace(), - }) - if err != nil { - return currentState, logLinks, errors2.Wrapf(ErrCheckPodStatus, err, "Failed to check pod status") + status, err = task.Monitor(ctx, tCtx, kubeClient, dataStore, outputPrefix, baseOutputDataSandbox) + if status != Success { + return currentState, logLinks, err } - if phaseInfo.Info() != nil { - logLinks = append(logLinks, phaseInfo.Info().Logs...) - } - - if phaseInfo.Err() != nil { - msg.Collect(childIdx, phaseInfo.Err().String()) - } - - actualPhase := phaseInfo.Phase() - if phaseInfo.Phase().IsSuccess() { - originalIdx := arrayCore.CalculateOriginalIndex(childIdx, currentState.GetIndexesToCache()) - actualPhase, err = array.CheckTaskOutput(ctx, dataStore, outputPrefix, baseOutputDataSandbox, childIdx, originalIdx) - if err != nil { - return nil, nil, err - } - } - - newArrayStatus.Detailed.SetItem(childIdx, bitarray.Item(actualPhase)) - newArrayStatus.Summary.Inc(actualPhase) - } - newState = newState.SetArrayStatus(newArrayStatus) + newState = newState.SetArrayStatus(*newArrayStatus) // Check that the taskTemplate is valid taskTemplate, err := tCtx.TaskReader().Read(ctx) @@ -262,46 +206,3 @@ func CheckPodStatus(ctx context.Context, client core.KubeClient, name k8sTypes.N return core.PhaseInfoRunning(core.DefaultPhaseVersion, &taskInfo), nil } - -func createResourceConstraintsSpec(ctx context.Context, _ core.TaskExecutionContext) core.ResourceConstraintsSpec { - constraintsSpec := core.ResourceConstraintsSpec{ - ProjectScopeResourceConstraint: nil, - NamespaceScopeResourceConstraint: nil, - } - - if !IsResourceConfigSet() { - logger.Infof(ctx, "No Resource config is found. Returning an empty resource constraints spec") - return constraintsSpec - } - - constraintsSpec.ProjectScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} - constraintsSpec.NamespaceScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} - logger.Infof(ctx, "Created a resource constraints spec: [%v]", constraintsSpec) - - return constraintsSpec -} - -func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string, childIdx int, arrayStatus *arraystatus.ArrayStatus) (bool, error) { - if !IsResourceConfigSet() { - return true, nil - } - - resourceNamespace := core.ResourceNamespace(config.ResourceConfig.PrimaryLabel) - resourceConstraintSpec := createResourceConstraintsSpec(ctx, tCtx) - - allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, resourceConstraintSpec) - if err != nil { - logger.Errorf(ctx, "Resource manager failed for TaskExecId [%s] token [%s]. 
error %s", - tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) - return false, err - } - - if allocationStatus != core.AllocationStatusGranted { - arrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseWaitingForResources)) - arrayStatus.Summary.Inc(core.PhaseWaitingForResources) - return false, nil - } - - logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) - return true, nil -} diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index 67735d605..f7a57ba2b 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -8,30 +8,39 @@ import ( idlCore "github.com/lyft/flyteidl/gen/pb-go/flyteidl/core" "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/core" "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/utils" + "github.com/lyft/flyteplugins/go/tasks/plugins/array" + "github.com/lyft/flyteplugins/go/tasks/plugins/array/arraystatus" arrayCore "github.com/lyft/flyteplugins/go/tasks/plugins/array/core" + "github.com/lyft/flyteplugins/go/tasks/plugins/array/errorcollector" + "github.com/lyft/flytestdlib/bitarray" errors2 "github.com/lyft/flytestdlib/errors" "github.com/lyft/flytestdlib/logger" + "github.com/lyft/flytestdlib/storage" corev1 "k8s.io/api/core/v1" k8serrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sTypes "k8s.io/apimachinery/pkg/types" ) type Task struct { - LogLinks *[]idlCore.TaskLog - State *arrayCore.State - Config *Config - ChildIdx int + LogLinks []*idlCore.TaskLog + State *arrayCore.State + NewArrayStatus *arraystatus.ArrayStatus + Config *Config + ChildIdx int + MessageCollector *errorcollector.ErrorMessageCollector } -type TaskLaunchStatus int8 +type TaskStatus int8 const ( - Success TaskLaunchStatus = iota + Success TaskStatus = iota Error Skip ReturnState ) -func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (TaskLaunchStatus, error) { +func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (TaskStatus, error) { newState := t.State podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx) if err != nil { @@ -62,7 +71,7 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl pod = ApplyPodPolicies(ctx, t.Config, pod) - succeeded, err := allocateResource(ctx, tCtx, t.Config, podName, t.ChildIdx, &newState.ArrayStatus) + succeeded, err := allocateResource(ctx, tCtx, t.Config, podName, t.ChildIdx, t.NewArrayStatus) if err != nil { return Error, err } @@ -77,12 +86,12 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl if strings.Contains(err.Error(), "exceeded quota") { // TODO: Quota errors are retried forever, it would be good to have support for backoff strategy. logger.Infof(ctx, "Failed to launch job, resource quota exceeded. 
Err: %v", err) - newState = newState.SetPhase(arrayCore.PhaseWaitingForResources, 0).SetReason("Not enough resources to launch job") + t.State = t.State.SetPhase(arrayCore.PhaseWaitingForResources, 0).SetReason("Not enough resources to launch job") } else { - newState = newState.SetPhase(arrayCore.PhaseRetryableFailure, 0).SetReason("Failed to launch job.") + t.State = t.State.SetPhase(arrayCore.PhaseRetryableFailure, 0).SetReason("Failed to launch job.") } - newState = newState.SetReason(err.Error()) + t.State = t.State.SetReason(err.Error()) return ReturnState, nil } @@ -91,3 +100,135 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl return Success, nil } + +func (t Task) Monitor(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, dataStore *storage.DataStore, outputPrefix, baseOutputDataSandbox storage.DataReference) (TaskStatus, error) { + indexStr := strconv.Itoa(t.ChildIdx) + podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) + phaseInfo, err := CheckPodStatus(ctx, kubeClient, + k8sTypes.NamespacedName{ + Name: podName, + Namespace: tCtx.TaskExecutionMetadata().GetNamespace(), + }) + if err != nil { + return Error, errors2.Wrapf(ErrCheckPodStatus, err, "Failed to check pod status.") + } + + if phaseInfo.Info() != nil { + t.LogLinks = append(t.LogLinks, phaseInfo.Info().Logs...) + } + + if phaseInfo.Err() != nil { + t.MessageCollector.Collect(t.ChildIdx, phaseInfo.Err().String()) + } + + actualPhase := phaseInfo.Phase() + if phaseInfo.Phase().IsSuccess() { + originalIdx := arrayCore.CalculateOriginalIndex(t.ChildIdx, t.State.GetIndexesToCache()) + actualPhase, err = array.CheckTaskOutput(ctx, dataStore, outputPrefix, baseOutputDataSandbox, t.ChildIdx, originalIdx) + } + + t.NewArrayStatus.Detailed.SetItem(t.ChildIdx, bitarray.Item(actualPhase)) + t.NewArrayStatus.Summary.Inc(actualPhase) + + return Success, nil +} + +func (t Task) Abort() {} + +func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (TaskStatus, error) { + indexStr := strconv.Itoa(t.ChildIdx) + podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) + pod := &corev1.Pod{ + TypeMeta: metav1.TypeMeta{ + Kind: PodKind, + APIVersion: metav1.SchemeGroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: tCtx.TaskExecutionMetadata().GetNamespace(), + }, + } + + err := kubeClient.GetClient().Delete(ctx, pod) + if err != nil { + if k8serrors.IsNotFound(err) { + return Success, nil + } + + return Error, err + } + + if !IsResourceConfigSet() { + return Success, nil + } + + // Deallocate Resouce + err = deallocateResource(ctx, tCtx, t.Config, t.ChildIdx) + if err != nil { + logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) + return Error, err + } + + return Success, nil + +} + +func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string, childIdx int, arrayStatus *arraystatus.ArrayStatus) (bool, error) { + if !IsResourceConfigSet() { + return true, nil + } + + resourceNamespace := core.ResourceNamespace(config.ResourceConfig.PrimaryLabel) + resourceConstraintSpec := createResourceConstraintsSpec(ctx, tCtx) + + allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, resourceConstraintSpec) + if err != nil { + logger.Errorf(ctx, "Resource 
manager failed for TaskExecId [%s] token [%s]. error %s", + tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) + return false, err + } + + if allocationStatus != core.AllocationStatusGranted { + arrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseWaitingForResources)) + arrayStatus.Summary.Inc(core.PhaseWaitingForResources) + return false, nil + } + + logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) + return true, nil +} + +func deallocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, childIdx int) error { + if !IsResourceConfigSet() { + return nil + } + indexStr := strconv.Itoa((childIdx)) + podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) + resourceNamespace := core.ResourceNamespace(config.ResourceConfig.PrimaryLabel) + + err := tCtx.ResourceManager().ReleaseResource(ctx, resourceNamespace, podName) + if err != nil { + logger.Errorf(ctx, "Error releasing token [%s]. error %s", podName, err) + return err + } + + return nil +} + +func createResourceConstraintsSpec(ctx context.Context, _ core.TaskExecutionContext) core.ResourceConstraintsSpec { + constraintsSpec := core.ResourceConstraintsSpec{ + ProjectScopeResourceConstraint: nil, + NamespaceScopeResourceConstraint: nil, + } + + if !IsResourceConfigSet() { + logger.Infof(ctx, "No Resource config is found. Returning an empty resource constraints spec") + return constraintsSpec + } + + constraintsSpec.ProjectScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} + constraintsSpec.NamespaceScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} + logger.Infof(ctx, "Created a resource constraints spec: [%v]", constraintsSpec) + + return constraintsSpec +} From a72846d36f8e068d7c4378d1d7f597eae924f194 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Mon, 6 Apr 2020 16:07:00 -0700 Subject: [PATCH 18/39] use task in finalize --- go/tasks/plugins/array/k8s/executor.go | 3 +- go/tasks/plugins/array/k8s/launcher.go | 41 +++++++------------------- go/tasks/plugins/array/k8s/task.go | 15 ++++------ 3 files changed, 16 insertions(+), 43 deletions(-) diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index a77680ed0..da6a4541f 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -133,8 +133,7 @@ func (e Executor) Finalize(ctx context.Context, tCtx core.TaskExecutionContext) return errors.Wrapf(errors.CorruptedPluginState, err, "Failed to read unmarshal custom state") } - return TerminateSubTasks(ctx, tCtx, e.kubeClient, pluginConfig.MaxErrorStringLength, - pluginState) + return TerminateSubTasks(ctx, tCtx, e.kubeClient, pluginConfig, pluginState) } func (e Executor) Start(ctx context.Context) error { diff --git a/go/tasks/plugins/array/k8s/launcher.go b/go/tasks/plugins/array/k8s/launcher.go index 1264a479f..9ea38950c 100644 --- a/go/tasks/plugins/array/k8s/launcher.go +++ b/go/tasks/plugins/array/k8s/launcher.go @@ -9,8 +9,6 @@ import ( "github.com/lyft/flyteplugins/go/tasks/errors" "github.com/lyft/flyteplugins/go/tasks/plugins/array/errorcollector" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - arrayCore "github.com/lyft/flyteplugins/go/tasks/plugins/array/core" arraystatus2 "github.com/lyft/flyteplugins/go/tasks/plugins/array/arraystatus" @@ -138,44 +136,25 @@ func LaunchSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeCli return currentState, nil } 
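
// The rewrite below delegates per-child cleanup to Task.Finalize, which owns both pod deletion and
// allocation-token release, so TerminateSubTasks no longer builds corev1.Pod objects itself. Cleanup
// stays best-effort: a failure for one child is collected and the loop continues, and the combined
// message is truncated to MaxErrorStringLength before being surfaced.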
-func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, - errsMaxLength int, currentState *arrayCore.State) error { +func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, + currentState *arrayCore.State) error { size := currentState.GetExecutionArraySize() errs := errorcollector.NewErrorMessageCollector() - for i := 0; i < size; i++ { - indexStr := strconv.Itoa(i) - podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) - pod := &corev1.Pod{ - TypeMeta: metav1.TypeMeta{ - Kind: PodKind, - APIVersion: metav1.SchemeGroupVersion.String(), - }, - ObjectMeta: metav1.ObjectMeta{ - Name: podName, - Namespace: tCtx.TaskExecutionMetadata().GetNamespace(), - }, + for childIdx := 0; childIdx < size; childIdx++ { + task := Task{ + ChildIdx: childIdx, + Config: config, + State: currentState, } - - err := kubeClient.GetClient().Delete(ctx, pod) - if err != nil { - if k8serrors.IsNotFound(err) { - continue - } - - errs.Collect(i, err.Error()) - } - - // Deallocate Resource - err = tCtx.ResourceManager().ReleaseResource(ctx, core.ResourceNamespace(ResourcesPrimaryLabel), podName) + err := task.Finalize(ctx, tCtx, kubeClient) if err != nil { - logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) - errs.Collect(i, err.Error()) + errs.Collect(childIdx, err.Error()) } } if errs.Length() > 0 { - return fmt.Errorf(errs.Summary(errsMaxLength)) + return fmt.Errorf(errs.Summary(config.MaxErrorStringLength)) } return nil diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index f7a57ba2b..88be36995 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -41,7 +41,6 @@ const ( ) func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (TaskStatus, error) { - newState := t.State podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx) if err != nil { return Error, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for a task") @@ -135,7 +134,7 @@ func (t Task) Monitor(ctx context.Context, tCtx core.TaskExecutionContext, kubeC func (t Task) Abort() {} -func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (TaskStatus, error) { +func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) error { indexStr := strconv.Itoa(t.ChildIdx) podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) pod := &corev1.Pod{ @@ -152,24 +151,20 @@ func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kube err := kubeClient.GetClient().Delete(ctx, pod) if err != nil { if k8serrors.IsNotFound(err) { - return Success, nil + return nil } - return Error, err - } - - if !IsResourceConfigSet() { - return Success, nil + return err } // Deallocate Resouce err = deallocateResource(ctx, tCtx, t.Config, t.ChildIdx) if err != nil { logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) - return Error, err + return err } - return Success, nil + return nil } From 94f695dafe1f36bfc02957f3b637028adef1a617 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Mon, 6 Apr 2020 16:10:44 -0700 Subject: [PATCH 19/39] upd comment --- go/tasks/plugins/array/k8s/executor.go | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index da6a4541f..0de354164 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -78,7 +78,7 @@ func (e Executor) Handle(ctx context.Context, tCtx core.TaskExecutionContext) (c fallthrough case arrayCore.PhaseLaunch: - // This is a hack. In order to maintain backwards compatibility with the state transitions + // In order to maintain backwards compatibility with the state transitions // in the aws batch plugin. Forward to PhaseCheckingSubTasksExecutions where the launching // is actually occurring. nextState = pluginState.SetPhase(arrayCore.PhaseCheckingSubTaskExecutions, core.DefaultPhaseVersion).SetReason("Nothing to do in Launch phase.") From 98409ba508eacb19a308592d965389247091a1c7 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Mon, 6 Apr 2020 16:17:55 -0700 Subject: [PATCH 20/39] upd Skip to Waiting --- go/tasks/plugins/array/k8s/monitor.go | 2 +- go/tasks/plugins/array/k8s/task.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 94971b7a6..82b7b790b 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -103,7 +103,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon return currentState, logLinks, err // If Resource manager is enabled and there are currently not enough resources we can skip this round // for a subtask and wait until there are enough resources. - case Skip: + case Waiting: continue case ReturnState: return currentState, logLinks, nil diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index 88be36995..e10daff30 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -36,7 +36,7 @@ type TaskStatus int8 const ( Success TaskStatus = iota Error - Skip + Waiting ReturnState ) @@ -76,7 +76,7 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl } if !succeeded { - return Skip, nil + return Waiting, nil } err = kubeClient.GetClient().Create(ctx, pod) From 08b75e7c5221c769581adb7c5404db669b18fed1 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Mon, 6 Apr 2020 16:28:20 -0700 Subject: [PATCH 21/39] lint and err --- go/tasks/plugins/array/k8s/task.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index e10daff30..d7ab41300 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -124,6 +124,9 @@ func (t Task) Monitor(ctx context.Context, tCtx core.TaskExecutionContext, kubeC if phaseInfo.Phase().IsSuccess() { originalIdx := arrayCore.CalculateOriginalIndex(t.ChildIdx, t.State.GetIndexesToCache()) actualPhase, err = array.CheckTaskOutput(ctx, dataStore, outputPrefix, baseOutputDataSandbox, t.ChildIdx, originalIdx) + if err != nil { + return Error, err + } } t.NewArrayStatus.Detailed.SetItem(t.ChildIdx, bitarray.Item(actualPhase)) @@ -157,7 +160,7 @@ func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kube return err } - // Deallocate Resouce + // Deallocate Resource err = deallocateResource(ctx, tCtx, t.Config, t.ChildIdx) if err != nil { logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) From 
ba2a74d46c0dc5cca05e68c175b496960a2d56b3 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Tue, 14 Apr 2020 15:23:17 -0700 Subject: [PATCH 22/39] upd based on pr feedback --- go/tasks/plugins/array/core/state.go | 2 +- go/tasks/plugins/array/k8s/monitor.go | 14 +++++++------- go/tasks/plugins/array/k8s/task.go | 6 ++++++ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/go/tasks/plugins/array/core/state.go b/go/tasks/plugins/array/core/state.go index 1453c0e79..8cf6d47e4 100644 --- a/go/tasks/plugins/array/core/state.go +++ b/go/tasks/plugins/array/core/state.go @@ -248,7 +248,7 @@ func SummaryToPhase(ctx context.Context, minSuccesses int64, summary arraystatus if totalWaitingForResources > 0 { logger.Infof(ctx, "Array is still running and waiting for resources totalWaitingForResources[%v]", totalWaitingForResources) - return PhaseCheckingSubTaskExecutions + return PhaseWaitingForResources } if totalCount < minSuccesses { diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 82b7b790b..e1fdf5179 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -37,6 +37,13 @@ const ( func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, config *Config, dataStore *storage.DataStore, outputPrefix, baseOutputDataSandbox storage.DataReference, currentState *arrayCore.State) ( newState *arrayCore.State, logLinks []*idlCore.TaskLog, err error) { + if int64(currentState.GetExecutionArraySize()) > config.MaxArrayJobSize { + ee := fmt.Errorf("array size > max allowed. Requested [%v]. Allowed [%v]", currentState.GetExecutionArraySize(), config.MaxArrayJobSize) + logger.Info(ctx, ee) + currentState = currentState.SetPhase(arrayCore.PhasePermanentFailure, 0).SetReason(ee.Error()) + return currentState, logLinks, nil + } + logLinks = make([]*idlCore.TaskLog, 0, 4) newState = currentState msg := errorcollector.NewErrorMessageCollector() @@ -45,13 +52,6 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon Detailed: arrayCore.NewPhasesCompactArray(uint(currentState.GetExecutionArraySize())), } - if int64(currentState.GetExecutionArraySize()) > config.MaxArrayJobSize { - ee := fmt.Errorf("array size > max allowed. Requested [%v]. Allowed [%v]", currentState.GetExecutionArraySize(), config.MaxArrayJobSize) - logger.Info(ctx, ee) - currentState = currentState.SetPhase(arrayCore.PhasePermanentFailure, 0).SetReason(ee.Error()) - return currentState, logLinks, nil - } - // If we have arrived at this state for the first time then currentState has not been // initialized with number of sub tasks. 
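
// (With the max-array-size guard hoisted to the top of this function, an over-sized request now
// fails permanently before any pods are built or resource-manager tokens requested; and with the
// SummaryToPhase change above, exhausted tokens surface as PhaseWaitingForResources rather than
// re-entering PhaseCheckingSubTaskExecutions.)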
if len(currentState.GetArrayStatus().Detailed.GetItems()) == 0 { diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index d7ab41300..2e840f8aa 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -154,6 +154,12 @@ func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kube err := kubeClient.GetClient().Delete(ctx, pod) if err != nil { if k8serrors.IsNotFound(err) { + // Deallocate Resource + err = deallocateResource(ctx, tCtx, t.Config, t.ChildIdx) + if err != nil { + logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) + return err + } return nil } From dd5db901cba908a6be5964554c1290182a468592 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Tue, 14 Apr 2020 17:00:51 -0700 Subject: [PATCH 23/39] upd allocate to return status, throw error on no containers --- go/tasks/plugins/array/k8s/task.go | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index 2e840f8aa..60b0276a1 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -50,6 +50,8 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl if len(podTemplate.Spec.Containers) > 0 { args = append(podTemplate.Spec.Containers[0].Command, podTemplate.Spec.Containers[0].Args...) podTemplate.Spec.Containers[0].Command = []string{} + } else { + return Error, errors2.Wrapf(ErrReplaceCmdTemplate, err, "No containers found in podSpec.") } indexStr := strconv.Itoa(t.ChildIdx) @@ -70,12 +72,13 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl pod = ApplyPodPolicies(ctx, t.Config, pod) - succeeded, err := allocateResource(ctx, tCtx, t.Config, podName, t.ChildIdx, t.NewArrayStatus) + allocationStatus, err := allocateResource(ctx, tCtx, t.Config, podName, t.ChildIdx, t.NewArrayStatus) if err != nil { return Error, err } - - if !succeeded { + if allocationStatus != core.AllocationStatusGranted { + t.NewArrayStatus.Detailed.SetItem(t.ChildIdx, bitarray.Item(core.PhaseWaitingForResources)) + t.NewArrayStatus.Summary.Inc(core.PhaseWaitingForResources) return Waiting, nil } @@ -177,9 +180,9 @@ func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kube } -func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string, childIdx int, arrayStatus *arraystatus.ArrayStatus) (bool, error) { +func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string, childIdx int, arrayStatus *arraystatus.ArrayStatus) (core.AllocationStatus, error) { if !IsResourceConfigSet() { - return true, nil + return core.AllocationStatusGranted, nil } resourceNamespace := core.ResourceNamespace(config.ResourceConfig.PrimaryLabel) @@ -189,17 +192,11 @@ func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, confi if err != nil { logger.Errorf(ctx, "Resource manager failed for TaskExecId [%s] token [%s]. 
error %s", tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetID(), podName, err) - return false, err - } - - if allocationStatus != core.AllocationStatusGranted { - arrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseWaitingForResources)) - arrayStatus.Summary.Inc(core.PhaseWaitingForResources) - return false, nil + return core.AllocationUndefined, err } logger.Infof(ctx, "Allocation result for [%s] is [%s]", podName, allocationStatus) - return true, nil + return allocationStatus, nil } func deallocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, childIdx int) error { From edd233e62fe47557576934a80bfa2152f833aa9c Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Tue, 14 Apr 2020 17:38:38 -0700 Subject: [PATCH 24/39] lint --- go/tasks/plugins/array/k8s/task.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index 07b2c4954..5e0a34c10 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -74,7 +74,7 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl pod = applyNodeSelectorLabels(ctx, t.Config, pod) pod = applyPodTolerations(ctx, t.Config, pod) - allocationStatus, err := allocateResource(ctx, tCtx, t.Config, podName, t.ChildIdx, t.NewArrayStatus) + allocationStatus, err := allocateResource(ctx, tCtx, t.Config, podName) if err != nil { return Error, err } @@ -182,7 +182,7 @@ func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kube } -func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string, childIdx int, arrayStatus *arraystatus.ArrayStatus) (core.AllocationStatus, error) { +func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string) (core.AllocationStatus, error) { if !IsResourceConfigSet() { return core.AllocationStatusGranted, nil } From 5f5d1a9699cc35c69e7e7c60aa4f969b75cc88fe Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Thu, 16 Apr 2020 15:02:17 -0700 Subject: [PATCH 25/39] upd pflag, launch and monitor status --- go/tasks/plugins/array/k8s/config.go | 2 +- go/tasks/plugins/array/k8s/config_flags.go | 2 + .../plugins/array/k8s/config_flags_test.go | 44 +++++++++++++ go/tasks/plugins/array/k8s/launcher.go | 4 ++ go/tasks/plugins/array/k8s/monitor.go | 19 +++--- go/tasks/plugins/array/k8s/task.go | 61 ++++++++++++------- 6 files changed, 101 insertions(+), 31 deletions(-) diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index abcf74c54..c7e5d515c 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -44,7 +44,7 @@ type Config struct { DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` MaxErrorStringLength int `json:"maxErrLength" pflag:",Determines the maximum length of the error string returned for the array."` MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` - ResourceConfig ResourceConfig `json:"resourceConfig" pflag:"-,ResourceConfiguration to limit number of resources used by k8s-array."` + ResourceConfig ResourceConfig `json:"resourceConfig" pflag:",ResourceConfiguration to limit number of resources used by k8s-array."` NodeSelector map[string]string `json:"node-selector" pflag:"-,Defines a set of node selector labels to add to the pod."` Tolerations []v1.Toleration 
`json:"tolerations" pflag:"-,Tolerations to be applied for k8s-array pods"` OutputAssembler workqueue.Config diff --git a/go/tasks/plugins/array/k8s/config_flags.go b/go/tasks/plugins/array/k8s/config_flags.go index 4a03aefc9..7bb1404de 100755 --- a/go/tasks/plugins/array/k8s/config_flags.go +++ b/go/tasks/plugins/array/k8s/config_flags.go @@ -44,6 +44,8 @@ func (cfg Config) GetPFlagSet(prefix string) *pflag.FlagSet { cmdFlags.String(fmt.Sprintf("%v%v", prefix, "scheduler"), defaultConfig.DefaultScheduler, "Decides the scheduler to use when launching array-pods.") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "maxErrLength"), defaultConfig.MaxErrorStringLength, "Determines the maximum length of the error string returned for the array.") cmdFlags.Int64(fmt.Sprintf("%v%v", prefix, "maxArrayJobSize"), defaultConfig.MaxArrayJobSize, "Maximum size of array job.") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "resourceConfig.primaryLabel"), defaultConfig.ResourceConfig.PrimaryLabel, "PrimaryLabel of a given service cluster") + cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "resourceConfig.limit"), defaultConfig.ResourceConfig.Limit, "Resource quota (in the number of outstanding requests) for the cluster") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "OutputAssembler.workers"), defaultConfig.OutputAssembler.Workers, "Number of concurrent workers to start processing the queue.") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "OutputAssembler.maxRetries"), defaultConfig.OutputAssembler.MaxRetries, "Maximum number of retries per item.") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "OutputAssembler.maxItems"), defaultConfig.OutputAssembler.IndexCacheMaxItems, "Maximum number of entries to keep in the index.") diff --git a/go/tasks/plugins/array/k8s/config_flags_test.go b/go/tasks/plugins/array/k8s/config_flags_test.go index df7b41909..d0ffbdb49 100755 --- a/go/tasks/plugins/array/k8s/config_flags_test.go +++ b/go/tasks/plugins/array/k8s/config_flags_test.go @@ -165,6 +165,50 @@ func TestConfig_SetFlags(t *testing.T) { } }) }) + t.Run("Test_resourceConfig.primaryLabel", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("resourceConfig.primaryLabel"); err == nil { + assert.Equal(t, string(defaultConfig.ResourceConfig.PrimaryLabel), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("resourceConfig.primaryLabel", testValue) + if vString, err := cmdFlags.GetString("resourceConfig.primaryLabel"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.ResourceConfig.PrimaryLabel) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_resourceConfig.limit", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vInt, err := cmdFlags.GetInt("resourceConfig.limit"); err == nil { + assert.Equal(t, int(defaultConfig.ResourceConfig.Limit), vInt) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("resourceConfig.limit", testValue) + if vInt, err := cmdFlags.GetInt("resourceConfig.limit"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vInt), &actual.ResourceConfig.Limit) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) t.Run("Test_OutputAssembler.workers", func(t *testing.T) { t.Run("DefaultValue", func(t *testing.T) { // Test that default value is 
set properly diff --git a/go/tasks/plugins/array/k8s/launcher.go b/go/tasks/plugins/array/k8s/launcher.go index 919b956ee..5af0a6965 100644 --- a/go/tasks/plugins/array/k8s/launcher.go +++ b/go/tasks/plugins/array/k8s/launcher.go @@ -74,6 +74,10 @@ func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kube if err != nil { errs.Collect(childIdx, err.Error()) } + err = task.Abort(ctx, tCtx, kubeClient) + if err != nil { + errs.Collect(childIdx, err.Error()) + } } if errs.Length() > 0 { diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index e1fdf5179..38ace640c 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -90,27 +90,28 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon // The first time we enter this state we will launch every subtask. On subsequent rounds, the pod // has already been created so we return a Success value and continue with the Monitor step. - var status TaskStatus - status, err = task.Launch(ctx, tCtx, kubeClient) + var launchResult LaunchResult + launchResult, err = task.Launch(ctx, tCtx, kubeClient) if err != nil { return currentState, logLinks, err } - switch status { - case Success: + switch launchResult { + case LaunchSuccess: // Continue with execution if successful - case Error: + case LaunchError: return currentState, logLinks, err // If Resource manager is enabled and there are currently not enough resources we can skip this round // for a subtask and wait until there are enough resources. - case Waiting: + case LaunchWaiting: continue - case ReturnState: + case LaunchReturnState: return currentState, logLinks, nil } - status, err = task.Monitor(ctx, tCtx, kubeClient, dataStore, outputPrefix, baseOutputDataSandbox) - if status != Success { + var monitorResult MonitorResult + monitorResult, err = task.Monitor(ctx, tCtx, kubeClient, dataStore, outputPrefix, baseOutputDataSandbox) + if monitorResult != MonitorSuccess { return currentState, logLinks, err } diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index 5e0a34c10..c52610869 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -31,19 +31,25 @@ type Task struct { MessageCollector *errorcollector.ErrorMessageCollector } -type TaskStatus int8 +type LaunchResult int8 +type MonitorResult int8 const ( - Success TaskStatus = iota - Error - Waiting - ReturnState + LaunchSuccess LaunchResult = iota + LaunchError + LaunchWaiting + LaunchReturnState ) -func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (TaskStatus, error) { +const ( + MonitorSuccess MonitorResult = iota + MonitorError +) + +func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (LaunchResult, error) { podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx) if err != nil { - return Error, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for a task") + return LaunchError, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for a task") } var args []string @@ -51,7 +57,7 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl args = append(podTemplate.Spec.Containers[0].Command, podTemplate.Spec.Containers[0].Args...) 
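
// Command and Args are concatenated here so that ReplaceTemplateCommandArgs below can substitute
// input/output placeholders across the whole invocation in a single pass; Command is then cleared
// and the templated result is written back to Args on the per-subtask pod.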
podTemplate.Spec.Containers[0].Command = []string{} } else { - return Error, errors2.Wrapf(ErrReplaceCmdTemplate, err, "No containers found in podSpec.") + return LaunchError, errors2.Wrapf(ErrReplaceCmdTemplate, err, "No containers found in podSpec.") } indexStr := strconv.Itoa(t.ChildIdx) @@ -67,7 +73,7 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl pod.Spec.Containers[0].Env = append(pod.Spec.Containers[0].Env, arrayJobEnvVars...) pod.Spec.Containers[0].Args, err = utils.ReplaceTemplateCommandArgs(ctx, args, arrayJobInputReader{tCtx.InputReader()}, tCtx.OutputWriter()) if err != nil { - return Error, errors2.Wrapf(ErrReplaceCmdTemplate, err, "Failed to replace cmd args") + return LaunchError, errors2.Wrapf(ErrReplaceCmdTemplate, err, "Failed to replace cmd args") } pod = ApplyPodPolicies(ctx, t.Config, pod) @@ -76,12 +82,12 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl allocationStatus, err := allocateResource(ctx, tCtx, t.Config, podName) if err != nil { - return Error, err + return LaunchError, err } if allocationStatus != core.AllocationStatusGranted { t.NewArrayStatus.Detailed.SetItem(t.ChildIdx, bitarray.Item(core.PhaseWaitingForResources)) t.NewArrayStatus.Summary.Inc(core.PhaseWaitingForResources) - return Waiting, nil + return LaunchWaiting, nil } err = kubeClient.GetClient().Create(ctx, pod) @@ -96,16 +102,16 @@ func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeCl } t.State = t.State.SetReason(err.Error()) - return ReturnState, nil + return LaunchReturnState, nil } - return Error, errors2.Wrapf(ErrSubmitJob, err, "Failed to submit job.") + return LaunchError, errors2.Wrapf(ErrSubmitJob, err, "Failed to submit job.") } - return Success, nil + return LaunchSuccess, nil } -func (t Task) Monitor(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, dataStore *storage.DataStore, outputPrefix, baseOutputDataSandbox storage.DataReference) (TaskStatus, error) { +func (t Task) Monitor(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient, dataStore *storage.DataStore, outputPrefix, baseOutputDataSandbox storage.DataReference) (MonitorResult, error) { indexStr := strconv.Itoa(t.ChildIdx) podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) phaseInfo, err := CheckPodStatus(ctx, kubeClient, @@ -114,7 +120,7 @@ func (t Task) Monitor(ctx context.Context, tCtx core.TaskExecutionContext, kubeC Namespace: tCtx.TaskExecutionMetadata().GetNamespace(), }) if err != nil { - return Error, errors2.Wrapf(ErrCheckPodStatus, err, "Failed to check pod status.") + return MonitorError, errors2.Wrapf(ErrCheckPodStatus, err, "Failed to check pod status.") } if phaseInfo.Info() != nil { @@ -130,19 +136,17 @@ func (t Task) Monitor(ctx context.Context, tCtx core.TaskExecutionContext, kubeC originalIdx := arrayCore.CalculateOriginalIndex(t.ChildIdx, t.State.GetIndexesToCache()) actualPhase, err = array.CheckTaskOutput(ctx, dataStore, outputPrefix, baseOutputDataSandbox, t.ChildIdx, originalIdx) if err != nil { - return Error, err + return MonitorError, err } } t.NewArrayStatus.Detailed.SetItem(t.ChildIdx, bitarray.Item(actualPhase)) t.NewArrayStatus.Summary.Inc(actualPhase) - return Success, nil + return MonitorSuccess, nil } -func (t Task) Abort() {} - -func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) error { +func (t Task) Abort(ctx 
context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) error { indexStr := strconv.Itoa(t.ChildIdx) podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) pod := &corev1.Pod{ @@ -182,6 +186,21 @@ func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kube } +func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) error { + indexStr := strconv.Itoa(t.ChildIdx) + podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) + + // Deallocate Resource + err := deallocateResource(ctx, tCtx, t.Config, t.ChildIdx) + if err != nil { + logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) + return err + } + + return nil + +} + func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string) (core.AllocationStatus, error) { if !IsResourceConfigSet() { return core.AllocationStatusGranted, nil From ad4ce209a87ad56c6a20d660531561b0d68b530d Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Fri, 24 Apr 2020 12:54:15 -0700 Subject: [PATCH 26/39] abort then finalize --- go/tasks/plugins/array/k8s/launcher.go | 5 +++-- go/tasks/plugins/array/k8s/task.go | 7 ------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/go/tasks/plugins/array/k8s/launcher.go b/go/tasks/plugins/array/k8s/launcher.go index 5af0a6965..25dd53f3a 100644 --- a/go/tasks/plugins/array/k8s/launcher.go +++ b/go/tasks/plugins/array/k8s/launcher.go @@ -70,11 +70,12 @@ func TerminateSubTasks(ctx context.Context, tCtx core.TaskExecutionContext, kube Config: config, State: currentState, } - err := task.Finalize(ctx, tCtx, kubeClient) + + err := task.Abort(ctx, tCtx, kubeClient) if err != nil { errs.Collect(childIdx, err.Error()) } - err = task.Abort(ctx, tCtx, kubeClient) + err = task.Finalize(ctx, tCtx, kubeClient) if err != nil { errs.Collect(childIdx, err.Error()) } diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index c52610869..6af9c0aea 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -175,13 +175,6 @@ func (t Task) Abort(ctx context.Context, tCtx core.TaskExecutionContext, kubeCli return err } - // Deallocate Resource - err = deallocateResource(ctx, tCtx, t.Config, t.ChildIdx) - if err != nil { - logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) - return err - } - return nil } From 050e8b2019f5d2dd9f896fa2bcda5b688870afe1 Mon Sep 17 00:00:00 2001 From: Miguel Toledo Date: Mon, 27 Apr 2020 12:56:42 -0700 Subject: [PATCH 27/39] rm unnecessary code & update state transition --- go/tasks/plugins/array/core/state.go | 15 +++++++-------- go/tasks/plugins/array/k8s/launcher.go | 1 - go/tasks/plugins/array/k8s/task.go | 8 +------- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/go/tasks/plugins/array/core/state.go b/go/tasks/plugins/array/core/state.go index 8cf6d47e4..8518d5631 100644 --- a/go/tasks/plugins/array/core/state.go +++ b/go/tasks/plugins/array/core/state.go @@ -246,23 +246,22 @@ func SummaryToPhase(ctx context.Context, minSuccesses int64, summary arraystatus } } - if totalWaitingForResources > 0 { - logger.Infof(ctx, "Array is still running and waiting for resources totalWaitingForResources[%v]", totalWaitingForResources) - return PhaseWaitingForResources - } - if totalCount < minSuccesses { logger.Infof(ctx, 
"Array failed because totalCount[%v] < minSuccesses[%v]", totalCount, minSuccesses) return PhaseWriteToDiscoveryThenFail } // No chance to reach the required success numbers. - if totalRunning+totalSuccesses < minSuccesses { - logger.Infof(ctx, "Array failed early because totalRunning[%v] + totalSuccesses[%v] < minSuccesses[%v]", - totalRunning, totalSuccesses, minSuccesses) + if totalRunning+totalSuccesses+totalWaitingForResources < minSuccesses { + logger.Infof(ctx, "Array failed early because totalRunning[%v] + totalSuccesses[%v] + totalWaitingForResource[%v] < minSuccesses[%v]", + totalRunning, totalSuccesses, totalWaitingForResources, minSuccesses) return PhaseWriteToDiscoveryThenFail } + if totalWaitingForResources > 0 { + logger.Infof(ctx, "Array is still running and waiting for resources totalWaitingForResources[%v]", totalWaitingForResources) + return PhaseWaitingForResources + } if totalSuccesses >= minSuccesses && totalRunning == 0 { logger.Infof(ctx, "Array succeeded because totalSuccesses[%v] >= minSuccesses[%v]", totalSuccesses, minSuccesses) return PhaseWriteToDiscovery diff --git a/go/tasks/plugins/array/k8s/launcher.go b/go/tasks/plugins/array/k8s/launcher.go index 25dd53f3a..8de45d4d0 100644 --- a/go/tasks/plugins/array/k8s/launcher.go +++ b/go/tasks/plugins/array/k8s/launcher.go @@ -21,7 +21,6 @@ const ( ErrSubmitJob errors2.ErrorCode = "SUBMIT_JOB_FAILED" JobIndexVarName string = "BATCH_JOB_ARRAY_INDEX_VAR_NAME" FlyteK8sArrayIndexVarName string = "FLYTE_K8S_ARRAY_INDEX" - ResourcesPrimaryLabel string = "token" ) var arrayJobEnvVars = []corev1.EnvVar{ diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index 6af9c0aea..6723ce630 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -163,15 +163,9 @@ func (t Task) Abort(ctx context.Context, tCtx core.TaskExecutionContext, kubeCli err := kubeClient.GetClient().Delete(ctx, pod) if err != nil { if k8serrors.IsNotFound(err) { - // Deallocate Resource - err = deallocateResource(ctx, tCtx, t.Config, t.ChildIdx) - if err != nil { - logger.Errorf(ctx, "Error releasing allocation token [%s] in Finalize [%s]", podName, err) - return err - } + return nil } - return err } From f6c4d72b4e79de520c1c4324ff0f07fc97bc161c Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Tue, 15 Sep 2020 01:58:03 -0700 Subject: [PATCH 28/39] Adding unit tests, remote endpoint and fixes --- go.mod | 2 +- go.sum | 4 ++ go/tasks/pluginmachinery/core/kube_client.go | 67 ++++++++++++++++++++ go/tasks/plugins/array/k8s/config.go | 18 +++--- go/tasks/plugins/array/k8s/executor.go | 41 ++++++++++-- go/tasks/plugins/array/k8s/task.go | 9 +-- 6 files changed, 119 insertions(+), 22 deletions(-) diff --git a/go.mod b/go.mod index 610aba01e..847454974 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,7 @@ require ( github.com/googleapis/gnostic v0.4.1 // indirect github.com/hashicorp/golang-lru v0.5.4 github.com/lyft/flyteidl v0.17.9 - github.com/lyft/flytestdlib v0.3.3 + github.com/lyft/flytestdlib v0.3.9 github.com/magiconair/properties v1.8.1 github.com/mitchellh/mapstructure v1.1.2 github.com/pkg/errors v0.9.1 diff --git a/go.sum b/go.sum index ae9c3db9e..8dec0a488 100644 --- a/go.sum +++ b/go.sum @@ -117,12 +117,14 @@ github.com/emicklei/go-restful v0.0.0-20170410110728-ff4f55a20633/go.mod h1:otzb github.com/emicklei/go-restful v2.9.5+incompatible/go.mod h1:otzb+WCGbkyDHkqmQmT5YD2WR4BBwUdeQoFo8l/7tVs= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod 
h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/ernesto-jimenez/gogen v0.0.0-20180125220232-d7d4131e6607 h1:cTavhURetDkezJCvxFggiyLeP40Mrk/TtVg2+ycw1Es= github.com/ernesto-jimenez/gogen v0.0.0-20180125220232-d7d4131e6607/go.mod h1:Cg4fM0vhYWOZdgM7RIOSTRNIc8/VT7CXClC3Ni86lu4= github.com/evanphx/json-patch v4.2.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch v4.5.0+incompatible h1:ouOWdg56aJriqS0huScTkVXPC5IcNrDCXZ6OoTAWu7M= github.com/evanphx/json-patch v4.5.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/fatih/color v1.9.0 h1:8xPHl4/q1VyqGIPif1F+1V3Y3lSmrq01EabUW3CoW5s= github.com/fatih/color v1.9.0/go.mod h1:eQcE1qtQxscV5RaZvpXrrb8Drkc3/DdQ+uUYCNjL+zU= +github.com/fatih/structtag v1.2.0 h1:/OdNE99OxoI/PqaW/SuSK9uxxT3f/tcSZgon/ssNSx4= github.com/fatih/structtag v1.2.0/go.mod h1:mBJUNpUnHmRKrKlQQlmCrh5PuhftFbNv8Ys4/aAZl94= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.8-0.20191012010759-4bf2d1fec783 h1:SmsgwFZy9pdTk/k8BZz40D3P5umP5+Ejt3hAi0paBNQ= @@ -304,6 +306,8 @@ github.com/lyft/flytestdlib v0.3.2 h1:bY6Y+Fg6Jdc7zY4GAYuR7t2hjWwynIdmRvtLcRNaGn github.com/lyft/flytestdlib v0.3.2/go.mod h1:LJPPJlkFj+wwVWMrQT3K5JZgNhZi2mULsCG4ZYhinhU= github.com/lyft/flytestdlib v0.3.3 h1:MkWXPkwQinh6MR3Yf5siZhmRSt9r4YmsF+5kvVVVedE= github.com/lyft/flytestdlib v0.3.3/go.mod h1:LJPPJlkFj+wwVWMrQT3K5JZgNhZi2mULsCG4ZYhinhU= +github.com/lyft/flytestdlib v0.3.9 h1:NaKp9xkeWWwhVvqTOcR/FqlASy1N2gu/kN7PVe4S7YI= +github.com/lyft/flytestdlib v0.3.9/go.mod h1:LJPPJlkFj+wwVWMrQT3K5JZgNhZi2mULsCG4ZYhinhU= github.com/lyft/spark-on-k8s-operator v0.1.3 h1:rmke8lR2Oy8mvKXRhloKuEu7fgGuXepDxiBNiorVUFI= github.com/lyft/spark-on-k8s-operator v0.1.3/go.mod h1:hkRqdqAsdNnxT/Zst6MNMRbTAoiCZ0JRw7svRgAYb0A= github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ= diff --git a/go/tasks/pluginmachinery/core/kube_client.go b/go/tasks/pluginmachinery/core/kube_client.go index 54c886112..91a0047de 100644 --- a/go/tasks/pluginmachinery/core/kube_client.go +++ b/go/tasks/pluginmachinery/core/kube_client.go @@ -1,10 +1,19 @@ package core import ( + "fmt" + + "github.com/pkg/errors" + + "io/ioutil" + + restclient "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" ) +//go:generate pflags Config --default-var=defaultConfig + // TODO we may not want to expose this? 
// A friendly controller-runtime client that gets passed to executors type KubeClient interface { @@ -14,3 +23,61 @@ type KubeClient interface { // GetCache returns a cache.Cache GetCache() cache.Cache } + +type ClusterConfig struct { + Name string `json:"name" pflag:","` + Endpoint string `json:"endpoint" pflag:","` + Auth Auth `json:"auth" pflag:","` + Enabled bool `json:"enabled" pflag:","` +} + +type Auth struct { + Type string `json:"type" pflag:","` + TokenPath string `json:"tokenPath" pflag:","` + CertPath string `json:"certPath" pflag:","` +} + +func (auth Auth) GetCA() ([]byte, error) { + cert, err := ioutil.ReadFile(auth.CertPath) + if err != nil { + return nil, errors.Wrap(err, "failed to read k8s CA cert from configured path") + } + return cert, nil +} + +func (auth Auth) GetToken() (string, error) { + token, err := ioutil.ReadFile(auth.TokenPath) + if err != nil { + return "", errors.Wrap(err, "failed to read k8s bearer token from configured path") + } + return string(token), nil +} + +// Reads secret values from paths specified in the config to initialize a Kubernetes rest client Config. +func RemoteClusterConfig(host string, auth Auth) (*restclient.Config, error) { + tokenString, err := auth.GetToken() + if err != nil { + return nil, errors.New(fmt.Sprintf("Failed to get auth token: %+v", err)) + } + + caCert, err := auth.GetCA() + if err != nil { + return nil, errors.New(fmt.Sprintf("Failed to get auth CA: %+v", err)) + } + + tlsClientConfig := restclient.TLSClientConfig{} + tlsClientConfig.CAData = caCert + return &restclient.Config{ + Host: host, + TLSClientConfig: tlsClientConfig, + BearerToken: tokenString, + }, nil +} + +func GetK8sClient(config ClusterConfig) (client.Client, error) { + kubeConf, err := RemoteClusterConfig(config.Endpoint, config.Auth) + if err != nil { + return nil, err + } + return client.New(kubeConf, client.Options{}) +} diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index c7e5d515c..448b5ba18 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -5,6 +5,7 @@ package k8s import ( + "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/core" v1 "k8s.io/api/core/v1" "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/workqueue" @@ -41,12 +42,13 @@ type ResourceConfig struct { // Defines custom config for K8s Array plugin type Config struct { - DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` - MaxErrorStringLength int `json:"maxErrLength" pflag:",Determines the maximum length of the error string returned for the array."` - MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` - ResourceConfig ResourceConfig `json:"resourceConfig" pflag:",ResourceConfiguration to limit number of resources used by k8s-array."` - NodeSelector map[string]string `json:"node-selector" pflag:"-,Defines a set of node selector labels to add to the pod."` - Tolerations []v1.Toleration `json:"tolerations" pflag:"-,Tolerations to be applied for k8s-array pods"` + DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` + MaxErrorStringLength int `json:"maxErrLength" pflag:",Determines the maximum length of the error string returned for the array."` + MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` + ResourceConfig *ResourceConfig `json:"resourceConfig" pflag:",ResourceConfiguration to limit number of resources used by 
k8s-array."` + RemoteClusterConfig *core.ClusterConfig `json:"remoteClusterConfig" pflag:",Configuration of remote K8s cluster for array jobs"` + NodeSelector map[string]string `json:"node-selector" pflag:"-,Defines a set of node selector labels to add to the pod."` + Tolerations []v1.Toleration `json:"tolerations" pflag:"-,Tolerations to be applied for k8s-array pods"` OutputAssembler workqueue.Config ErrorAssembler workqueue.Config } @@ -54,7 +56,3 @@ type Config struct { func GetConfig() *Config { return configSection.GetConfig().(*Config) } - -func IsResourceConfigSet() bool { - return GetConfig().ResourceConfig != (ResourceConfig{}) -} diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index 0de354164..d2e5376b7 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -3,6 +3,9 @@ package k8s import ( "context" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + idlCore "github.com/lyft/flyteidl/gen/pb-go/flyteidl/core" "github.com/lyft/flyteplugins/go/tasks/plugins/array" @@ -26,6 +29,24 @@ type Executor struct { errorAssembler array.OutputAssembler } +type KubeClientObj struct { + client client.Client +} + +func (k KubeClientObj) GetClient() client.Client { + return k.client +} + +func (k KubeClientObj) GetCache() cache.Cache { + return nil +} + +func NewKubeClientObj(c client.Client) core.KubeClient { + return &KubeClientObj{ + client: c, + } +} + func NewExecutor(kubeClient core.KubeClient, cfg *Config, scope promutils.Scope) (Executor, error) { outputAssembler, err := array.NewOutputAssembler(cfg.OutputAssembler, scope.NewSubScope("output_assembler")) if err != nil { @@ -159,7 +180,18 @@ func init() { } func GetNewExecutorPlugin(ctx context.Context, iCtx core.SetupContext) (core.Plugin, error) { - exec, err := NewExecutor(iCtx.KubeClient(), GetConfig(), iCtx.MetricsScope()) + var kubeClient core.KubeClient + remoteClusterConfig := GetConfig().RemoteClusterConfig + if remoteClusterConfig != nil && remoteClusterConfig.Enabled { + client, err := core.GetK8sClient(*remoteClusterConfig) + if err != nil { + return nil, err + } + kubeClient = NewKubeClientObj(client) + } else { + kubeClient = iCtx.KubeClient() + } + exec, err := NewExecutor(kubeClient, GetConfig(), iCtx.MetricsScope()) if err != nil { return nil, err } @@ -168,9 +200,10 @@ func GetNewExecutorPlugin(ctx context.Context, iCtx core.SetupContext) (core.Plu return nil, err } - if IsResourceConfigSet() { - primaryLabel := GetConfig().ResourceConfig.PrimaryLabel - limit := GetConfig().ResourceConfig.Limit + resourceConfig := GetConfig().ResourceConfig + if resourceConfig != nil { + primaryLabel := resourceConfig.PrimaryLabel + limit := resourceConfig.Limit if err := iCtx.ResourceRegistrar().RegisterResourceQuota(ctx, core.ResourceNamespace(primaryLabel), limit); err != nil { logger.Errorf(ctx, "Token Resource registration for [%v] failed due to error [%v]", primaryLabel, err) return nil, err diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index 6723ce630..dc23c0739 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -189,7 +189,7 @@ func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kube } func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string) (core.AllocationStatus, error) { - if !IsResourceConfigSet() { + if config.ResourceConfig == nil { return 
core.AllocationStatusGranted, nil } @@ -208,7 +208,7 @@ func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, confi } func deallocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, childIdx int) error { - if !IsResourceConfigSet() { + if config.ResourceConfig == nil { return nil } indexStr := strconv.Itoa((childIdx)) @@ -230,11 +230,6 @@ func createResourceConstraintsSpec(ctx context.Context, _ core.TaskExecutionCont NamespaceScopeResourceConstraint: nil, } - if !IsResourceConfigSet() { - logger.Infof(ctx, "No Resource config is found. Returning an empty resource constraints spec") - return constraintsSpec - } - constraintsSpec.ProjectScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} constraintsSpec.NamespaceScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} logger.Infof(ctx, "Created a resource constraints spec: [%v]", constraintsSpec) From b03129221f53578fd5250c69aa470116d603c0e6 Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Tue, 15 Sep 2020 02:01:16 -0700 Subject: [PATCH 29/39] Adding unit tests, remote endpoint and fixes --- go/tasks/plugins/array/k8s/monitor_test.go | 205 +++++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 go/tasks/plugins/array/k8s/monitor_test.go diff --git a/go/tasks/plugins/array/k8s/monitor_test.go b/go/tasks/plugins/array/k8s/monitor_test.go new file mode 100644 index 000000000..0c77cce30 --- /dev/null +++ b/go/tasks/plugins/array/k8s/monitor_test.go @@ -0,0 +1,205 @@ +package k8s + +import ( + "testing" + + core2 "github.com/lyft/flyteidl/gen/pb-go/flyteidl/core" + "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/core" + mocks2 "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/io/mocks" + "github.com/lyft/flyteplugins/go/tasks/plugins/array/arraystatus" + "github.com/lyft/flytestdlib/bitarray" + "github.com/stretchr/testify/mock" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + v12 "k8s.io/apimachinery/pkg/apis/meta/v1" + + arrayCore "github.com/lyft/flyteplugins/go/tasks/plugins/array/core" + + "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/core/mocks" + "github.com/stretchr/testify/assert" + "golang.org/x/net/context" +) + +func createSampleContainerTask() *core2.Container { + return &core2.Container{ + Command: []string{"cmd"}, + Args: []string{"{{$inputPrefix}}"}, + Image: "img1", + } +} + +func getMockTaskExecutionContext(ctx context.Context) *mocks.TaskExecutionContext { + tr := &mocks.TaskReader{} + tr.OnRead(ctx).Return(&core2.TaskTemplate{ + Target: &core2.TaskTemplate_Container{ + Container: createSampleContainerTask(), + }, + }, nil) + + tID := &mocks.TaskExecutionID{} + tID.OnGetGeneratedName().Return("notfound") + tID.OnGetID().Return(core2.TaskExecutionIdentifier{ + TaskId: &core2.Identifier{ + ResourceType: core2.ResourceType_TASK, + Project: "a", + Domain: "d", + Name: "n", + Version: "abc", + }, + NodeExecutionId: &core2.NodeExecutionIdentifier{ + NodeId: "node1", + ExecutionId: &core2.WorkflowExecutionIdentifier{ + Project: "a", + Domain: "d", + Name: "exec", + }, + }, + RetryAttempt: 0, + }) + + overrides := &mocks.TaskOverrides{} + overrides.OnGetResources().Return(&v1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("10"), + }, + }) + + tMeta := &mocks.TaskExecutionMetadata{} + tMeta.OnGetTaskExecutionID().Return(tID) + tMeta.OnGetOverrides().Return(overrides) + tMeta.OnIsInterruptible().Return(false) + tMeta.OnGetK8sServiceAccount().Return("s") + 
+ tMeta.OnGetNamespace().Return("n") + tMeta.OnGetLabels().Return(nil) + tMeta.OnGetAnnotations().Return(nil) + tMeta.OnGetOwnerReference().Return(v12.OwnerReference{}) + + ow := &mocks2.OutputWriter{} + ow.OnGetOutputPrefixPath().Return("/prefix/") + + ir := &mocks2.InputReader{} + ir.OnGetInputPrefixPath().Return("/prefix/") + ir.OnGetInputPath().Return("/prefix/inputs.pb") + + tCtx := &mocks.TaskExecutionContext{} + tCtx.OnTaskReader().Return(tr) + tCtx.OnTaskExecutionMetadata().Return(tMeta) + tCtx.OnOutputWriter().Return(ow) + tCtx.OnInputReader().Return(ir) + return tCtx +} + +func TestCheckSubTasksState(t *testing.T) { + ctx := context.Background() + + tCtx := getMockTaskExecutionContext(ctx) + kubeClient := mocks.KubeClient{} + kubeClient.OnGetClient().Return(mocks.NewFakeKubeClient()) + resourceManager := mocks.ResourceManager{} + resourceManager.OnAllocateResourceMatch(mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(core.AllocationStatusExhausted, nil) + tCtx.OnResourceManager().Return(&resourceManager) + + t.Run("Happy case", func(t *testing.T) { + config := Config{MaxArrayJobSize: 100} + newState, _, err := LaunchAndCheckSubTasksState(ctx, tCtx, &kubeClient, &config, nil, "/prefix/", "/prefix-sand/", &arrayCore.State{ + CurrentPhase: arrayCore.PhaseCheckingSubTaskExecutions, + ExecutionArraySize: 5, + OriginalArraySize: 10, + OriginalMinSuccesses: 5, + }) + + assert.Nil(t, err) + //assert.NotEmpty(t, logLinks) + p, _ := newState.GetPhase() + assert.Equal(t, arrayCore.PhaseCheckingSubTaskExecutions.String(), p.String()) + resourceManager.AssertNumberOfCalls(t, "AllocateResource", 0) + }) + + t.Run("Resource exhausted", func(t *testing.T) { + config := Config{ + MaxArrayJobSize: 100, + ResourceConfig: &ResourceConfig{ + PrimaryLabel: "p", + Limit: 10, + }, + } + + newState, _, err := LaunchAndCheckSubTasksState(ctx, tCtx, &kubeClient, &config, nil, "/prefix/", "/prefix-sand/", &arrayCore.State{ + CurrentPhase: arrayCore.PhaseCheckingSubTaskExecutions, + ExecutionArraySize: 5, + OriginalArraySize: 10, + OriginalMinSuccesses: 5, + }) + + assert.Nil(t, err) + p, _ := newState.GetPhase() + assert.Equal(t, arrayCore.PhaseWaitingForResources.String(), p.String()) + resourceManager.AssertNumberOfCalls(t, "AllocateResource", 5) + }) +} + +func TestCheckSubTasksStateResourceGranted(t *testing.T) { + ctx := context.Background() + + tCtx := getMockTaskExecutionContext(ctx) + kubeClient := mocks.KubeClient{} + kubeClient.OnGetClient().Return(mocks.NewFakeKubeClient()) + resourceManager := mocks.ResourceManager{} + resourceManager.OnAllocateResourceMatch(mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(core.AllocationStatusGranted, nil) + resourceManager.OnReleaseResourceMatch(mock.Anything, mock.Anything, mock.Anything).Return(nil) + tCtx.OnResourceManager().Return(&resourceManager) + + t.Run("Resource granted", func(t *testing.T) { + config := Config{ + MaxArrayJobSize: 100, + ResourceConfig: &ResourceConfig{ + PrimaryLabel: "p", + Limit: 10, + }, + } + + newState, _, err := LaunchAndCheckSubTasksState(ctx, tCtx, &kubeClient, &config, nil, "/prefix/", "/prefix-sand/", &arrayCore.State{ + CurrentPhase: arrayCore.PhaseCheckingSubTaskExecutions, + ExecutionArraySize: 5, + OriginalArraySize: 10, + OriginalMinSuccesses: 5, + }) + + assert.Nil(t, err) + p, _ := newState.GetPhase() + assert.Equal(t, arrayCore.PhaseCheckingSubTaskExecutions.String(), p.String()) + resourceManager.AssertNumberOfCalls(t, "AllocateResource", 5) + }) + + t.Run("All tasks success", 
func(t *testing.T) { + config := Config{ + MaxArrayJobSize: 100, + ResourceConfig: &ResourceConfig{ + PrimaryLabel: "p", + Limit: 10, + }, + } + + arrayStatus := &arraystatus.ArrayStatus{ + Summary: arraystatus.ArraySummary{}, + Detailed: arrayCore.NewPhasesCompactArray(uint(5)), + } + for childIdx := range arrayStatus.Detailed.GetItems() { + arrayStatus.Detailed.SetItem(childIdx, bitarray.Item(core.PhaseSuccess)) + + } + newState, _, err := LaunchAndCheckSubTasksState(ctx, tCtx, &kubeClient, &config, nil, "/prefix/", "/prefix-sand/", &arrayCore.State{ + CurrentPhase: arrayCore.PhaseCheckingSubTaskExecutions, + ExecutionArraySize: 5, + OriginalArraySize: 10, + OriginalMinSuccesses: 5, + ArrayStatus: *arrayStatus, + }) + + assert.Nil(t, err) + p, _ := newState.GetPhase() + assert.Equal(t, arrayCore.PhaseWriteToDiscovery.String(), p.String()) + resourceManager.AssertNumberOfCalls(t, "ReleaseResource", 5) + }) +} From 726c37b83f8043ff31a044a5c63dedc1f87a1859 Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Tue, 15 Sep 2020 23:40:34 -0700 Subject: [PATCH 30/39] Fix pflags --- go/tasks/pluginmachinery/core/kube_client.go | 67 --------- go/tasks/plugins/array/k8s/config.go | 84 +++++++++-- go/tasks/plugins/array/k8s/config_flags.go | 6 + .../plugins/array/k8s/config_flags_test.go | 132 ++++++++++++++++++ go/tasks/plugins/array/k8s/executor.go | 6 +- go/tasks/plugins/array/k8s/monitor_test.go | 6 +- go/tasks/plugins/array/k8s/task.go | 4 +- 7 files changed, 222 insertions(+), 83 deletions(-) diff --git a/go/tasks/pluginmachinery/core/kube_client.go b/go/tasks/pluginmachinery/core/kube_client.go index 91a0047de..54c886112 100644 --- a/go/tasks/pluginmachinery/core/kube_client.go +++ b/go/tasks/pluginmachinery/core/kube_client.go @@ -1,19 +1,10 @@ package core import ( - "fmt" - - "github.com/pkg/errors" - - "io/ioutil" - - restclient "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client" ) -//go:generate pflags Config --default-var=defaultConfig - // TODO we may not want to expose this? // A friendly controller-runtime client that gets passed to executors type KubeClient interface { @@ -23,61 +14,3 @@ type KubeClient interface { // GetCache returns a cache.Cache GetCache() cache.Cache } - -type ClusterConfig struct { - Name string `json:"name" pflag:","` - Endpoint string `json:"endpoint" pflag:","` - Auth Auth `json:"auth" pflag:","` - Enabled bool `json:"enabled" pflag:","` -} - -type Auth struct { - Type string `json:"type" pflag:","` - TokenPath string `json:"tokenPath" pflag:","` - CertPath string `json:"certPath" pflag:","` -} - -func (auth Auth) GetCA() ([]byte, error) { - cert, err := ioutil.ReadFile(auth.CertPath) - if err != nil { - return nil, errors.Wrap(err, "failed to read k8s CA cert from configured path") - } - return cert, nil -} - -func (auth Auth) GetToken() (string, error) { - token, err := ioutil.ReadFile(auth.TokenPath) - if err != nil { - return "", errors.Wrap(err, "failed to read k8s bearer token from configured path") - } - return string(token), nil -} - -// Reads secret values from paths specified in the config to initialize a Kubernetes rest client Config. 
-func RemoteClusterConfig(host string, auth Auth) (*restclient.Config, error) { - tokenString, err := auth.GetToken() - if err != nil { - return nil, errors.New(fmt.Sprintf("Failed to get auth token: %+v", err)) - } - - caCert, err := auth.GetCA() - if err != nil { - return nil, errors.New(fmt.Sprintf("Failed to get auth CA: %+v", err)) - } - - tlsClientConfig := restclient.TLSClientConfig{} - tlsClientConfig.CAData = caCert - return &restclient.Config{ - Host: host, - TLSClientConfig: tlsClientConfig, - BearerToken: tokenString, - }, nil -} - -func GetK8sClient(config ClusterConfig) (client.Client, error) { - kubeConf, err := RemoteClusterConfig(config.Endpoint, config.Auth) - if err != nil { - return nil, err - } - return client.New(kubeConf, client.Options{}) -} diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index 448b5ba18..fa9e83437 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -5,8 +5,13 @@ package k8s import ( - "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/core" + "fmt" + "io/ioutil" + + "github.com/pkg/errors" v1 "k8s.io/api/core/v1" + restclient "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/workqueue" "github.com/lyft/flytestdlib/config" @@ -40,15 +45,73 @@ type ResourceConfig struct { Limit int `json:"limit" pflag:",Resource quota (in the number of outstanding requests) for the cluster"` } +type ClusterConfig struct { + Name string `json:"name" pflag:",Friendly name of the remote cluster"` + Endpoint string `json:"endpoint" pflag:", Remote K8s cluster endpoint"` + Auth Auth `json:"auth" pflag:"-, Auth setting for the cluster"` + Enabled bool `json:"enabled" pflag:", Boolean flag to enable or disable"` +} + +type Auth struct { + Type string `json:"type" pflag:", Authentication type"` + TokenPath string `json:"tokenPath" pflag:", Token path"` + CertPath string `json:"certPath" pflag:", Certificate path"` +} + +func (auth Auth) GetCA() ([]byte, error) { + cert, err := ioutil.ReadFile(auth.CertPath) + if err != nil { + return nil, errors.Wrap(err, "failed to read k8s CA cert from configured path") + } + return cert, nil +} + +func (auth Auth) GetToken() (string, error) { + token, err := ioutil.ReadFile(auth.TokenPath) + if err != nil { + return "", errors.Wrap(err, "failed to read k8s bearer token from configured path") + } + return string(token), nil +} + +// Reads secret values from paths specified in the config to initialize a Kubernetes rest client Config. 
+func RemoteClusterConfig(host string, auth Auth) (*restclient.Config, error) { + tokenString, err := auth.GetToken() + if err != nil { + return nil, errors.New(fmt.Sprintf("Failed to get auth token: %+v", err)) + } + + caCert, err := auth.GetCA() + if err != nil { + return nil, errors.New(fmt.Sprintf("Failed to get auth CA: %+v", err)) + } + + tlsClientConfig := restclient.TLSClientConfig{} + tlsClientConfig.CAData = caCert + return &restclient.Config{ + Host: host, + TLSClientConfig: tlsClientConfig, + BearerToken: tokenString, + }, nil +} + +func GetK8sClient(config ClusterConfig) (client.Client, error) { + kubeConf, err := RemoteClusterConfig(config.Endpoint, config.Auth) + if err != nil { + return nil, err + } + return client.New(kubeConf, client.Options{}) +} + // Defines custom config for K8s Array plugin type Config struct { - DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` - MaxErrorStringLength int `json:"maxErrLength" pflag:",Determines the maximum length of the error string returned for the array."` - MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` - ResourceConfig *ResourceConfig `json:"resourceConfig" pflag:",ResourceConfiguration to limit number of resources used by k8s-array."` - RemoteClusterConfig *core.ClusterConfig `json:"remoteClusterConfig" pflag:",Configuration of remote K8s cluster for array jobs"` - NodeSelector map[string]string `json:"node-selector" pflag:"-,Defines a set of node selector labels to add to the pod."` - Tolerations []v1.Toleration `json:"tolerations" pflag:"-,Tolerations to be applied for k8s-array pods"` + DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` + MaxErrorStringLength int `json:"maxErrLength" pflag:",Determines the maximum length of the error string returned for the array."` + MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` + ResourceConfig ResourceConfig `json:"resourceConfig" pflag:"-,ResourceConfiguration to limit number of resources used by k8s-array."` + RemoteClusterConfig ClusterConfig `json:"remoteClusterConfig" pflag:"-,Configuration of remote K8s cluster for array jobs"` + NodeSelector map[string]string `json:"node-selector" pflag:"-,Defines a set of node selector labels to add to the pod."` + Tolerations []v1.Toleration `json:"tolerations" pflag:"-,Tolerations to be applied for k8s-array pods"` OutputAssembler workqueue.Config ErrorAssembler workqueue.Config } @@ -56,3 +119,8 @@ type Config struct { func GetConfig() *Config { return configSection.GetConfig().(*Config) } + +func IsResourceConfigSet(resourceConfig ResourceConfig) bool { + emptyResouceConfig := ResourceConfig{} + return resourceConfig != emptyResouceConfig +} diff --git a/go/tasks/plugins/array/k8s/config_flags.go b/go/tasks/plugins/array/k8s/config_flags.go index 7bb1404de..f8b40cc45 100755 --- a/go/tasks/plugins/array/k8s/config_flags.go +++ b/go/tasks/plugins/array/k8s/config_flags.go @@ -46,6 +46,12 @@ func (cfg Config) GetPFlagSet(prefix string) *pflag.FlagSet { cmdFlags.Int64(fmt.Sprintf("%v%v", prefix, "maxArrayJobSize"), defaultConfig.MaxArrayJobSize, "Maximum size of array job.") cmdFlags.String(fmt.Sprintf("%v%v", prefix, "resourceConfig.primaryLabel"), defaultConfig.ResourceConfig.PrimaryLabel, "PrimaryLabel of a given service cluster") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "resourceConfig.limit"), defaultConfig.ResourceConfig.Limit, "Resource quota (in the number 
of outstanding requests) for the cluster") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "remoteClusterConfig.name"), defaultConfig.RemoteClusterConfig.Name, "") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "remoteClusterConfig.endpoint"), defaultConfig.RemoteClusterConfig.Endpoint, "") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "remoteClusterConfig.auth.type"), defaultConfig.RemoteClusterConfig.Auth.Type, "") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "remoteClusterConfig.auth.tokenPath"), defaultConfig.RemoteClusterConfig.Auth.TokenPath, "") + cmdFlags.String(fmt.Sprintf("%v%v", prefix, "remoteClusterConfig.auth.certPath"), defaultConfig.RemoteClusterConfig.Auth.CertPath, "") + cmdFlags.Bool(fmt.Sprintf("%v%v", prefix, "remoteClusterConfig.enabled"), defaultConfig.RemoteClusterConfig.Enabled, "") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "OutputAssembler.workers"), defaultConfig.OutputAssembler.Workers, "Number of concurrent workers to start processing the queue.") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "OutputAssembler.maxRetries"), defaultConfig.OutputAssembler.MaxRetries, "Maximum number of retries per item.") cmdFlags.Int(fmt.Sprintf("%v%v", prefix, "OutputAssembler.maxItems"), defaultConfig.OutputAssembler.IndexCacheMaxItems, "Maximum number of entries to keep in the index.") diff --git a/go/tasks/plugins/array/k8s/config_flags_test.go b/go/tasks/plugins/array/k8s/config_flags_test.go index d0ffbdb49..95c7350b0 100755 --- a/go/tasks/plugins/array/k8s/config_flags_test.go +++ b/go/tasks/plugins/array/k8s/config_flags_test.go @@ -209,6 +209,138 @@ func TestConfig_SetFlags(t *testing.T) { } }) }) + t.Run("Test_remoteClusterConfig.name", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("remoteClusterConfig.name"); err == nil { + assert.Equal(t, string(defaultConfig.RemoteClusterConfig.Name), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("remoteClusterConfig.name", testValue) + if vString, err := cmdFlags.GetString("remoteClusterConfig.name"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.RemoteClusterConfig.Name) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_remoteClusterConfig.endpoint", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("remoteClusterConfig.endpoint"); err == nil { + assert.Equal(t, string(defaultConfig.RemoteClusterConfig.Endpoint), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("remoteClusterConfig.endpoint", testValue) + if vString, err := cmdFlags.GetString("remoteClusterConfig.endpoint"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.RemoteClusterConfig.Endpoint) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_remoteClusterConfig.auth.type", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("remoteClusterConfig.auth.type"); err == nil { + assert.Equal(t, string(defaultConfig.RemoteClusterConfig.Auth.Type), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + 
cmdFlags.Set("remoteClusterConfig.auth.type", testValue) + if vString, err := cmdFlags.GetString("remoteClusterConfig.auth.type"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.RemoteClusterConfig.Auth.Type) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_remoteClusterConfig.auth.tokenPath", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("remoteClusterConfig.auth.tokenPath"); err == nil { + assert.Equal(t, string(defaultConfig.RemoteClusterConfig.Auth.TokenPath), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("remoteClusterConfig.auth.tokenPath", testValue) + if vString, err := cmdFlags.GetString("remoteClusterConfig.auth.tokenPath"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.RemoteClusterConfig.Auth.TokenPath) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_remoteClusterConfig.auth.certPath", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vString, err := cmdFlags.GetString("remoteClusterConfig.auth.certPath"); err == nil { + assert.Equal(t, string(defaultConfig.RemoteClusterConfig.Auth.CertPath), vString) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("remoteClusterConfig.auth.certPath", testValue) + if vString, err := cmdFlags.GetString("remoteClusterConfig.auth.certPath"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vString), &actual.RemoteClusterConfig.Auth.CertPath) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) + t.Run("Test_remoteClusterConfig.enabled", func(t *testing.T) { + t.Run("DefaultValue", func(t *testing.T) { + // Test that default value is set properly + if vBool, err := cmdFlags.GetBool("remoteClusterConfig.enabled"); err == nil { + assert.Equal(t, bool(defaultConfig.RemoteClusterConfig.Enabled), vBool) + } else { + assert.FailNow(t, err.Error()) + } + }) + + t.Run("Override", func(t *testing.T) { + testValue := "1" + + cmdFlags.Set("remoteClusterConfig.enabled", testValue) + if vBool, err := cmdFlags.GetBool("remoteClusterConfig.enabled"); err == nil { + testDecodeJson_Config(t, fmt.Sprintf("%v", vBool), &actual.RemoteClusterConfig.Enabled) + + } else { + assert.FailNow(t, err.Error()) + } + }) + }) t.Run("Test_OutputAssembler.workers", func(t *testing.T) { t.Run("DefaultValue", func(t *testing.T) { // Test that default value is set properly diff --git a/go/tasks/plugins/array/k8s/executor.go b/go/tasks/plugins/array/k8s/executor.go index d2e5376b7..eea808221 100644 --- a/go/tasks/plugins/array/k8s/executor.go +++ b/go/tasks/plugins/array/k8s/executor.go @@ -182,8 +182,8 @@ func init() { func GetNewExecutorPlugin(ctx context.Context, iCtx core.SetupContext) (core.Plugin, error) { var kubeClient core.KubeClient remoteClusterConfig := GetConfig().RemoteClusterConfig - if remoteClusterConfig != nil && remoteClusterConfig.Enabled { - client, err := core.GetK8sClient(*remoteClusterConfig) + if remoteClusterConfig.Enabled { + client, err := GetK8sClient(remoteClusterConfig) if err != nil { return nil, err } @@ -201,7 +201,7 @@ func GetNewExecutorPlugin(ctx context.Context, iCtx core.SetupContext) (core.Plu } resourceConfig := GetConfig().ResourceConfig - if resourceConfig != nil { + if 
IsResourceConfigSet(resourceConfig) { primaryLabel := resourceConfig.PrimaryLabel limit := resourceConfig.Limit if err := iCtx.ResourceRegistrar().RegisterResourceQuota(ctx, core.ResourceNamespace(primaryLabel), limit); err != nil { diff --git a/go/tasks/plugins/array/k8s/monitor_test.go b/go/tasks/plugins/array/k8s/monitor_test.go index 0c77cce30..2bbc32d46 100644 --- a/go/tasks/plugins/array/k8s/monitor_test.go +++ b/go/tasks/plugins/array/k8s/monitor_test.go @@ -119,7 +119,7 @@ func TestCheckSubTasksState(t *testing.T) { t.Run("Resource exhausted", func(t *testing.T) { config := Config{ MaxArrayJobSize: 100, - ResourceConfig: &ResourceConfig{ + ResourceConfig: ResourceConfig{ PrimaryLabel: "p", Limit: 10, }, @@ -153,7 +153,7 @@ func TestCheckSubTasksStateResourceGranted(t *testing.T) { t.Run("Resource granted", func(t *testing.T) { config := Config{ MaxArrayJobSize: 100, - ResourceConfig: &ResourceConfig{ + ResourceConfig: ResourceConfig{ PrimaryLabel: "p", Limit: 10, }, @@ -175,7 +175,7 @@ func TestCheckSubTasksStateResourceGranted(t *testing.T) { t.Run("All tasks success", func(t *testing.T) { config := Config{ MaxArrayJobSize: 100, - ResourceConfig: &ResourceConfig{ + ResourceConfig: ResourceConfig{ PrimaryLabel: "p", Limit: 10, }, diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index dc23c0739..dbd031937 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -189,7 +189,7 @@ func (t Task) Finalize(ctx context.Context, tCtx core.TaskExecutionContext, kube } func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, podName string) (core.AllocationStatus, error) { - if config.ResourceConfig == nil { + if !IsResourceConfigSet(config.ResourceConfig) { return core.AllocationStatusGranted, nil } @@ -208,7 +208,7 @@ func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, confi } func deallocateResource(ctx context.Context, tCtx core.TaskExecutionContext, config *Config, childIdx int) error { - if config.ResourceConfig == nil { + if !IsResourceConfigSet(config.ResourceConfig) { return nil } indexStr := strconv.Itoa((childIdx)) From 089f9ba64c455f85427cc1c19f5b8850c5583dcb Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Thu, 17 Sep 2020 15:40:56 -0700 Subject: [PATCH 31/39] MErge and fix --- go/tasks/plugins/array/k8s/monitor_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/go/tasks/plugins/array/k8s/monitor_test.go b/go/tasks/plugins/array/k8s/monitor_test.go index 2bbc32d46..9ebba1a40 100644 --- a/go/tasks/plugins/array/k8s/monitor_test.go +++ b/go/tasks/plugins/array/k8s/monitor_test.go @@ -77,10 +77,12 @@ func getMockTaskExecutionContext(ctx context.Context) *mocks.TaskExecutionContex ow := &mocks2.OutputWriter{} ow.OnGetOutputPrefixPath().Return("/prefix/") + ow.OnGetRawOutputPrefix().Return("/raw_prefix/") ir := &mocks2.InputReader{} ir.OnGetInputPrefixPath().Return("/prefix/") ir.OnGetInputPath().Return("/prefix/inputs.pb") + ir.OnGetMatch(mock.Anything).Return(&core2.LiteralMap{}, nil) tCtx := &mocks.TaskExecutionContext{} tCtx.OnTaskReader().Return(tr) From 0b5580a3f9e839c3448bc149f21f93bf0d341b0d Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Mon, 28 Sep 2020 17:34:50 -0700 Subject: [PATCH 32/39] update propeller --- go/tasks/aws/config.go | 3 ++- go/tasks/plugins/array/k8s/config.go | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/go/tasks/aws/config.go b/go/tasks/aws/config.go index 5b69c8c76..4010db6e2 100644 
--- a/go/tasks/aws/config.go +++ b/go/tasks/aws/config.go @@ -7,8 +7,9 @@ package aws import ( "time" - pluginsConfig "github.com/lyft/flyteplugins/go/tasks/config" "github.com/lyft/flytestdlib/config" + + pluginsConfig "github.com/lyft/flyteplugins/go/tasks/config" ) //go:generate pflags Config --default-var defaultConfig diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index fa9e83437..518c04e73 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -13,8 +13,8 @@ import ( restclient "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" + pluginsConfig "github.com/lyft/flyteplugins/go/tasks/config" "github.com/lyft/flyteplugins/go/tasks/pluginmachinery/workqueue" - "github.com/lyft/flytestdlib/config" ) //go:generate pflags Config --default-var=defaultConfig @@ -37,7 +37,7 @@ var ( }, } - configSection = config.MustRegisterSection(configSectionKey, defaultConfig) + configSection = pluginsConfig.MustRegisterSubSection(configSectionKey, defaultConfig) ) type ResourceConfig struct { From e54c0819eb0c6b02ea35265904507c35659147b6 Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Tue, 29 Sep 2020 15:13:19 -0700 Subject: [PATCH 33/39] Fix config --- go/tasks/plugins/array/k8s/config.go | 2 +- go/tasks/plugins/array/k8s/monitor.go | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index 518c04e73..d8f77b0c8 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -106,7 +106,7 @@ func GetK8sClient(config ClusterConfig) (client.Client, error) { // Defines custom config for K8s Array plugin type Config struct { DefaultScheduler string `json:"scheduler" pflag:",Decides the scheduler to use when launching array-pods."` - MaxErrorStringLength int `json:"maxErrLength" pflag:",Determines the maximum length of the error string returned for the array."` + MaxErrorStringLength int `json:"maxErrorLength" pflag:",Determines the maximum length of the error string returned for the array."` MaxArrayJobSize int64 `json:"maxArrayJobSize" pflag:",Maximum size of array job."` ResourceConfig ResourceConfig `json:"resourceConfig" pflag:"-,ResourceConfiguration to limit number of resources used by k8s-array."` RemoteClusterConfig ClusterConfig `json:"remoteClusterConfig" pflag:"-,Configuration of remote K8s cluster for array jobs"` diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 38ace640c..66e4dd0f3 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -114,7 +114,6 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon if monitorResult != MonitorSuccess { return currentState, logLinks, err } - } newState = newState.SetArrayStatus(*newArrayStatus) From 50ccc6b7bd455bbb80eb12bb0f03d98b20cb1378 Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Thu, 1 Oct 2020 17:10:13 -0700 Subject: [PATCH 34/39] Fix resource manager and debug --- go/tasks/plugins/array/k8s/monitor.go | 2 ++ go/tasks/plugins/array/k8s/task.go | 18 ++++-------------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 66e4dd0f3..a10cac2cd 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -60,6 +60,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx 
core.TaskExecutionCon for childIdx, existingPhaseIdx := range currentState.GetArrayStatus().Detailed.GetItems() { existingPhase := core.Phases[existingPhaseIdx] + logger.Debugf(ctx, "Current phase and index %v - %v", childIdx, existingPhase) indexStr := strconv.Itoa(childIdx) podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) if existingPhase.IsTerminal() { @@ -68,6 +69,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon // Since we know we have already "processed" this terminal phase we can safely deallocate resource err = deallocateResource(ctx, tCtx, config, childIdx) + logger.Debugf(ctx, "K8s array - deallocate resource") if err != nil { logger.Errorf(ctx, "Error releasing allocation token [%s] in LaunchAndCheckSubTasks [%s]", podName, err) return currentState, logLinks, errors2.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index dbd031937..9c0e8e42b 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -194,7 +194,10 @@ func allocateResource(ctx context.Context, tCtx core.TaskExecutionContext, confi } resourceNamespace := core.ResourceNamespace(config.ResourceConfig.PrimaryLabel) - resourceConstraintSpec := createResourceConstraintsSpec(ctx, tCtx) + resourceConstraintSpec := core.ResourceConstraintsSpec{ + ProjectScopeResourceConstraint: nil, + NamespaceScopeResourceConstraint: nil, + } allocationStatus, err := tCtx.ResourceManager().AllocateResource(ctx, resourceNamespace, podName, resourceConstraintSpec) if err != nil { @@ -223,16 +226,3 @@ func deallocateResource(ctx context.Context, tCtx core.TaskExecutionContext, con return nil } - -func createResourceConstraintsSpec(ctx context.Context, _ core.TaskExecutionContext) core.ResourceConstraintsSpec { - constraintsSpec := core.ResourceConstraintsSpec{ - ProjectScopeResourceConstraint: nil, - NamespaceScopeResourceConstraint: nil, - } - - constraintsSpec.ProjectScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} - constraintsSpec.NamespaceScopeResourceConstraint = &core.ResourceConstraint{Value: int64(1)} - logger.Infof(ctx, "Created a resource constraints spec: [%v]", constraintsSpec) - - return constraintsSpec -} From 68912a6d6b5014667a15f40ddc5de9b2689da0b2 Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Mon, 5 Oct 2020 15:37:05 -0700 Subject: [PATCH 35/39] Fix owner refs --- go/tasks/plugins/array/k8s/monitor.go | 4 ++++ go/tasks/plugins/array/k8s/task.go | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index a10cac2cd..8b0a32489 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -95,6 +95,7 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon var launchResult LaunchResult launchResult, err = task.Launch(ctx, tCtx, kubeClient) if err != nil { + logger.Errorf(ctx, "K8s array - Launch error %v", err) return currentState, logLinks, err } @@ -114,6 +115,9 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon var monitorResult MonitorResult monitorResult, err = task.Monitor(ctx, tCtx, kubeClient, dataStore, outputPrefix, baseOutputDataSandbox) if monitorResult != MonitorSuccess { + if err != nil { + logger.Errorf(ctx, "K8s array - Monitor error %v", err) + } return currentState, logLinks, 
err } } diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index 9c0e8e42b..f699ea7d5 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -48,6 +48,10 @@ const ( func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (LaunchResult, error) { podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx) + // Do not owner references for remote cluster execution + if t.Config.RemoteClusterConfig.Enabled { + podTemplate.OwnerReferences = nil + } if err != nil { return LaunchError, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for a task") } From 0d1961c0c9551460affa0058d23ca17725ea305d Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Tue, 6 Oct 2020 13:44:42 -0700 Subject: [PATCH 36/39] Fix owner refs --- go/tasks/plugins/array/k8s/monitor.go | 2 ++ go/tasks/plugins/array/k8s/task.go | 1 + 2 files changed, 3 insertions(+) diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index 8b0a32489..f4e80c640 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -114,6 +114,8 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon var monitorResult MonitorResult monitorResult, err = task.Monitor(ctx, tCtx, kubeClient, dataStore, outputPrefix, baseOutputDataSandbox) + logger.Debugf(ctx, "Log links %v", logLinks) + if monitorResult != MonitorSuccess { if err != nil { logger.Errorf(ctx, "K8s array - Monitor error %v", err) diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index f699ea7d5..893600280 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -127,6 +127,7 @@ func (t Task) Monitor(ctx context.Context, tCtx core.TaskExecutionContext, kubeC return MonitorError, errors2.Wrapf(ErrCheckPodStatus, err, "Failed to check pod status.") } + logger.Debugf(ctx, "Monitor - phase info %v", phaseInfo) if phaseInfo.Info() != nil { t.LogLinks = append(t.LogLinks, phaseInfo.Info().Logs...) } From cf9c388c3c9bf4a7c73dc27f6f304d2eaaea969c Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Wed, 7 Oct 2020 10:12:07 -0700 Subject: [PATCH 37/39] Fix and revert --- go/tasks/plugins/array/k8s/config.go | 1 + go/tasks/plugins/array/k8s/monitor.go | 3 --- go/tasks/plugins/array/k8s/task.go | 10 ++++------ 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/go/tasks/plugins/array/k8s/config.go b/go/tasks/plugins/array/k8s/config.go index d8f77b0c8..e5d1ea04a 100644 --- a/go/tasks/plugins/array/k8s/config.go +++ b/go/tasks/plugins/array/k8s/config.go @@ -74,6 +74,7 @@ func (auth Auth) GetToken() (string, error) { return string(token), nil } +// TODO: Move logic to flytestdlib // Reads secret values from paths specified in the config to initialize a Kubernetes rest client Config. 
func RemoteClusterConfig(host string, auth Auth) (*restclient.Config, error) { tokenString, err := auth.GetToken() diff --git a/go/tasks/plugins/array/k8s/monitor.go b/go/tasks/plugins/array/k8s/monitor.go index f4e80c640..be948b1f6 100644 --- a/go/tasks/plugins/array/k8s/monitor.go +++ b/go/tasks/plugins/array/k8s/monitor.go @@ -60,7 +60,6 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon for childIdx, existingPhaseIdx := range currentState.GetArrayStatus().Detailed.GetItems() { existingPhase := core.Phases[existingPhaseIdx] - logger.Debugf(ctx, "Current phase and index %v - %v", childIdx, existingPhase) indexStr := strconv.Itoa(childIdx) podName := formatSubTaskName(ctx, tCtx.TaskExecutionMetadata().GetTaskExecutionID().GetGeneratedName(), indexStr) if existingPhase.IsTerminal() { @@ -69,7 +68,6 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon // Since we know we have already "processed" this terminal phase we can safely deallocate resource err = deallocateResource(ctx, tCtx, config, childIdx) - logger.Debugf(ctx, "K8s array - deallocate resource") if err != nil { logger.Errorf(ctx, "Error releasing allocation token [%s] in LaunchAndCheckSubTasks [%s]", podName, err) return currentState, logLinks, errors2.Wrapf(ErrCheckPodStatus, err, "Error releasing allocation token.") @@ -114,7 +112,6 @@ func LaunchAndCheckSubTasksState(ctx context.Context, tCtx core.TaskExecutionCon var monitorResult MonitorResult monitorResult, err = task.Monitor(ctx, tCtx, kubeClient, dataStore, outputPrefix, baseOutputDataSandbox) - logger.Debugf(ctx, "Log links %v", logLinks) if monitorResult != MonitorSuccess { if err != nil { diff --git a/go/tasks/plugins/array/k8s/task.go b/go/tasks/plugins/array/k8s/task.go index 893600280..a236b4871 100644 --- a/go/tasks/plugins/array/k8s/task.go +++ b/go/tasks/plugins/array/k8s/task.go @@ -48,14 +48,13 @@ const ( func (t Task) Launch(ctx context.Context, tCtx core.TaskExecutionContext, kubeClient core.KubeClient) (LaunchResult, error) { podTemplate, _, err := FlyteArrayJobToK8sPodTemplate(ctx, tCtx) - // Do not owner references for remote cluster execution - if t.Config.RemoteClusterConfig.Enabled { - podTemplate.OwnerReferences = nil - } if err != nil { return LaunchError, errors2.Wrapf(ErrBuildPodTemplate, err, "Failed to convert task template to a pod template for a task") } - + // Remove owner references for remote cluster execution + if t.Config.RemoteClusterConfig.Enabled { + podTemplate.OwnerReferences = nil + } var args []string if len(podTemplate.Spec.Containers) > 0 { args = append(podTemplate.Spec.Containers[0].Command, podTemplate.Spec.Containers[0].Args...) @@ -127,7 +126,6 @@ func (t Task) Monitor(ctx context.Context, tCtx core.TaskExecutionContext, kubeC return MonitorError, errors2.Wrapf(ErrCheckPodStatus, err, "Failed to check pod status.") } - logger.Debugf(ctx, "Monitor - phase info %v", phaseInfo) if phaseInfo.Info() != nil { t.LogLinks = append(t.LogLinks, phaseInfo.Info().Logs...) 
} From 24c18e001c330418a54c368b61dcf290a19304e1 Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Wed, 7 Oct 2020 17:13:10 -0700 Subject: [PATCH 38/39] comments --- go/tasks/plugins/array/core/state.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/tasks/plugins/array/core/state.go b/go/tasks/plugins/array/core/state.go index 06818166f..15849390b 100644 --- a/go/tasks/plugins/array/core/state.go +++ b/go/tasks/plugins/array/core/state.go @@ -253,7 +253,7 @@ func SummaryToPhase(ctx context.Context, minSuccesses int64, summary arraystatus // No chance to reach the required success numbers. if totalRunning+totalSuccesses+totalWaitingForResources < minSuccesses { - logger.Infof(ctx, "Array failed early because totalRunning[%v] + totalSuccesses[%v] + totalWaitingForResource[%v] < minSuccesses[%v]", + logger.Infof(ctx, "Array failed early because total failures > minSuccesses[%v]. Snapshot totalRunning[%v] + totalSuccesses[%v] + totalWaitingForResource[%v]", totalRunning, totalSuccesses, totalWaitingForResources, minSuccesses) return PhaseWriteToDiscoveryThenFail } From fac441382a995ce1699ce954b0b56351e6efa880 Mon Sep 17 00:00:00 2001 From: Anand Swaminathan Date: Mon, 12 Oct 2020 23:16:32 -0700 Subject: [PATCH 39/39] go mod --- go.mod | 4 ---- 1 file changed, 4 deletions(-) diff --git a/go.mod b/go.mod index 5aa663fe7..08e30c84d 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,6 @@ require ( github.com/go-test/deep v1.0.5 github.com/gogo/protobuf v1.3.1 github.com/golang/protobuf v1.3.5 - github.com/googleapis/gnostic v0.4.1 // indirect github.com/hashicorp/golang-lru v0.5.4 github.com/kubeflow/pytorch-operator v0.6.0 github.com/kubeflow/tf-operator v0.5.3 @@ -27,19 +26,16 @@ require ( github.com/spf13/cobra v0.0.6 github.com/spf13/pflag v1.0.5 github.com/stretchr/testify v1.6.1 - go.opencensus.io v0.22.3 // indirect golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb google.golang.org/grpc v1.28.0 gopkg.in/yaml.v2 v2.2.8 - gopkg.in/yaml.v3 v3.0.0-20200605160147-a5ece683394c // indirect gotest.tools v2.2.0+incompatible k8s.io/api v0.17.3 k8s.io/apimachinery v0.17.3 k8s.io/client-go v11.0.1-0.20190918222721-c0e3722d5cf0+incompatible k8s.io/klog v1.0.0 sigs.k8s.io/controller-runtime v0.5.1 - sigs.k8s.io/yaml v1.2.0 // indirect ) // Pin the version of client-go to something that's compatible with katrogan's fork of api and apimachinery