From 5cc2206f208188c86706e19ba0f158d3570eae88 Mon Sep 17 00:00:00 2001 From: ajanikow <12255597+ajanikow@users.noreply.github.com> Date: Mon, 8 Apr 2024 09:19:57 +0000 Subject: [PATCH] [Feature] Parametrize Scheduling Graceful Duration --- CHANGELOG.md | 1 + README.md | 1 + cmd/cmd.go | 3 +++ ...n_builder_member_pod_scheduling_failure.go | 21 +++++++++++++++++++ pkg/deployment/resources/pod_inspector.go | 5 +++++ pkg/util/globals/global.go | 9 +++++++- 6 files changed, 39 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 046c24af3..d2ad2f9fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ - (Maintenance) Update Go to 1.22.2 - (Feature) Object Checksum - (Bugfix) Use Rendered Spec in case of scheduling compare +- (Feature) Parametrize Scheduling Graceful Duration ## [1.2.39](https://github.com/arangodb/kube-arangodb/tree/1.2.39) (2024-03-11) - (Feature) Extract Scheduler API diff --git a/README.md b/README.md index 3cd38eb7a..a563eaa1b 100644 --- a/README.md +++ b/README.md @@ -198,6 +198,7 @@ Flags: --timeout.backup-upload duration The request timeout to the ArangoDB during uploading files (default 5m0s) --timeout.force-delete-pod-grace-period duration Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals (default 15m0s) --timeout.k8s duration The request timeout to the kubernetes (default 2s) + --timeout.pod-scheduling-grace-period duration Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable (default 15s) --timeout.reconciliation duration The reconciliation timeout to the ArangoDB CR (default 1m0s) --timeout.shard-rebuild duration Timeout after which particular out-synced shard is considered as failed and rebuild is triggered (default 1h0m0s) --timeout.shard-rebuild-retry duration Timeout after which rebuild shards retry flow is triggered (default 4h0m0s) diff --git a/cmd/cmd.go 
b/cmd/cmd.go index 2261192b1..f77d3d35f 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -157,6 +157,7 @@ var ( backupArangoD time.Duration backupUploadArangoD time.Duration forcePodDeletionGracePeriod time.Duration + podSchedulingGracePeriod time.Duration } operatorImageDiscovery struct { timeout time.Duration @@ -226,6 +227,7 @@ func init() { f.DurationVar(&operatorTimeouts.backupArangoD, "timeout.backup-arangod", globals.BackupDefaultArangoClientTimeout, "The request timeout to the ArangoDB during backup calls") f.DurationVar(&operatorTimeouts.backupUploadArangoD, "timeout.backup-upload", globals.BackupUploadArangoClientTimeout, "The request timeout to the ArangoDB during uploading files") f.DurationVar(&operatorTimeouts.forcePodDeletionGracePeriod, "timeout.force-delete-pod-grace-period", globals.DefaultForcePodDeletionGracePeriodTimeout, "Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals") + f.DurationVar(&operatorTimeouts.podSchedulingGracePeriod, "timeout.pod-scheduling-grace-period", globals.DefaultPodSchedulingGracePeriod, "Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable") f.DurationVar(&shutdownOptions.delay, "shutdown.delay", defaultShutdownDelay, "The delay before running shutdown handlers") f.DurationVar(&shutdownOptions.timeout, "shutdown.timeout", defaultShutdownTimeout, "Timeout for shutdown handlers") f.DurationVar(&operatorReconciliationRetry.delay, "operator.reconciliation.retry.delay", globals.DefaultOperatorUpdateRetryDelay, "Delay between Object Update operations in the Reconciliation loop") @@ -294,6 +296,7 @@ func executeMain(cmd *cobra.Command, args []string) { globals.GetGlobalTimeouts().BackupArangoClientTimeout().Set(operatorTimeouts.backupArangoD) globals.GetGlobalTimeouts().BackupArangoClientUploadTimeout().Set(operatorTimeouts.backupUploadArangoD) 
globals.GetGlobalTimeouts().ForcePodDeletionGracePeriodTimeout().Set(operatorTimeouts.forcePodDeletionGracePeriod) + globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Set(operatorTimeouts.podSchedulingGracePeriod) globals.GetGlobals().Retry().OperatorUpdateRetryDelay().Set(operatorReconciliationRetry.delay) globals.GetGlobals().Retry().OperatorUpdateRetryCount().Set(operatorReconciliationRetry.count) diff --git a/pkg/deployment/reconcile/plan_builder_member_pod_scheduling_failure.go b/pkg/deployment/reconcile/plan_builder_member_pod_scheduling_failure.go index 1062fb2c3..86a802d57 100644 --- a/pkg/deployment/reconcile/plan_builder_member_pod_scheduling_failure.go +++ b/pkg/deployment/reconcile/plan_builder_member_pod_scheduling_failure.go @@ -23,12 +23,14 @@ package reconcile import ( "context" "reflect" + "time" core "k8s.io/api/core/v1" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" "github.com/arangodb/kube-arangodb/pkg/deployment/actions" "github.com/arangodb/kube-arangodb/pkg/util" + "github.com/arangodb/kube-arangodb/pkg/util/globals" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) @@ -38,6 +40,12 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context, _ k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { var p api.Plan + + if globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() == 0 { + // Scheduling grace period is not enabled + return nil + } + if !status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) { return p } @@ -55,6 +63,19 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context, continue } + if c, ok := m.Member.Conditions.Get(api.ConditionTypeScheduled); !ok { + // Action cannot proceed if pod is not scheduled + continue + } else if c.LastTransitionTime.IsZero() { + // LastTransitionTime is not set + continue + } else { + if time.Since(c.LastTransitionTime.Time) <=
globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() { + // In grace period + continue + } + } + imageInfo, imageFound := context.SelectImageForMember(spec, status, m.Member) if !imageFound { l.Warn("could not find image for already created member") diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index a2b54d79a..3f50c35dc 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -393,6 +393,11 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval) } } else { + if memberStatus.Conditions.Update(api.ConditionTypeScheduled, false, "Pod is not scheduled", "") { + updateMemberStatusNeeded = true + nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval) + } + if k8sutil.IsPodNotScheduledFor(pod, podScheduleTimeout) { // Pod cannot be scheduled for to long log.Str("pod-name", pod.GetName()).Debug("Pod scheduling timeout") diff --git a/pkg/util/globals/global.go b/pkg/util/globals/global.go index 9224ae58e..eb5d2c173 100644 --- a/pkg/util/globals/global.go +++ b/pkg/util/globals/global.go @@ -29,6 +29,7 @@ const ( DefaultArangoDCheckTimeout = time.Second * 2 DefaultReconciliationTimeout = time.Minute DefaultForcePodDeletionGracePeriodTimeout = 15 * time.Minute + DefaultPodSchedulingGracePeriod = 15 * time.Second BackupDefaultArangoClientTimeout = 30 * time.Second BackupUploadArangoClientTimeout = 300 * time.Second @@ -61,6 +62,7 @@ var globalObj = &globals{ backupArangoClientTimeout: NewTimeout(BackupDefaultArangoClientTimeout), backupArangoClientUploadTimeout: NewTimeout(BackupUploadArangoClientTimeout), forcePodDeletionGracePeriodTimeout: NewTimeout(DefaultForcePodDeletionGracePeriodTimeout), + podSchedulingGracePeriod: NewTimeout(DefaultPodSchedulingGracePeriod), }, kubernetes: &globalKubernetes{ requestBatchSize: 
NewInt64(DefaultKubernetesRequestBatchSize), @@ -147,6 +149,7 @@ type GlobalTimeouts interface { Agency() Timeout ForcePodDeletionGracePeriodTimeout() Timeout + PodSchedulingGracePeriod() Timeout BackupArangoClientTimeout() Timeout BackupArangoClientUploadTimeout() Timeout @@ -156,13 +159,17 @@ type globalTimeouts struct { requests, arangod, reconciliation, arangodCheck, agency, shardRebuild, shardRebuildRetry Timeout backupArangoClientTimeout Timeout backupArangoClientUploadTimeout Timeout - forcePodDeletionGracePeriodTimeout Timeout + forcePodDeletionGracePeriodTimeout, podSchedulingGracePeriod Timeout } func (g *globalTimeouts) ForcePodDeletionGracePeriodTimeout() Timeout { return g.forcePodDeletionGracePeriodTimeout } +func (g *globalTimeouts) PodSchedulingGracePeriod() Timeout { + return g.podSchedulingGracePeriod +} + func (g *globalTimeouts) Agency() Timeout { return g.agency }