From 85fa8ec66861355ad3021d895b312a5d691afcba Mon Sep 17 00:00:00 2001 From: ajanikow <12255597+ajanikow@users.noreply.github.com> Date: Mon, 8 Apr 2024 09:19:57 +0000 Subject: [PATCH] [Feature] Parametrize Scheduling Graceful Duration --- CHANGELOG.md | 1 + README.md | 3 ++- cmd/cmd.go | 3 +++ pkg/deployment/deployment.go | 8 +++++-- ...n_builder_member_pod_scheduling_failure.go | 21 +++++++++++++++++++ pkg/deployment/resources/pod_inspector.go | 5 +++++ pkg/util/globals/global.go | 9 +++++++- 7 files changed, 46 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 046c24af3..d2ad2f9fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ - (Maintenance) Update Go to 1.22.2 - (Feature) Object Checksum - (Bugfix) Use Rendered Spec in case of scheduling compare +- (Feature) Parametrize Scheduling Graceful Duration ## [1.2.39](https://github.com/arangodb/kube-arangodb/tree/1.2.39) (2024-03-11) - (Feature) Extract Scheduler API diff --git a/README.md b/README.md index 3cd38eb7a..c9fc72f78 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ Flags: --kubernetes.max-batch-size int Size of batch during objects read (default 256) --kubernetes.qps float32 Number of queries per second for k8s API (default 15) --log.format string Set log format. Allowed values: 'pretty', 'JSON'. If empty, default format is used (default "pretty") - --log.level stringArray Set log levels in format <level> or <logger>=<level>.
Possible loggers: action, agency, api-server, assertion, backup-operator, chaos-monkey, crd, deployment, deployment-ci, deployment-reconcile, deployment-replication, deployment-resilience, deployment-resources, deployment-storage, deployment-storage-pc, deployment-storage-service, http, inspector, integrations, k8s-client, ml-batchjob-operator, ml-cronjob-operator, ml-extension-operator, ml-extension-shutdown, ml-storage-operator, monitor, operator, operator-arangojob-handler, operator-v2, operator-v2-event, operator-v2-worker, panics, pod_compare, root, root-event-recorder, scheduler, server, server-authentication (default [info]) + --log.level stringArray Set log levels in format <level> or <logger>=<level>. Possible loggers: action, agency, api-server, assertion, backup-operator, chaos-monkey, crd, deployment, deployment-ci, deployment-reconcile, deployment-replication, deployment-resilience, deployment-resources, deployment-storage, deployment-storage-pc, deployment-storage-service, http, inspector, integrations, k8s-client, monitor, operator, operator-arangojob-handler, operator-v2, operator-v2-event, operator-v2-worker, panics, pod_compare, root, root-event-recorder, server, server-authentication (default [info]) --log.sampling If true, operator will try to minimize duplication of logging events (default true) --memory-limit uint Define memory limit for hard shutdown and the dump of goroutines.
Used for testing --metrics.excluded-prefixes stringArray List of the excluded metrics prefixes @@ -198,6 +198,7 @@ Flags: --timeout.backup-upload duration The request timeout to the ArangoDB during uploading files (default 5m0s) --timeout.force-delete-pod-grace-period duration Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals (default 15m0s) --timeout.k8s duration The request timeout to the kubernetes (default 2s) + --timeout.pod-scheduling-grace-period duration Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable (default 15s) --timeout.reconciliation duration The reconciliation timeout to the ArangoDB CR (default 1m0s) --timeout.shard-rebuild duration Timeout after which particular out-synced shard is considered as failed and rebuild is triggered (default 1h0m0s) --timeout.shard-rebuild-retry duration Timeout after which rebuild shards retry flow is triggered (default 4h0m0s) diff --git a/cmd/cmd.go b/cmd/cmd.go index 2261192b1..f77d3d35f 100644 --- a/cmd/cmd.go +++ b/cmd/cmd.go @@ -157,6 +157,7 @@ var ( backupArangoD time.Duration backupUploadArangoD time.Duration forcePodDeletionGracePeriod time.Duration + podSchedulingGracePeriod time.Duration } operatorImageDiscovery struct { timeout time.Duration @@ -226,6 +227,7 @@ func init() { f.DurationVar(&operatorTimeouts.backupArangoD, "timeout.backup-arangod", globals.BackupDefaultArangoClientTimeout, "The request timeout to the ArangoDB during backup calls") f.DurationVar(&operatorTimeouts.backupUploadArangoD, "timeout.backup-upload", globals.BackupUploadArangoClientTimeout, "The request timeout to the ArangoDB during uploading files") f.DurationVar(&operatorTimeouts.forcePodDeletionGracePeriod, "timeout.force-delete-pod-grace-period", globals.DefaultForcePodDeletionGracePeriodTimeout, "Default period when ArangoDB Pod should be forcefully removed after all containers were 
stopped - set to 0 to disable forceful removals") + f.DurationVar(&operatorTimeouts.podSchedulingGracePeriod, "timeout.pod-scheduling-grace-period", globals.DefaultPodSchedulingGracePeriod, "Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable") f.DurationVar(&shutdownOptions.delay, "shutdown.delay", defaultShutdownDelay, "The delay before running shutdown handlers") f.DurationVar(&shutdownOptions.timeout, "shutdown.timeout", defaultShutdownTimeout, "Timeout for shutdown handlers") f.DurationVar(&operatorReconciliationRetry.delay, "operator.reconciliation.retry.delay", globals.DefaultOperatorUpdateRetryDelay, "Delay between Object Update operations in the Reconciliation loop") @@ -294,6 +296,7 @@ func executeMain(cmd *cobra.Command, args []string) { globals.GetGlobalTimeouts().BackupArangoClientTimeout().Set(operatorTimeouts.backupArangoD) globals.GetGlobalTimeouts().BackupArangoClientUploadTimeout().Set(operatorTimeouts.backupUploadArangoD) globals.GetGlobalTimeouts().ForcePodDeletionGracePeriodTimeout().Set(operatorTimeouts.forcePodDeletionGracePeriod) + globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Set(operatorTimeouts.podSchedulingGracePeriod) globals.GetGlobals().Retry().OperatorUpdateRetryDelay().Set(operatorReconciliationRetry.delay) globals.GetGlobals().Retry().OperatorUpdateRetryCount().Set(operatorReconciliationRetry.count) diff --git a/pkg/deployment/deployment.go b/pkg/deployment/deployment.go index 6edbdd13b..50700765c 100644 --- a/pkg/deployment/deployment.go +++ b/pkg/deployment/deployment.go @@ -498,8 +498,12 @@ func (d *Deployment) acceptNewSpec(ctx context.Context, depl *api.ArangoDeployme } func (d *Deployment) patchAcceptedSpec(ctx context.Context, spec *api.DeploymentSpec, checksum string) error { - return d.ApplyPatch(ctx, patch.ItemReplace(patch.NewPath("status", "accepted-spec"), spec), - patch.ItemReplace(patch.NewPath("status", "acceptedSpecVersion"), checksum)) + s := 
d.GetStatus() + + s.AcceptedSpecVersion = util.NewType(checksum) + s.AcceptedSpec = spec.DeepCopy() + + return d.updateCRStatus(ctx, s) } // handleArangoDeploymentUpdatedEvent is called when the deployment is updated by the user. diff --git a/pkg/deployment/reconcile/plan_builder_member_pod_scheduling_failure.go b/pkg/deployment/reconcile/plan_builder_member_pod_scheduling_failure.go index 1062fb2c3..86a802d57 100644 --- a/pkg/deployment/reconcile/plan_builder_member_pod_scheduling_failure.go +++ b/pkg/deployment/reconcile/plan_builder_member_pod_scheduling_failure.go @@ -23,12 +23,14 @@ package reconcile import ( "context" "reflect" + "time" core "k8s.io/api/core/v1" api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1" "github.com/arangodb/kube-arangodb/pkg/deployment/actions" "github.com/arangodb/kube-arangodb/pkg/util" + "github.com/arangodb/kube-arangodb/pkg/util/globals" "github.com/arangodb/kube-arangodb/pkg/util/k8sutil" ) @@ -38,6 +40,12 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context, _ k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan { var p api.Plan + + if globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() == 0 { + // Scheduling grace period is not enabled + return nil + } + if !status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) { return p } @@ -55,6 +63,19 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context, continue } + if c, ok := m.Member.Conditions.Get(api.ConditionTypeScheduled); !ok { + // Action cant proceed if pod is not scheduled + continue + } else if c.LastTransitionTime.IsZero() { + // LastTransitionTime is not set + continue + } else { + if time.Since(c.LastTransitionTime.Time) <= globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() { + // In grace period + continue + } + } + imageInfo, imageFound := context.SelectImageForMember(spec, status, m.Member) if !imageFound { 
l.Warn("could not find image for already created member") diff --git a/pkg/deployment/resources/pod_inspector.go b/pkg/deployment/resources/pod_inspector.go index a2b54d79a..3f50c35dc 100644 --- a/pkg/deployment/resources/pod_inspector.go +++ b/pkg/deployment/resources/pod_inspector.go @@ -393,6 +393,11 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval) } } else { + if memberStatus.Conditions.Update(api.ConditionTypeScheduled, false, "Pod is not scheduled", "") { + updateMemberStatusNeeded = true + nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval) + } + if k8sutil.IsPodNotScheduledFor(pod, podScheduleTimeout) { // Pod cannot be scheduled for to long log.Str("pod-name", pod.GetName()).Debug("Pod scheduling timeout") diff --git a/pkg/util/globals/global.go b/pkg/util/globals/global.go index 9224ae58e..eb5d2c173 100644 --- a/pkg/util/globals/global.go +++ b/pkg/util/globals/global.go @@ -29,6 +29,7 @@ const ( DefaultArangoDCheckTimeout = time.Second * 2 DefaultReconciliationTimeout = time.Minute DefaultForcePodDeletionGracePeriodTimeout = 15 * time.Minute + DefaultPodSchedulingGracePeriod = 15 * time.Second BackupDefaultArangoClientTimeout = 30 * time.Second BackupUploadArangoClientTimeout = 300 * time.Second @@ -61,6 +62,7 @@ var globalObj = &globals{ backupArangoClientTimeout: NewTimeout(BackupDefaultArangoClientTimeout), backupArangoClientUploadTimeout: NewTimeout(BackupUploadArangoClientTimeout), forcePodDeletionGracePeriodTimeout: NewTimeout(DefaultForcePodDeletionGracePeriodTimeout), + podSchedulingGracePeriod: NewTimeout(DefaultPodSchedulingGracePeriod), }, kubernetes: &globalKubernetes{ requestBatchSize: NewInt64(DefaultKubernetesRequestBatchSize), @@ -147,6 +149,7 @@ type GlobalTimeouts interface { Agency() Timeout ForcePodDeletionGracePeriodTimeout() Timeout + PodSchedulingGracePeriod() Timeout BackupArangoClientTimeout() Timeout 
BackupArangoClientUploadTimeout() Timeout @@ -156,13 +159,17 @@ type globalTimeouts struct { requests, arangod, reconciliation, arangodCheck, agency, shardRebuild, shardRebuildRetry Timeout backupArangoClientTimeout Timeout backupArangoClientUploadTimeout Timeout - forcePodDeletionGracePeriodTimeout Timeout + forcePodDeletionGracePeriodTimeout, podSchedulingGracePeriod Timeout } func (g *globalTimeouts) ForcePodDeletionGracePeriodTimeout() Timeout { return g.forcePodDeletionGracePeriodTimeout } +func (g *globalTimeouts) PodSchedulingGracePeriod() Timeout { + return g.podSchedulingGracePeriod +} + func (g *globalTimeouts) Agency() Timeout { return g.agency }