Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Parametrize Scheduling Graceful Duration #1641

Merged
merged 1 commit into from
Apr 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
- (Maintenance) Update Go to 1.22.2
- (Feature) Object Checksum
- (Bugfix) Use Rendered Spec in case of scheduling compare
- (Feature) Parametrize Scheduling Graceful Duration

## [1.2.39](https://github.com/arangodb/kube-arangodb/tree/1.2.39) (2024-03-11)
- (Feature) Extract Scheduler API
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ Flags:
--timeout.backup-upload duration The request timeout to the ArangoDB during uploading files (default 5m0s)
--timeout.force-delete-pod-grace-period duration Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals (default 15m0s)
--timeout.k8s duration The request timeout to the kubernetes (default 2s)
--timeout.pod-scheduling-grace-period duration Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable (default 15s)
--timeout.reconciliation duration The reconciliation timeout to the ArangoDB CR (default 1m0s)
--timeout.shard-rebuild duration Timeout after which particular out-synced shard is considered as failed and rebuild is triggered (default 1h0m0s)
--timeout.shard-rebuild-retry duration Timeout after which rebuild shards retry flow is triggered (default 4h0m0s)
Expand Down
3 changes: 3 additions & 0 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ var (
backupArangoD time.Duration
backupUploadArangoD time.Duration
forcePodDeletionGracePeriod time.Duration
podSchedulingGracePeriod time.Duration
}
operatorImageDiscovery struct {
timeout time.Duration
Expand Down Expand Up @@ -226,6 +227,7 @@ func init() {
f.DurationVar(&operatorTimeouts.backupArangoD, "timeout.backup-arangod", globals.BackupDefaultArangoClientTimeout, "The request timeout to the ArangoDB during backup calls")
f.DurationVar(&operatorTimeouts.backupUploadArangoD, "timeout.backup-upload", globals.BackupUploadArangoClientTimeout, "The request timeout to the ArangoDB during uploading files")
f.DurationVar(&operatorTimeouts.forcePodDeletionGracePeriod, "timeout.force-delete-pod-grace-period", globals.DefaultForcePodDeletionGracePeriodTimeout, "Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals")
f.DurationVar(&operatorTimeouts.podSchedulingGracePeriod, "timeout.pod-scheduling-grace-period", globals.DefaultPodSchedulingGracePeriod, "Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable")
f.DurationVar(&shutdownOptions.delay, "shutdown.delay", defaultShutdownDelay, "The delay before running shutdown handlers")
f.DurationVar(&shutdownOptions.timeout, "shutdown.timeout", defaultShutdownTimeout, "Timeout for shutdown handlers")
f.DurationVar(&operatorReconciliationRetry.delay, "operator.reconciliation.retry.delay", globals.DefaultOperatorUpdateRetryDelay, "Delay between Object Update operations in the Reconciliation loop")
Expand Down Expand Up @@ -294,6 +296,7 @@ func executeMain(cmd *cobra.Command, args []string) {
globals.GetGlobalTimeouts().BackupArangoClientTimeout().Set(operatorTimeouts.backupArangoD)
globals.GetGlobalTimeouts().BackupArangoClientUploadTimeout().Set(operatorTimeouts.backupUploadArangoD)
globals.GetGlobalTimeouts().ForcePodDeletionGracePeriodTimeout().Set(operatorTimeouts.forcePodDeletionGracePeriod)
globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Set(operatorTimeouts.podSchedulingGracePeriod)

globals.GetGlobals().Retry().OperatorUpdateRetryDelay().Set(operatorReconciliationRetry.delay)
globals.GetGlobals().Retry().OperatorUpdateRetryCount().Set(operatorReconciliationRetry.count)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ package reconcile
import (
"context"
"reflect"
"time"

core "k8s.io/api/core/v1"

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
"github.com/arangodb/kube-arangodb/pkg/util"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)

Expand All @@ -38,6 +40,12 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
_ k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {

var p api.Plan

if globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() == 0 {
// Scheduling grace period is not enabled
return nil
}

if !status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) {
return p
}
Expand All @@ -55,6 +63,19 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
continue
}

if c, ok := m.Member.Conditions.Get(api.ConditionTypeScheduled); !ok {
// Action can't proceed if pod is not scheduled
continue
} else if c.LastTransitionTime.IsZero() {
// LastTransitionTime is not set
continue
} else {
if time.Since(c.LastTransitionTime.Time) <= globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() {
// In grace period
continue
}
}

imageInfo, imageFound := context.SelectImageForMember(spec, status, m.Member)
if !imageFound {
l.Warn("could not find image for already created member")
Expand Down
5 changes: 5 additions & 0 deletions pkg/deployment/resources/pod_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,11 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
}
} else {
if memberStatus.Conditions.Update(api.ConditionTypeScheduled, false, "Pod is not scheduled", "") {
updateMemberStatusNeeded = true
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
}

if k8sutil.IsPodNotScheduledFor(pod, podScheduleTimeout) {
// Pod cannot be scheduled for too long
log.Str("pod-name", pod.GetName()).Debug("Pod scheduling timeout")
Expand Down
9 changes: 8 additions & 1 deletion pkg/util/globals/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ const (
DefaultArangoDCheckTimeout = time.Second * 2
DefaultReconciliationTimeout = time.Minute
DefaultForcePodDeletionGracePeriodTimeout = 15 * time.Minute
DefaultPodSchedulingGracePeriod = 15 * time.Second
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMO it could be a bit longer (60s)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before it was 0 seconds; now it will be 15 seconds (which is enough for AS to take an action, since its default is set to 10s)


BackupDefaultArangoClientTimeout = 30 * time.Second
BackupUploadArangoClientTimeout = 300 * time.Second
Expand Down Expand Up @@ -61,6 +62,7 @@ var globalObj = &globals{
backupArangoClientTimeout: NewTimeout(BackupDefaultArangoClientTimeout),
backupArangoClientUploadTimeout: NewTimeout(BackupUploadArangoClientTimeout),
forcePodDeletionGracePeriodTimeout: NewTimeout(DefaultForcePodDeletionGracePeriodTimeout),
podSchedulingGracePeriod: NewTimeout(DefaultPodSchedulingGracePeriod),
},
kubernetes: &globalKubernetes{
requestBatchSize: NewInt64(DefaultKubernetesRequestBatchSize),
Expand Down Expand Up @@ -147,6 +149,7 @@ type GlobalTimeouts interface {
Agency() Timeout

ForcePodDeletionGracePeriodTimeout() Timeout
PodSchedulingGracePeriod() Timeout

BackupArangoClientTimeout() Timeout
BackupArangoClientUploadTimeout() Timeout
Expand All @@ -156,13 +159,17 @@ type globalTimeouts struct {
requests, arangod, reconciliation, arangodCheck, agency, shardRebuild, shardRebuildRetry Timeout
backupArangoClientTimeout Timeout
backupArangoClientUploadTimeout Timeout
forcePodDeletionGracePeriodTimeout Timeout
forcePodDeletionGracePeriodTimeout, podSchedulingGracePeriod Timeout
}

// ForcePodDeletionGracePeriodTimeout returns the Timeout held in the
// receiver's forcePodDeletionGracePeriodTimeout field.
func (g *globalTimeouts) ForcePodDeletionGracePeriodTimeout() Timeout {
	timeout := g.forcePodDeletionGracePeriodTimeout
	return timeout
}

// PodSchedulingGracePeriod returns the Timeout held in the receiver's
// podSchedulingGracePeriod field.
func (g *globalTimeouts) PodSchedulingGracePeriod() Timeout {
	timeout := g.podSchedulingGracePeriod
	return timeout
}

// Agency returns the Timeout held in the receiver's agency field.
func (g *globalTimeouts) Agency() Timeout {
	timeout := g.agency
	return timeout
}
Expand Down