Skip to content

Commit

Permalink
fix: blue-green rollouts could pause prematurely during prePromotionA…
Browse files Browse the repository at this point in the history
…nalysis

Signed-off-by: Jesse Suen <[email protected]>
  • Loading branch information
jessesuen committed Feb 25, 2021
1 parent 7a7fbc0 commit 4ac024e
Show file tree
Hide file tree
Showing 12 changed files with 126 additions and 93 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ test-kustomize:

.PHONY: start-e2e
start-e2e:
go run ./cmd/rollouts-controller/main.go --instance-id ${E2E_INSTANCE_ID}
go run ./cmd/rollouts-controller/main.go --instance-id ${E2E_INSTANCE_ID} --loglevel debug

.PHONY: test-e2e
test-e2e:
Expand Down
8 changes: 4 additions & 4 deletions pkg/apis/rollouts/v1alpha1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

31 changes: 13 additions & 18 deletions pkg/apis/rollouts/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,30 +96,21 @@ type BlueGreenStrategy struct {
// Name of the service that the rollout modifies as the preview service.
// +optional
PreviewService string `json:"previewService,omitempty"`
// PreviewReplica the number of replicas to run under the preview service before the switchover. Once the rollout is
// resumed the new replicaset will be full scaled up before the switch occurs
// PreviewReplicaCount is the number of replicas to run for the preview stack before the
// switchover. Once the rollout is resumed the desired replicaset will be full scaled up before the switch occurs
// +optional
PreviewReplicaCount *int32 `json:"previewReplicaCount,omitempty"`
// AutoPromotionEnabled indicates if the rollout should automatically promote the new ReplicaSet
// to the active service or enter a paused state. If not specified, the default value is true.
// +optional
AutoPromotionEnabled *bool `json:"autoPromotionEnabled,omitempty"`
// AutoPromotionSeconds automatically promotes the current ReplicaSet to active after the
// specified pause delay in seconds after the ReplicaSet becomes ready.
// If omitted, the Rollout enters and remains in a paused state until manually resumed by
// removing the pause condition.
// AutoPromotionSeconds is a duration in seconds in which to delay auto-promotion (default: 0).
// The countdown begins after the preview ReplicaSet have reached full availability.
// This option is ignored if autoPromotionEnabled is set to false.
// +optional
AutoPromotionSeconds *int32 `json:"autoPromotionSeconds,omitempty"`
// MaxUnavailable The maximum number of pods that can be unavailable during the update.
// Value can be an absolute number (ex: 5) or a percentage of total pods at the start of update (ex: 10%).
// Absolute number is calculated from percentage by rounding down.
// This can not be 0 if MaxSurge is 0.
// By default, a fixed value of 1 is used.
// Example: when this is set to 30%, the old RC can be scaled down by 30%
// immediately when the rolling update starts. Once new pods are ready, old RC
// can be scaled down further, followed by scaling up the new RC, ensuring
// that at least 70% of original number of pods are available at all times
// during the update.
AutoPromotionSeconds int32 `json:"autoPromotionSeconds,omitempty"`
// MaxUnavailable The maximum number of pods that can be unavailable during a restart operation.
// Defaults to 25% of total replicas.
// +optional
MaxUnavailable *intstr.IntOrString `json:"maxUnavailable,omitempty"`
// ScaleDownDelaySeconds adds a delay before scaling down the previous replicaset.
Expand Down Expand Up @@ -530,7 +521,11 @@ type RolloutStatus struct {
Abort bool `json:"abort,omitempty"`
// PauseConditions indicates why the rollout is currently paused
PauseConditions []PauseCondition `json:"pauseConditions,omitempty"`
//ControllerPause indicates the controller has paused the rollout
// ControllerPause indicates the controller has paused the rollout. It is set to true when
// the controller adds a pause condition. This field helps to discern the scenario where a
// rollout was resumed after being paused by the controller (e.g. via the plugin). In that
// situation, the pauseConditions would have been cleared , but controllerPause would still be
// set to true.
ControllerPause bool `json:"controllerPause,omitempty"`
// AbortedAt indicates the controller reconciled an aborted rollout. The controller uses this to understand if
// the controller needs to do some specific work when a Rollout is aborted. For example, the reconcileAbort is used
Expand Down
5 changes: 0 additions & 5 deletions pkg/apis/rollouts/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pkg/apis/rollouts/validation/validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ func TestValidateRolloutStrategyBlueGreen(t *testing.T) {
PreviewService: "service-name",
ActiveService: "service-name",
ScaleDownDelayRevisionLimit: &scaleDownDelayRevisionLimit,
AutoPromotionSeconds: &autoPromotionSeconds,
AutoPromotionSeconds: autoPromotionSeconds,
},
},
},
Expand Down
2 changes: 1 addition & 1 deletion rollout/analysis_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1830,7 +1830,7 @@ func TestRolloutPrePromotionAnalysisHonorAutoPromotionSeconds(t *testing.T) {
r1 := newBlueGreenRollout("foo", 1, nil, "active", "")
r1.Spec.Strategy.BlueGreen.AutoPromotionEnabled = pointer.BoolPtr(true)
r2 := bumpVersion(r1)
r2.Spec.Strategy.BlueGreen.AutoPromotionSeconds = pointer.Int32Ptr(10)
r2.Spec.Strategy.BlueGreen.AutoPromotionSeconds = 10
r2.Spec.Strategy.BlueGreen.PrePromotionAnalysis = &v1alpha1.RolloutAnalysis{
Templates: []v1alpha1.RolloutAnalysisTemplate{{
TemplateName: at.Name,
Expand Down
75 changes: 51 additions & 24 deletions rollout/bluegreen.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func (c *rolloutContext) rolloutBlueGreen() error {

c.reconcileBlueGreenPause(activeSvc, previewSvc)

err = c.reconcileActiveService(previewSvc, activeSvc)
err = c.reconcileActiveService(activeSvc)
if err != nil {
return err
}
Expand Down Expand Up @@ -121,20 +121,15 @@ func (c *rolloutContext) reconcileBlueGreenTemplateChange() bool {
return c.rollout.Status.CurrentPodHash != c.newRS.Labels[v1alpha1.DefaultRolloutUniqueLabelKey]
}

func (c *rolloutContext) skipPause(activeSvc *corev1.Service) bool {
// isBlueGreenFastTracked returns true if we should skip the pause step because update has been fast tracked
func (c *rolloutContext) isBlueGreenFastTracked(activeSvc *corev1.Service) bool {
if replicasetutil.HasScaleDownDeadline(c.newRS) {
c.log.Infof("Detected scale down annotation for ReplicaSet '%s' and will skip pause", c.newRS.Name)
return true
}
if c.rollout.Status.PromoteFull {
return true
}

// If a rollout has a PrePromotionAnalysis, the controller only skips the pause after the analysis passes
if defaults.GetAutoPromotionEnabledOrDefault(c.rollout) && c.completedPrePromotionAnalysis() {
return true
}

if _, ok := activeSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey]; !ok {
return true
}
Expand All @@ -145,7 +140,6 @@ func (c *rolloutContext) skipPause(activeSvc *corev1.Service) bool {
}

func (c *rolloutContext) reconcileBlueGreenPause(activeSvc, previewSvc *corev1.Service) {
c.log.Info("Reconciling pause")
if c.rollout.Status.Abort {
return
}
Expand All @@ -155,31 +149,64 @@ func (c *rolloutContext) reconcileBlueGreenPause(activeSvc, previewSvc *corev1.S
return
}
if c.rollout.Spec.Paused {
c.log.Info("rollout has been paused by user")
return
}

if c.skipPause(activeSvc) {
if c.isBlueGreenFastTracked(activeSvc) {
c.log.Debug("skipping pause: fast-tracked update")
c.pauseContext.RemovePauseCondition(v1alpha1.PauseReasonBlueGreenPause)
return
}

newRSPodHash := c.newRS.Labels[v1alpha1.DefaultRolloutUniqueLabelKey]
cond := getPauseCondition(c.rollout, v1alpha1.PauseReasonBlueGreenPause)
// If the rollout is not paused and the active service is not point at the newRS, we should pause the rollout.
if cond == nil && !c.rollout.Status.ControllerPause && !c.rollout.Status.BlueGreen.ScaleUpPreviewCheckPoint && activeSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] != newRSPodHash {
if activeSvc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] == newRSPodHash {
c.log.Debug("skipping pause: desired ReplicaSet already active")
c.pauseContext.RemovePauseCondition(v1alpha1.PauseReasonBlueGreenPause)
return
}
if c.rollout.Status.BlueGreen.ScaleUpPreviewCheckPoint {
c.log.Debug("skipping pause: scaleUpPreviewCheckPoint passed")
c.pauseContext.RemovePauseCondition(v1alpha1.PauseReasonBlueGreenPause)
return
}
if !needsBlueGreenControllerPause(c.rollout) {
c.pauseContext.RemovePauseCondition(v1alpha1.PauseReasonBlueGreenPause)
return
}

// if we get here, the controller should manage the pause/resume
c.log.Infof("reconciling pause (autoPromotionSeconds: %d)", c.rollout.Spec.Strategy.BlueGreen.AutoPromotionSeconds)
if !c.completedPrePromotionAnalysis() {
c.log.Infof("not ready for pause: prePromotionAnalysis incomplete")
return
}
pauseCond := getPauseCondition(c.rollout, v1alpha1.PauseReasonBlueGreenPause)
if pauseCond == nil && !c.rollout.Status.ControllerPause {
if pauseCond == nil {
c.log.Info("pausing")
}
c.pauseContext.AddPauseCondition(v1alpha1.PauseReasonBlueGreenPause)
return
}

autoPromoteActiveServiceDelaySeconds := c.rollout.Spec.Strategy.BlueGreen.AutoPromotionSeconds
if autoPromoteActiveServiceDelaySeconds != nil && cond != nil {
c.checkEnqueueRolloutDuringWait(cond.StartTime, *autoPromoteActiveServiceDelaySeconds)
switchDeadline := cond.StartTime.Add(time.Duration(*autoPromoteActiveServiceDelaySeconds) * time.Second)
now := metav1.Now()
if now.After(switchDeadline) {
c.pauseContext.RemovePauseCondition(v1alpha1.PauseReasonBlueGreenPause)
if !c.pauseContext.CompletedBlueGreenPause() {
c.log.Info("pause incomplete")
if c.rollout.Spec.Strategy.BlueGreen.AutoPromotionSeconds > 0 {
c.checkEnqueueRolloutDuringWait(pauseCond.StartTime, c.rollout.Spec.Strategy.BlueGreen.AutoPromotionSeconds)
}
} else {
c.log.Infof("pause completed")
c.pauseContext.RemovePauseCondition(v1alpha1.PauseReasonBlueGreenPause)
}
}

// needsBlueGreenControllerPause indicates if the controller should manage the pause status of the blue-green rollout
func needsBlueGreenControllerPause(ro *v1alpha1.Rollout) bool {
if ro.Spec.Strategy.BlueGreen.AutoPromotionEnabled != nil {
if !*ro.Spec.Strategy.BlueGreen.AutoPromotionEnabled {
return true
}
}
return ro.Spec.Strategy.BlueGreen.AutoPromotionSeconds > 0
}

// scaleDownOldReplicaSetsForBlueGreen scales down old replica sets when rollout strategy is "Blue Green".
Expand Down Expand Up @@ -374,8 +401,8 @@ func (c *rolloutContext) calculateScaleUpPreviewCheckPoint(stableRSHash string)
if c.rollout.Status.BlueGreen.ScaleUpPreviewCheckPoint {
return c.rollout.Status.BlueGreen.ScaleUpPreviewCheckPoint
}
if !c.completedPrePromotionAnalysis() {
// do not set the checkpoint unless prePromotion was successful
if !c.completedPrePromotionAnalysis() || !c.pauseContext.CompletedBlueGreenPause() {
// do not set the checkpoint unless prePromotionAnalysis was successful and we completed our pause
return false
}
previewCountAvailable := *c.rollout.Spec.Strategy.BlueGreen.PreviewReplicaCount == replicasetutil.GetAvailableReplicaCountForReplicaSets([]*appsv1.ReplicaSet{c.newRS})
Expand Down
10 changes: 3 additions & 7 deletions rollout/bluegreen_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,9 +338,8 @@ func TestBlueGreenHandlePause(t *testing.T) {
defer f.Close()

r1 := newBlueGreenRollout("foo", 1, nil, "active", "preview")
r1.Spec.Strategy.BlueGreen.AutoPromotionEnabled = pointer.BoolPtr(false)
r2 := bumpVersion(r1)
r2.Spec.Strategy.BlueGreen.AutoPromotionSeconds = pointer.Int32Ptr(10)
r2.Spec.Strategy.BlueGreen.AutoPromotionSeconds = 10

rs1 := newReplicaSetWithStatus(r1, 1, 1)
rs2 := newReplicaSetWithStatus(r2, 1, 1)
Expand Down Expand Up @@ -373,9 +372,8 @@ func TestBlueGreenHandlePause(t *testing.T) {
defer f.Close()

r1 := newBlueGreenRollout("foo", 1, nil, "active", "preview")
r1.Spec.Strategy.BlueGreen.AutoPromotionEnabled = pointer.BoolPtr(false)
r2 := bumpVersion(r1)
r2.Spec.Strategy.BlueGreen.AutoPromotionSeconds = pointer.Int32Ptr(10)
r2.Spec.Strategy.BlueGreen.AutoPromotionSeconds = 10

rs1 := newReplicaSetWithStatus(r1, 1, 1)
rs2 := newReplicaSetWithStatus(r2, 1, 1)
Expand Down Expand Up @@ -427,9 +425,8 @@ func TestBlueGreenHandlePause(t *testing.T) {
defer f.Close()

r1 := newBlueGreenRollout("foo", 1, nil, "active", "preview")
r1.Spec.Strategy.BlueGreen.AutoPromotionEnabled = pointer.BoolPtr(false)
r2 := bumpVersion(r1)
r2.Spec.Strategy.BlueGreen.AutoPromotionSeconds = pointer.Int32Ptr(10)
r2.Spec.Strategy.BlueGreen.AutoPromotionSeconds = 10

rs1 := newReplicaSetWithStatus(r1, 1, 1)
rs2 := newReplicaSetWithStatus(r2, 1, 1)
Expand Down Expand Up @@ -843,7 +840,6 @@ func TestPreviewReplicaCountHandleScaleUpPreviewCheckPoint(t *testing.T) {

r1 := newBlueGreenRollout("foo", 5, nil, "active", "")
r1.Spec.Strategy.BlueGreen.PreviewReplicaCount = pointer.Int32Ptr(3)
r1.Spec.Strategy.BlueGreen.AutoPromotionEnabled = pointer.BoolPtr(false)
rs1 := newReplicaSetWithStatus(r1, 5, 5)
r2 := bumpVersion(r1)

Expand Down
2 changes: 1 addition & 1 deletion rollout/canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ func (c *rolloutContext) completedCurrentCanaryStep() bool {
}
switch {
case currentStep.Pause != nil:
return c.pauseContext.CompletedPauseStep(*currentStep.Pause)
return c.pauseContext.CompletedCanaryPauseStep(*currentStep.Pause)
case currentStep.SetCanaryScale != nil:
return replicasetutil.AtDesiredReplicaCountsForCanary(c.rollout, c.newRS, c.stableRS, c.otherRSs)
case currentStep.SetWeight != nil:
Expand Down
60 changes: 35 additions & 25 deletions rollout/pause.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,53 +125,63 @@ func getPauseCondition(rollout *v1alpha1.Rollout, reason v1alpha1.PauseReason) *
return nil
}

// completedPrePromotionAnalysis checks if the Pre Promotion Analysis has completed successfully or the rollout passed
// the auto promote seconds.
// completedPrePromotionAnalysis checks if the Pre Promotion Analysis has completed successfully
func (c *rolloutContext) completedPrePromotionAnalysis() bool {
if c.rollout.Spec.Strategy.BlueGreen == nil || c.rollout.Spec.Strategy.BlueGreen.PrePromotionAnalysis == nil {
return true
}

cond := getPauseCondition(c.rollout, v1alpha1.PauseReasonBlueGreenPause)
autoPromoteActiveServiceDelaySeconds := c.rollout.Spec.Strategy.BlueGreen.AutoPromotionSeconds
if autoPromoteActiveServiceDelaySeconds != nil && cond != nil {
switchDeadline := cond.StartTime.Add(time.Duration(*autoPromoteActiveServiceDelaySeconds) * time.Second)
now := metav1.Now()
if now.After(switchDeadline) {
return true
}
return false
}

currentAr := c.currentArs.BlueGreenPrePromotion
if currentAr != nil && currentAr.Status.Phase == v1alpha1.AnalysisPhaseSuccessful {
return true
}

return false
}

// CompletedBlueGreenPause returns true if we have already completed our automated pause, either
// because a human has resumed the rollout, or we surpassed autoPromotionSeconds.
func (pCtx *pauseContext) CompletedBlueGreenPause() bool {
rollout := pCtx.rollout
if pCtx.HasAddPause() {
// return false if we just added a pause condition as part of this reconciliation
return false
}
cond := getPauseCondition(rollout, v1alpha1.PauseReasonBlueGreenPause)

autoPromoteActiveServiceDelaySeconds := rollout.Spec.Strategy.BlueGreen.AutoPromotionSeconds
if autoPromoteActiveServiceDelaySeconds != nil && cond != nil {
switchDeadline := cond.StartTime.Add(time.Duration(*autoPromoteActiveServiceDelaySeconds) * time.Second)
now := metav1.Now()
if now.After(switchDeadline) {
pCtx.log.Info("Rollout has waited the duration of the autoPromoteActiveServiceDelaySeconds")
if rollout.Status.BlueGreen.ScaleUpPreviewCheckPoint {
return true
}
if !needsBlueGreenControllerPause(rollout) {
return true
}
pauseCond := getPauseCondition(rollout, v1alpha1.PauseReasonBlueGreenPause)
if rollout.Spec.Strategy.BlueGreen.AutoPromotionEnabled == nil || *rollout.Spec.Strategy.BlueGreen.AutoPromotionEnabled {
// autoPromotion is enabled. check if we surpassed the delay
autoPromotionSeconds := rollout.Spec.Strategy.BlueGreen.AutoPromotionSeconds
if autoPromotionSeconds == 0 {
return true
}
if rollout.Status.BlueGreen.ScaleUpPreviewCheckPoint {
return true
}
if pauseCond != nil {
switchDeadline := pauseCond.StartTime.Add(time.Duration(autoPromotionSeconds) * time.Second)
now := metav1.Now()
if now.After(switchDeadline) {
return true
}
return false
}
// we never paused the rollout
return false
} else {
// autoPromotion is disabled. the presence of a pause condition means human has not resumed it
if rollout.Status.ControllerPause {
return pauseCond == nil
}
// status.controllerPause has not yet been set
return false
}
return cond == nil && (rollout.Status.ControllerPause || rollout.Status.BlueGreen.ScaleUpPreviewCheckPoint)
}

func (pCtx *pauseContext) CompletedPauseStep(pause v1alpha1.RolloutPause) bool {
func (pCtx *pauseContext) CompletedCanaryPauseStep(pause v1alpha1.RolloutPause) bool {
rollout := pCtx.rollout
pauseCondition := getPauseCondition(rollout, v1alpha1.PauseReasonCanaryPauseStep)

Expand Down
Loading

0 comments on commit 4ac024e

Please sign in to comment.