Skip to content

Commit

Permalink
improve finalising logic for canary release-2
Browse files Browse the repository at this point in the history
Signed-off-by: yunbo <[email protected]>
  • Loading branch information
Funinu committed Sep 2, 2024
1 parent 15b589d commit 2695255
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 58 deletions.
82 changes: 32 additions & 50 deletions pkg/controller/rollout/rollout_canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,53 +386,29 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro
return true, nil
}
klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, canaryStatus.FinalisingStep)
// the steps. order is maitained by the nextStep
switch canaryStatus.FinalisingStep {
// call the FinalisingTrafficRouting function to:
// 1.restore stable service selector to select all pods
// 2.restore network api(ingress/ gateway api/ istio) configuration
// 3.delete canary service
case v1beta1.FinalisingStepTypeTrafficRouting:
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
if err != nil || !done {
canaryStatus.LastUpdateTime = tr.LastUpdateTime
return done, err
}

var retry bool
// the order of steps is maitained by calculating thenextStep
switch canaryStatus.FinalisingStep {
// set workload.pause=false; set workload.partition=0
case v1beta1.FinalisingStepTypeBatchRelease:
done, err := m.finalizingBatchRelease(c)
if err != nil || !done {
return done, err
}
retry, err = m.finalizingBatchRelease(c)
// delete batchRelease
case v1beta1.FinalisingStepTypeDeleteBR:
done, err := m.removeBatchRelease(c)
if err != nil {
klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
} else if !done {
return false, nil
}
retry, err = m.removeBatchRelease(c)
// restore the gateway resources (ingress/gatewayAPI/Istio), that means
// only stable Service will accept the traffic
case v1beta1.FinalisingStepTypeGateway:
retry, err := m.trafficRoutingManager.RestoreGateway(tr)
if err != nil || retry {
return false, err
}
retry, err = m.trafficRoutingManager.RestoreGateway(tr)
// restore the stable service
case v1beta1.FinalisingStepTypeStableService:
retry, err := m.trafficRoutingManager.RestoreStableService(tr)
if err != nil || retry {
return false, err
}
retry, err = m.trafficRoutingManager.RestoreStableService(tr)
// remove canary service
case v1beta1.FinalisingStepTypeRemoveCanaryService:
retry, err := m.trafficRoutingManager.RemoveCanaryService(tr)
if err != nil || retry {
return false, err
}
retry, err = m.trafficRoutingManager.RemoveCanaryService(tr)
}
if err != nil || retry {
return false, err
}
// current step is done, run the next step
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
Expand Down Expand Up @@ -549,45 +525,47 @@ func createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32,
return br
}

// bool means if we need retry; if error is not nil, always retry
func (m *canaryReleaseManager) removeBatchRelease(c *RolloutContext) (bool, error) {
batch := &v1beta1.BatchRelease{}
err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Rollout.Namespace, Name: c.Rollout.Name}, batch)
if err != nil && errors.IsNotFound(err) {
return true, nil
return false, nil
} else if err != nil {
klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name)
return false, err
return true, err
}
if !batch.DeletionTimestamp.IsZero() {
klog.Infof("rollout(%s/%s) BatchRelease is terminating, and wait a moment", c.Rollout.Namespace, c.Rollout.Name)
return false, nil
return true, nil
}

//delete batchRelease
err = m.Delete(context.TODO(), batch)
if err != nil {
klog.Errorf("rollout(%s/%s) delete BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
return true, err
}
klog.Infof("rollout(%s/%s) deleting BatchRelease, and wait a moment", c.Rollout.Namespace, c.Rollout.Name)
return false, nil
return true, nil
}

// bool means if we need retry; if error is not nil, always retry
func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool, error) {
br, err := m.fetchBatchRelease(c.Rollout.Namespace, c.Rollout.Name)
if err != nil {
if errors.IsNotFound(err) {
return true, nil
return false, nil
}
return false, err
return true, err
}
waitReady := c.WaitReady
// The Completed phase means batchRelease controller has processed all it
// should process. If BatchRelease phase is completed, we can do nothing.
if br.Spec.ReleasePlan.BatchPartition == nil &&
br.Status.Phase == v1beta1.RolloutPhaseCompleted {
klog.Infof("rollout(%s/%s) finalizing batchRelease(%s) done", c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status))
return true, nil
return false, nil
}

// If BatchPartition is nil, BatchRelease will directly resume workload via:
Expand All @@ -600,11 +578,11 @@ func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool,
switch br.Spec.ReleasePlan.FinalizingPolicy {
case v1beta1.WaitResumeFinalizingPolicyType:
if waitReady { // no need to patch again
return false, nil
return true, nil
}
default:
if !waitReady { // no need to patch again
return false, nil
return true, nil
}
}
}
Expand All @@ -618,10 +596,10 @@ func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool,
// Patch BatchPartition and FinalizingPolicy, BatchPartition always patch null here.
body := fmt.Sprintf(`{"spec":{"releasePlan":{"batchPartition":null,"finalizingPolicy":"%s"}}}`, policy)
if err = m.Patch(context.TODO(), br, client.RawPatch(types.MergePatchType, []byte(body))); err != nil {
return false, err
return true, err
}
klog.Infof("rollout(%s/%s) patch batchRelease(%s) success", c.Rollout.Namespace, c.Rollout.Name, body)
return false, nil
return true, nil
}

// syncBatchRelease sync status of br to canaryStatus, and sync rollout-id of canaryStatus to br.
Expand Down Expand Up @@ -649,19 +627,23 @@ func nextTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.Fin
//REVIEW - should we consider more complex scenarios?
// like, user rollbacks the workload and disables the Rollout at the same time?
switch reason {
// in rollback, we cannot remove selector of the stable service in the first step,
// which may cause some traffic to problematic new version, so we should route all traffic to stable service
// in the frist step
case v1beta1.FinaliseReasonRollback: // rollback
taskSequence = []v1beta1.FinalisingStepType{
v1beta1.FinalisingStepTypeGateway, // route all traffic to stable version
v1beta1.FinalisingStepTypeBatchRelease, // scale up old, scale down new
v1beta1.FinalisingStepTypeDeleteBR,
// v1beta1.FinalisingStepTypeTrafficRouting, // do cleaning works(restore stable Service, remove canary Service)
v1beta1.FinalisingStepTypeStableService,
v1beta1.FinalisingStepTypeRemoveCanaryService,
}
default: // others: success/disabled/deleting rollout
taskSequence = []v1beta1.FinalisingStepType{
v1beta1.FinalisingStepTypeTrafficRouting, // remove selector of stable Service
v1beta1.FinalisingStepTypeBatchRelease, // scale up new, scale down old
v1beta1.FinalisingStepTypeStableService,
v1beta1.FinalisingStepTypeGateway,
v1beta1.FinalisingStepTypeRemoveCanaryService,
v1beta1.FinalisingStepTypeBatchRelease, // scale up new, scale down old
v1beta1.FinalisingStepTypeDeleteBR,
}
}
Expand Down
8 changes: 4 additions & 4 deletions pkg/controller/rollout/rollout_progressing.go
Original file line number Diff line number Diff line change
Expand Up @@ -420,11 +420,11 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error)
}
// if no trafficRouting exists, simply remove batchRelease
if !c.Rollout.Spec.Strategy.HasTrafficRoutings() {
done, err := releaseManager.removeBatchRelease(c)
retry, err := releaseManager.removeBatchRelease(c)
if err != nil {
klog.Errorf("rollout(%s/%s) DoFinalising batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
} else if !done {
} else if retry {
return false, nil
}
return true, nil
Expand Down Expand Up @@ -458,11 +458,11 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error)
// canary deployment, for other release, the v2 pods won't be deleted immediately
// in both cases, only the stable pods (v1) accept the traffic
case v1beta1.FinalisingStepTypeDeleteBR:
done, err := releaseManager.removeBatchRelease(c)
retry, err := releaseManager.removeBatchRelease(c)
if err != nil {
klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
} else if !done {
} else if retry {
return false, nil
}
klog.Infof("rollout(%s/%s) in step (%s), and success", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep)
Expand Down
8 changes: 4 additions & 4 deletions pkg/controller/rollout/rollout_progressing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ func TestReconcileRolloutProgressing(t *testing.T) {
s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
s.CanaryStatus.CurrentStepIndex = 4
s.CanaryStatus.NextStepIndex = 0
s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeTrafficRouting
s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService
s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing)
cond.Reason = v1alpha1.ProgressingReasonFinalising
Expand Down Expand Up @@ -333,8 +333,8 @@ func TestReconcileRolloutProgressing(t *testing.T) {
obj.Status.CanaryStatus.CurrentStepIndex = 4
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
// given that the selector of stable Service is removed
// we will go on it the next step, i.e. patch BatchRelease
obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeTrafficRouting
// we will go on it the next step, i.e. patch restoreGateway
obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService
cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
cond.Reason = v1alpha1.ProgressingReasonFinalising
cond.Status = corev1.ConditionTrue
Expand All @@ -358,7 +358,7 @@ func TestReconcileRolloutProgressing(t *testing.T) {
s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
s.CanaryStatus.CurrentStepIndex = 4
s.CanaryStatus.NextStepIndex = 0
s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease
s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway
s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing)
cond.Reason = v1alpha1.ProgressingReasonFinalising
Expand Down

0 comments on commit 2695255

Please sign in to comment.