From 011762af60bba018e8532a127490f00b18c46db8 Mon Sep 17 00:00:00 2001
From: yunbo
Date: Fri, 7 Jun 2024 11:06:39 +0800
Subject: [PATCH] improve traffic strategy for canary and partition release

Signed-off-by: yunbo
---
 api/v1beta1/rollout_types.go                  |  19 ++
 pkg/controller/rollout/rollout_canary.go      | 161 ++++++++++++---
 pkg/controller/rollout/rollout_canary_test.go |   6 +
 pkg/controller/rollout/rollout_progressing.go |  26 ++-
 .../rollout/rollout_progressing_test.go       | 151 +++++++++++++-
 .../trafficrouting_controller.go              |   4 +-
 pkg/trafficrouting/manager.go                 | 184 +++++++++++++++---
 pkg/trafficrouting/manager_test.go            | 144 +++++---------
 .../custom_network_provider.go                |   6 +-
 9 files changed, 541 insertions(+), 160 deletions(-)

diff --git a/api/v1beta1/rollout_types.go b/api/v1beta1/rollout_types.go
index d1b7b750..cb064e62 100644
--- a/api/v1beta1/rollout_types.go
+++ b/api/v1beta1/rollout_types.go
@@ -17,6 +17,9 @@ limitations under the License.
 package v1beta1
 
 import (
+	"reflect"
+
+	apps "k8s.io/api/apps/v1"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -95,6 +98,22 @@ func (r *RolloutStrategy) GetRollingStyle() RollingStyleType {
 	return PartitionRollingStyle
 }
 
+// Checking EnableExtraWorkloadForCanary alone is not enough: for example, a v1alpha1 Rollout
+// can be converted to a v1beta1 Rollout with EnableExtraWorkloadForCanary set to true even if the
+// objectRef is a CloneSet (which doesn't support canary release)
+func IsRealPartition(rollout *Rollout) bool {
+	estimation := rollout.Spec.Strategy.GetRollingStyle()
+	if estimation == EmptyRollingStyle || estimation == BlueGreenRollingStyle {
+		return false
+	}
+	targetRef := rollout.Spec.WorkloadRef
+	if targetRef.APIVersion == apps.SchemeGroupVersion.String() && targetRef.Kind == reflect.TypeOf(apps.Deployment{}).Name() &&
+		estimation == CanaryRollingStyle {
+		return false
+	}
+	return true
+}
+
 // r.GetRollingStyle() == BlueGreenRollingStyle
 func (r *RolloutStrategy) IsBlueGreenRelease() bool {
 	return r.GetRollingStyle() == BlueGreenRollingStyle
diff --git a/pkg/controller/rollout/rollout_canary.go b/pkg/controller/rollout/rollout_canary.go
index 0b93d76d..3cefc17c 100644
--- a/pkg/controller/rollout/rollout_canary.go
+++ b/pkg/controller/rollout/rollout_canary.go
@@ -75,7 +75,7 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
 	currentStep := c.Rollout.Spec.Strategy.Canary.Steps[canaryStatus.CurrentStepIndex-1]
 	if currentStep.Traffic == nil && len(currentStep.Matches) == 0 {
 		tr := newTrafficRoutingContext(c)
-		done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr, false)
+		done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
 		c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
 		if err != nil {
 			return err
@@ -89,10 +89,59 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
 	switch canaryStatus.CurrentStepState {
 	// before CanaryStepStateUpgrade, handle some special cases, to prevent traffic loss
 	case v1beta1.CanaryStepStateInit:
-		// placeholder for the later traffic modification Pull Request
-		canaryStatus.NextStepIndex = util.NextBatchIndex(c.Rollout, canaryStatus.CurrentStepIndex)
+		klog.Infof("rollout(%s/%s) run canary strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateInit)
+		tr := newTrafficRoutingContext(c)
+		if currentStep.Traffic == nil && len(currentStep.Matches) == 0 {
+			canaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
+			klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
+				canaryStatus.CurrentStepIndex, v1beta1.CanaryStepStateInit, canaryStatus.CurrentStepState)
+			return nil
+		}
+
+		/*
+			The next check works around a bug in the ingress-nginx controller, https://github.com/kubernetes/ingress-nginx/issues/9635:
+			for a partition release, if the currentStep replicas is "100%", we can assume that all traffic should be routed to the canary pods
+		*/
+		if currentStep.Replicas.StrVal == "100%" && v1beta1.IsRealPartition(c.Rollout) {
+			klog.Infof("special case detected: rollout(%s/%s) restore stable Service", c.Rollout.Namespace, c.Rollout.Name)
+			done, err := m.trafficRoutingManager.RestoreStableService(tr)
+			if err != nil {
+				return err
+			} else if !done {
+				expectedTime := time.Now().Add(time.Duration(defaultGracePeriodSeconds) * time.Second)
+				c.RecheckTime = &expectedTime
+				return nil
+			}
+		}
+
+		/*
+			The next check handles the following scenario:
+			steps:
+			- replicas: 1 # first batch
+			  matches:
+			  - headers:
+				- name: user-agent
+				  type: Exact
+				  value: pc
+			we should patch the pod revision selector to the stable Service before CanaryStepStateUpgrade in the first batch;
+			otherwise, some traffic would be lost between CanaryStepStateUpgrade and CanaryStepStateTrafficRouting
+		*/
+		if canaryStatus.CurrentStepIndex == 1 {
+			klog.Infof("special case detected: rollout(%s/%s) patch stable Service", c.Rollout.Namespace, c.Rollout.Name)
+			done, err := m.trafficRoutingManager.PatchStableService(tr)
+			if err != nil {
+				return err
+			} else if !done {
+				expectedTime := time.Now().Add(time.Duration(defaultGracePeriodSeconds) * time.Second)
+				c.RecheckTime = &expectedTime
+				return nil
+			}
+		}
+
+		canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
 		canaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
-		fallthrough
+		klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
+			canaryStatus.CurrentStepIndex, v1beta1.CanaryStepStateInit, canaryStatus.CurrentStepState)
 	case v1beta1.CanaryStepStateUpgrade:
 		klog.Infof("rollout(%s/%s) run canary strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateUpgrade)
@@ -101,6 +150,9 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
 			return err
 		} else if done {
 			canaryStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting
+			if currentStep.Replicas.StrVal == "100%" && v1beta1.IsRealPartition(c.Rollout) {
+				canaryStatus.CurrentStepState = v1beta1.CanaryStepStateMetricsAnalysis
+			}
 			canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
 			klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
 				canaryStatus.CurrentStepIndex, v1beta1.CanaryStepStateUpgrade, canaryStatus.CurrentStepState)
@@ -216,6 +268,12 @@ func (m *canaryReleaseManager) doCanaryPaused(c *RolloutContext) (bool, error) {
 	canaryStatus := c.NewStatus.CanaryStatus
 	currentStep := c.Rollout.Spec.Strategy.Canary.Steps[canaryStatus.CurrentStepIndex-1]
 	steps := len(c.Rollout.Spec.Strategy.Canary.Steps)
+	// If this is the last step and its replicas is "100%", return true directly
+	if int32(steps) == canaryStatus.CurrentStepIndex {
+		if currentStep.Replicas != nil && currentStep.Replicas.StrVal == "100%" {
+			return true, nil
+		}
+	}
 	cond := util.GetRolloutCondition(*c.NewStatus, v1beta1.RolloutConditionProgressing)
 	// need manual confirmation
 	if currentStep.Pause.Duration == nil {
@@ -280,8 +338,9 @@ func (m *canaryReleaseManager) doCanaryJump(c *RolloutContext) (jumped bool) {
 // cleanup after rollout is completed or finished
 func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, error) {
+	canaryStatus := c.NewStatus.CanaryStatus
 	// when CanaryStatus is nil, which means canary action hasn't started yet, don't need doing cleanup
-	if c.NewStatus.CanaryStatus == nil {
+	if canaryStatus == nil {
 		return true, nil
 	}
 	// 1. rollout progressing complete, remove rollout progressing annotation in workload
@@ -290,33 +349,73 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro
 		return false, err
 	}
 	tr := newTrafficRoutingContext(c)
-	// 2. remove stable service the pod revision selector, so stable service will be selector all version pods.
-	done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr, true)
-	c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
-	if err != nil || !done {
-		return done, err
-	}
-	// 3. set workload.pause=false; set workload.partition=0
-	done, err = m.finalizingBatchRelease(c)
-	if err != nil || !done {
-		return done, err
-	}
-	// 4. modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods.
-	done, err = m.trafficRoutingManager.FinalisingTrafficRouting(tr, false)
-	c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
-	if err != nil || !done {
-		return done, err
-	}
-	// 5. delete batchRelease crd
-	done, err = m.removeBatchRelease(c)
-	if err != nil {
-		klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
-		return false, err
-	} else if !done {
-		return false, nil
+	klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, canaryStatus.FinalisingStep)
+	switch canaryStatus.FinalisingStep {
+	default:
+		canaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService
+		fallthrough
+
+	case v1beta1.FinalisingStepTypeStableService:
+		// restore stable service selector to select all pods [with grace time]
+		done, err := m.trafficRoutingManager.RestoreStableService(tr)
+		if err != nil || !done {
+			canaryStatus.LastUpdateTime = tr.LastUpdateTime
+			return done, err
+		}
+		canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
+		canaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway
+
+	case v1beta1.FinalisingStepTypeGateway:
+		// modify network api(ingress or gateway api) configuration
+		done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
+		if err != nil || !done {
+			canaryStatus.LastUpdateTime = tr.LastUpdateTime
+			return done, err
+		}
+		canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
+		canaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeCanaryService
+
+		/*
+			//TODO - As mentioned in the FinalisingTrafficRouting function,
+			we should wait a grace time between FinalisingStepTypeGateway and FinalisingStepTypeCanaryService
+			to avoid a very rare case which could cause minor traffic loss (especially with Istio), but it is difficult
+			to implement now.
+			However, we still reserve the FinalisingStepTypeCanaryService step here; instead of removing the
+			canary Service as expected (that has already been done in FinalisingStepTypeGateway), FinalisingStepTypeCanaryService
+			now simply waits a grace time between FinalisingStepTypeCanaryService and FinalisingStepTypeDeleteBR
+		*/
+	case v1beta1.FinalisingStepTypeCanaryService:
+		// wait a grace time for safety
+		if canaryStatus.LastUpdateTime != nil {
+			if verifyTime := canaryStatus.LastUpdateTime.Add(time.Second * time.Duration(3)); verifyTime.After(time.Now()) {
+				klog.Infof("restoring network configuration, but we need to wait %d seconds", 3)
+				return false, nil
+			}
+		}
+		canaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease
+
+	case v1beta1.FinalisingStepTypeBatchRelease:
+		// set workload.pause=false; set workload.partition=0
+		done, err := m.finalizingBatchRelease(c)
+		if err != nil || !done {
+			return done, err
+		}
+		canaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR
+
+	case v1beta1.FinalisingStepTypeDeleteBR:
+		// delete batchRelease crd
+		done, err := m.removeBatchRelease(c)
+		if err != nil {
+			klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
+			return false, err
+		} else if !done {
+			return false, nil
+		}
+		klog.Infof("rollout(%s/%s) doCanaryFinalising success", c.Rollout.Namespace, c.Rollout.Name)
+		return true, nil
 	}
-	klog.Infof("rollout(%s/%s) doCanaryFinalising success", c.Rollout.Namespace, c.Rollout.Name)
-	return true, nil
+
+	return false, nil
 }
 
 func (m *canaryReleaseManager) removeRolloutProgressingAnnotation(c *RolloutContext) error {
diff --git a/pkg/controller/rollout/rollout_canary_test.go b/pkg/controller/rollout/rollout_canary_test.go
index 573849cf..f245d697 100644
--- a/pkg/controller/rollout/rollout_canary_test.go
+++ b/pkg/controller/rollout/rollout_canary_test.go
@@ -63,6 +63,7 @@ func TestRunCanary(t *testing.T) {
 				obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1"
 				obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547"
 				obj.Status.CanaryStatus.CurrentStepIndex = 1
+				obj.Status.CanaryStatus.NextStepIndex = 2
 				obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
 				cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
 				cond.Reason = v1alpha1.ProgressingReasonInRolling
@@ -76,6 +77,7 @@ func TestRunCanary(t *testing.T) {
 				s.CanaryStatus.StableRevision = "pod-template-hash-v1"
 				s.CanaryStatus.CanaryRevision = "6f8cc56547"
 				s.CanaryStatus.CurrentStepIndex = 1
+				s.CanaryStatus.NextStepIndex = 2
 				s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
 				cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing)
 				cond.Reason = v1alpha1.ProgressingReasonInRolling
@@ -138,6 +140,7 @@ func TestRunCanary(t *testing.T) {
 				obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1"
 				obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547"
 				obj.Status.CanaryStatus.CurrentStepIndex = 1
+				obj.Status.CanaryStatus.NextStepIndex = 2
 				obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
 				cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
 				cond.Reason = v1alpha1.ProgressingReasonInRolling
@@ -183,6 +186,7 @@ func TestRunCanary(t *testing.T) {
 				s.CanaryStatus.CanaryReplicas = 1
 				s.CanaryStatus.CanaryReadyReplicas = 1
 				s.CanaryStatus.CurrentStepIndex = 1
+				s.CanaryStatus.NextStepIndex = 2
 				s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting
 				cond :=
util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonInRolling @@ -287,6 +291,7 @@ func TestRunCanaryPaused(t *testing.T) { obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1" obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547" obj.Status.CanaryStatus.CurrentStepIndex = 3 + obj.Status.CanaryStatus.NextStepIndex = 4 obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStatePaused return obj @@ -298,6 +303,7 @@ func TestRunCanaryPaused(t *testing.T) { obj.CanaryStatus.StableRevision = "pod-template-hash-v1" obj.CanaryStatus.CanaryRevision = "6f8cc56547" obj.CanaryStatus.CurrentStepIndex = 3 + obj.CanaryStatus.NextStepIndex = 4 obj.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" obj.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStatePaused return obj diff --git a/pkg/controller/rollout/rollout_progressing.go b/pkg/controller/rollout/rollout_progressing.go index c57bc752..59ae108c 100644 --- a/pkg/controller/rollout/rollout_progressing.go +++ b/pkg/controller/rollout/rollout_progressing.go @@ -45,6 +45,8 @@ type RolloutContext struct { RecheckTime *time.Time // wait stable workload pods ready WaitReady bool + // finalising reason + FinalizeReason string } // parameter1 retryReconcile, parameter2 error @@ -122,6 +124,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason) var done bool rolloutContext.WaitReady = true + rolloutContext.FinalizeReason = v1beta1.FinaliseReasonSuccess done, err = r.doFinalising(rolloutContext) if err != nil { return nil, err @@ -146,6 +149,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout case v1alpha1.ProgressingReasonCancelling: klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason) var done bool + rolloutContext.FinalizeReason = v1beta1.FinaliseReasonRollback done, err = r.doFinalising(rolloutContext) if err != nil { return nil, err @@ -407,22 +411,36 @@ func isRollingBackInBatches(rollout *v1beta1.Rollout, workload *util.Workload) b // 1. modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods // 2. remove batchRelease CR. 
func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) { - if len(c.Rollout.Spec.Strategy.Canary.TrafficRoutings) > 0 { + if c.Rollout.Spec.Strategy.HasTrafficRoutings() { // modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods tr := newTrafficRoutingContext(c) - done, err := r.trafficRoutingManager.FinalisingTrafficRouting(tr, false) - c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime + done, err := r.trafficRoutingManager.RouteAllTrafficToCanaryORStable(tr, v1beta1.FinaliseReasonContinuous) if err != nil || !done { + c.NewStatus.GetSubStatus().LastUpdateTime = tr.LastUpdateTime return done, err } } - done, err := r.canaryManager.removeBatchRelease(c) + + releaseManager, err := r.getReleaseManager(c.Rollout) + if err != nil { + return false, err + } + done, err := releaseManager.removeBatchRelease(c) if err != nil { klog.Errorf("rollout(%s/%s) DoFinalising batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) return false, err } else if !done { return false, nil } + // restore Service and network api + if c.Rollout.Spec.Strategy.HasTrafficRoutings() { + tr := newTrafficRoutingContext(c) + done, err = r.trafficRoutingManager.FinalisingTrafficRouting(tr) + c.NewStatus.GetSubStatus().LastUpdateTime = tr.LastUpdateTime + if err != nil || !done { + return done, err + } + } return true, nil } diff --git a/pkg/controller/rollout/rollout_progressing_test.go b/pkg/controller/rollout/rollout_progressing_test.go index 6513faff..107b1346 100644 --- a/pkg/controller/rollout/rollout_progressing_test.go +++ b/pkg/controller/rollout/rollout_progressing_test.go @@ -242,7 +242,9 @@ func TestReconcileRolloutProgressing(t *testing.T) { return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1} }, getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) { - return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()} + s1 := demoService.DeepCopy() + s1.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v1" + return []*corev1.Service{s1}, []*netv1.Ingress{demoIngress.DeepCopy()} }, getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease, *v1alpha1.TrafficRouting) { obj := rolloutDemo.DeepCopy() @@ -277,6 +279,11 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" s.CanaryStatus.CurrentStepIndex = 4 s.CanaryStatus.NextStepIndex = 0 + // the first finalizing step is restorStableService, given that the stable service has + // selector, and removing it will take a grace time (i.e. 
take at least 2 reconciles), + // therefore, after calling ReconcileRolloutProgressing once, the finalizing step must be + // restorStableService + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising @@ -291,6 +298,140 @@ func TestReconcileRolloutProgressing(t *testing.T) { }, { name: "ReconcileRolloutProgressing finalizing2", + getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { + dep1 := deploymentDemo.DeepCopy() + delete(dep1.Annotations, util.InRolloutProgressingAnnotation) + dep1.Status = apps.DeploymentStatus{ + ObservedGeneration: 2, + Replicas: 10, + UpdatedReplicas: 5, + ReadyReplicas: 10, + AvailableReplicas: 10, + } + rs1 := rsDemo.DeepCopy() + return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1} + }, + getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) { + return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()} + }, + getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease, *v1alpha1.TrafficRouting) { + obj := rolloutDemo.DeepCopy() + obj.Annotations[v1alpha1.TrafficRoutingAnnotation] = "tr-demo" + obj.Status.CanaryStatus.ObservedWorkloadGeneration = 2 + obj.Status.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1" + obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547" + obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" + obj.Status.CanaryStatus.CurrentStepIndex = 4 + obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + // given that the stable service has no selector, and removing it will immediately return + // ture. Therefore, we expect the finalizing step to be next step, i.e. 
restoreGateway + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService + cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonFinalising + cond.Status = corev1.ConditionTrue + util.SetRolloutCondition(&obj.Status, *cond) + br := batchDemo.DeepCopy() + br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{ + { + CanaryReplicas: intstr.FromInt(1), + }, + } + tr := demoTR.DeepCopy() + tr.Finalizers = []string{util.ProgressingRolloutFinalizer(rolloutDemo.Name)} + return obj, br, tr + }, + expectStatus: func() *v1beta1.RolloutStatus { + s := rolloutDemo.Status.DeepCopy() + s.CanaryStatus.ObservedWorkloadGeneration = 2 + s.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + s.CanaryStatus.StableRevision = "pod-template-hash-v1" + s.CanaryStatus.CanaryRevision = "6f8cc56547" + s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" + s.CanaryStatus.CurrentStepIndex = 4 + s.CanaryStatus.NextStepIndex = 0 + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway + s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonFinalising + cond.Status = corev1.ConditionTrue + util.SetRolloutCondition(s, *cond) + return s + }, + expectTr: func() *v1alpha1.TrafficRouting { + tr := demoTR.DeepCopy() + return tr + }, + }, + { + name: "ReconcileRolloutProgressing finalizing3", + getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { + dep1 := deploymentDemo.DeepCopy() + delete(dep1.Annotations, util.InRolloutProgressingAnnotation) + dep1.Status = apps.DeploymentStatus{ + ObservedGeneration: 2, + Replicas: 10, + UpdatedReplicas: 5, + ReadyReplicas: 10, + AvailableReplicas: 10, + } + rs1 := rsDemo.DeepCopy() + return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1} + }, + getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) { + return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()} + }, + getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease, *v1alpha1.TrafficRouting) { + obj := rolloutDemo.DeepCopy() + obj.Annotations[v1alpha1.TrafficRoutingAnnotation] = "tr-demo" + obj.Status.CanaryStatus.ObservedWorkloadGeneration = 2 + obj.Status.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1" + obj.Status.CanaryStatus.CanaryRevision = "6f8cc56547" + obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" + obj.Status.CanaryStatus.CurrentStepIndex = 4 + obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + // restoring gateway will return true immediately + // Rollout will go on to next step, i.e. 
deleteCanaryService + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway + cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonFinalising + cond.Status = corev1.ConditionTrue + util.SetRolloutCondition(&obj.Status, *cond) + br := batchDemo.DeepCopy() + br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{ + { + CanaryReplicas: intstr.FromInt(1), + }, + } + tr := demoTR.DeepCopy() + tr.Finalizers = []string{util.ProgressingRolloutFinalizer(rolloutDemo.Name)} + return obj, br, tr + }, + expectStatus: func() *v1beta1.RolloutStatus { + s := rolloutDemo.Status.DeepCopy() + s.CanaryStatus.ObservedWorkloadGeneration = 2 + s.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + s.CanaryStatus.StableRevision = "pod-template-hash-v1" + s.CanaryStatus.CanaryRevision = "6f8cc56547" + s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" + s.CanaryStatus.CurrentStepIndex = 4 + s.CanaryStatus.NextStepIndex = 0 + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeCanaryService + s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonFinalising + cond.Status = corev1.ConditionTrue + util.SetRolloutCondition(s, *cond) + return s + }, + expectTr: func() *v1alpha1.TrafficRouting { + tr := demoTR.DeepCopy() + return tr + }, + }, + { + name: "ReconcileRolloutProgressing finalizing4", getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { dep1 := deploymentDemo.DeepCopy() delete(dep1.Annotations, util.InRolloutProgressingAnnotation) @@ -316,6 +457,9 @@ func TestReconcileRolloutProgressing(t *testing.T) { obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" obj.Status.CanaryStatus.CurrentStepIndex = 4 obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + // current finalizing step is patchBatchRelease, since the batch release has been "patched" + // we expect the finalizing step to be next step, i.e. 
deleteBatchRelease + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising cond.Status = corev1.ConditionTrue @@ -338,6 +482,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" s.CanaryStatus.CurrentStepIndex = 4 s.CanaryStatus.NextStepIndex = 0 + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted cond2 := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond2.Reason = v1alpha1.ProgressingReasonFinalising @@ -373,6 +518,9 @@ func TestReconcileRolloutProgressing(t *testing.T) { obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" obj.Status.CanaryStatus.CurrentStepIndex = 4 obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + // deleteBatchRelease is the last step, and it won't wait a grace time + // after this step, this release should be succeeded + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising cond.Status = corev1.ConditionTrue @@ -389,6 +537,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.CurrentStepIndex = 4 s.CanaryStatus.NextStepIndex = 0 s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR cond2 := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond2.Reason = v1alpha1.ProgressingReasonCompleted cond2.Status = corev1.ConditionFalse diff --git a/pkg/controller/trafficrouting/trafficrouting_controller.go b/pkg/controller/trafficrouting/trafficrouting_controller.go index e3e44df1..6d0928ba 100644 --- a/pkg/controller/trafficrouting/trafficrouting_controller.go +++ b/pkg/controller/trafficrouting/trafficrouting_controller.go @@ -123,13 +123,13 @@ func (r *TrafficRoutingReconciler) Reconcile(ctx context.Context, req ctrl.Reque done, err = r.trafficRoutingManager.DoTrafficRouting(newTrafficRoutingContext(tr)) } case v1alpha1.TrafficRoutingPhaseFinalizing: - done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr), false) + done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr)) if done { newStatus.Phase = v1alpha1.TrafficRoutingPhaseHealthy newStatus.Message = "TrafficRouting is Healthy" } case v1alpha1.TrafficRoutingPhaseTerminating: - done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr), false) + done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr)) if done { // remove trafficRouting finalizer err = r.handleFinalizer(tr) diff --git a/pkg/trafficrouting/manager.go b/pkg/trafficrouting/manager.go index ad8bc57a..51c54ac4 100644 --- a/pkg/trafficrouting/manager.go +++ b/pkg/trafficrouting/manager.go @@ -126,7 +126,7 @@ func (m *Manager) DoTrafficRouting(c *TrafficRoutingContext) (bool, error) { if c.LastUpdateTime != nil { // wait seconds for network providers to consume the modification about workload, service and so on. 
 		if verifyTime := c.LastUpdateTime.Add(time.Second * time.Duration(trafficRouting.GracePeriodSeconds)); verifyTime.After(time.Now()) {
-			klog.Infof("%s update workload or service selector, and wait 3 seconds", c.Key)
+			klog.Infof("%s update workload or service selector, and wait %d seconds", c.Key, trafficRouting.GracePeriodSeconds)
 			return false, nil
 		}
 	}
@@ -138,6 +138,15 @@ func (m *Manager) DoTrafficRouting(c *TrafficRoutingContext) (bool, error) {
 		klog.Warningf("%s stableRevision or podTemplateHash can not be empty, and wait a moment", c.Key)
 		return false, nil
 	}
+	/*
+		Why is the serviceModified flag moved here?
+		The rationale is that when we create a canary Service, it is already instantiated with the appropriate selector.
+		If the stable Service has also had its selector patched previously, the logic will proceed to the EnsureRoutes function uninterrupted.
+		It's important to note the following: creating a new Service and updating the gateway resource within a single reconciliation loop
+		can lead to instability.
+		Therefore, by moving the serviceModified flag, we introduce a grace period between these two operations to ensure stability.
+	*/
+	serviceModified := false
 	// fetch canary service
 	err = m.Get(context.TODO(), client.ObjectKey{Namespace: c.Namespace, Name: canaryServiceName}, canaryService)
 	if err != nil && !errors.IsNotFound(err) {
@@ -148,9 +157,9 @@ func (m *Manager) DoTrafficRouting(c *TrafficRoutingContext) (bool, error) {
 		if err != nil {
 			return false, err
 		}
+		serviceModified = true
 	}
-	serviceModified := false
 	// patch canary service to only select the canary pods
 	if canaryService.Spec.Selector[c.RevisionLabelKey] != c.CanaryRevision {
 		body := fmt.Sprintf(`{"spec":{"selector":{"%s":"%s"}}}`, c.RevisionLabelKey, c.CanaryRevision)
@@ -180,6 +189,13 @@ func (m *Manager) DoTrafficRouting(c *TrafficRoutingContext) (bool, error) {
 		if serviceModified {
 			return false, nil
 		}
+	} else {
+		// for the end-to-end canary deployment scenario, and when DisableGenerateCanaryService is on,
+		// the selector is not needed, so we should remove it
+		verify, err := m.restoreStableService(c)
+		if err != nil || !verify {
+			return false, err
+		}
 	}
 
 	// new network provider, ingress or gateway
@@ -199,7 +215,7 @@ func (m *Manager) DoTrafficRouting(c *TrafficRoutingContext) (bool, error) {
 	return true, nil
 }
 
-func (m *Manager) FinalisingTrafficRouting(c *TrafficRoutingContext, onlyRestoreStableService bool) (bool, error) {
+func (m *Manager) FinalisingTrafficRouting(c *TrafficRoutingContext) (bool, error) {
 	if len(c.ObjectRef) == 0 {
 		return true, nil
 	}
@@ -235,41 +251,165 @@ func (m *Manager) FinalisingTrafficRouting(c *TrafficRoutingContext, onlyRestore
 	verify, err := m.restoreStableService(c)
 	if err != nil || !verify {
 		return false, err
-	} else if onlyRestoreStableService {
+	}
+	// modify network(ingress & gateway api) configuration, route all traffic to stable service
+	if err = trController.Finalise(context.TODO()); err != nil {
+		return false, err
+	}
+
+	//TODO - we should wait a grace time between Finalise and the removal of the canary service
+	// to avoid a very rare case which could cause minor traffic loss (especially with Istio).
+	// However, it seems difficult to implement this unless we rewrite the Finalise function.
+
+	// end-to-end deployment, don't remove the canary service,
+	// because the canary service is the stable service
+	if !c.OnlyTrafficRouting && !c.DisableGenerateCanaryService {
+		// remove canary service
+		err = m.Delete(context.TODO(), cService)
+		if err != nil &&
!errors.IsNotFound(err) { + klog.Errorf("%s remove canary service(%s) failed: %s", c.Key, cService.Name, err.Error()) + return false, err + } + klog.Infof("%s remove canary service(%s) success", c.Key, cService.Name) + } + return true, nil +} + +// Route All Traffic To Canary OR Stable based on the FinalizeReason +func (m *Manager) RouteAllTrafficToCanaryORStable(c *TrafficRoutingContext, FinalizeReason string) (bool, error) { + if len(c.ObjectRef) == 0 { + return true, nil + } + if c.OnlyTrafficRouting { return true, nil } - // First route 100% traffic to stable service - c.Strategy.Traffic = utilpointer.StringPtr("0%") - verify, err = trController.EnsureRoutes(context.TODO(), &c.Strategy) + klog.Infof("%s RouteAllTrafficTo with Reason %s", c.Key, FinalizeReason) + trafficToCanary := "" + switch FinalizeReason { + case v1beta1.FinaliseReasonRollback, v1beta1.FinaliseReasonContinuous: + trafficToCanary = "0%" + case v1beta1.FinaliseReasonSuccess: + trafficToCanary = "100%" + default: + return true, nil + } + + trafficRouting := c.ObjectRef[0] + if trafficRouting.GracePeriodSeconds <= 0 { + trafficRouting.GracePeriodSeconds = defaultGracePeriodSeconds + } + + cServiceName := getCanaryServiceName(trafficRouting.Service, c.OnlyTrafficRouting, c.DisableGenerateCanaryService) + trController, err := newNetworkProvider(m.Client, c, trafficRouting.Service, cServiceName) + if err != nil { + klog.Errorf("%s newTrafficRoutingController failed: %s", c.Key, err.Error()) + return false, err + } + + //fetch stable service + stableService := &corev1.Service{} + err = m.Get(context.TODO(), client.ObjectKey{Namespace: c.Namespace, Name: trafficRouting.Service}, stableService) + if err != nil { + if errors.IsNotFound(err) { + return true, nil + } + klog.Errorf("%s get stable service(%s) failed: %s", c.Key, trafficRouting.Service, err.Error()) + return false, err + } + if stableService.Spec.Selector[c.RevisionLabelKey] == "" && !c.DisableGenerateCanaryService { + return true, nil + } + + // Route 100% traffic to service that won't be scaled down + c.Strategy.Traffic = utilpointer.StringPtr(trafficToCanary) + verify, err := trController.EnsureRoutes(context.TODO(), &c.Strategy) if err != nil { return false, err } else if !verify { c.LastUpdateTime = &metav1.Time{Time: time.Now()} + klog.Infof("%s updated %s traffic, wait a grace time", c.Key, trafficToCanary) return false, nil } if c.LastUpdateTime != nil { - // After restore the stable service configuration, give network provider 3 seconds to react + // After restore the stable service configuration, give network provider seconds to react if verifyTime := c.LastUpdateTime.Add(time.Second * time.Duration(trafficRouting.GracePeriodSeconds)); verifyTime.After(time.Now()) { - klog.Infof("%s route 100% traffic to stable service, and wait a moment", c.Key) + klog.Infof("%s routed %s traffic to canary pods, but you need wait %d seconds", c.Key, trafficToCanary, trafficRouting.GracePeriodSeconds) return false, nil } } - // modify network(ingress & gateway api) configuration, route all traffic to stable service - if err = trController.Finalise(context.TODO()); err != nil { + // if all traffic is routed canary, then it is ok to restore stable Service + // it is mainly to avoid affect of bug in ingress-nginx controller + if trafficToCanary == "100%" { + verify, err := m.restoreStableService(c) + if err != nil || !verify { + return false, err + } + } + return true, nil +} + +func (m *Manager) PatchStableService(c *TrafficRoutingContext) (bool, error) { + if 
len(c.ObjectRef) == 0 { + return true, nil + } + trafficRouting := c.ObjectRef[0] + if trafficRouting.GracePeriodSeconds <= 0 { + trafficRouting.GracePeriodSeconds = defaultGracePeriodSeconds + } + if c.OnlyTrafficRouting { + return true, nil + } + + //fetch stable service + stableService := &corev1.Service{} + err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Namespace, Name: trafficRouting.Service}, stableService) + if err != nil { + klog.Errorf("%s get stable service(%s) failed: %s", c.Key, trafficRouting.Service, err.Error()) + // not found, wait a moment, retry + if errors.IsNotFound(err) { + return false, nil + } return false, err } - // end to end deployment, don't remove the canary service; - // because canary service is stable service - if !c.OnlyTrafficRouting && !c.DisableGenerateCanaryService { - // remove canary service - err = m.Delete(context.TODO(), cService) - if err != nil && !errors.IsNotFound(err) { - klog.Errorf("%s remove canary service(%s) failed: %s", c.Key, cService.Name, err.Error()) - return false, err + + if stableService.Spec.Selector[c.RevisionLabelKey] == c.StableRevision { + return true, nil + } + + // patch stable service to only select the stable pods + body := fmt.Sprintf(`{"spec":{"selector":{"%s":"%s"}}}`, c.RevisionLabelKey, c.StableRevision) + if err = m.Patch(context.TODO(), stableService, client.RawPatch(types.StrategicMergePatchType, []byte(body))); err != nil { + klog.Errorf("%s patch stable service(%s) selector failed: %s", c.Key, stableService.Name, err.Error()) + return false, err + } + klog.Infof("%s do something special: add stable service(%s) selector(%s=%s) success", c.Key, stableService.Name, c.RevisionLabelKey, c.StableRevision) + return true, nil +} + +func (m *Manager) RestoreStableService(c *TrafficRoutingContext) (bool, error) { + if len(c.ObjectRef) == 0 { + return true, nil + } + trafficRouting := c.ObjectRef[0] + if trafficRouting.GracePeriodSeconds <= 0 { + trafficRouting.GracePeriodSeconds = defaultGracePeriodSeconds + } + //fetch stable service + stableService := &corev1.Service{} + err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Namespace, Name: trafficRouting.Service}, stableService) + if err != nil { + if errors.IsNotFound(err) { + return true, nil } - klog.Infof("%s remove canary service(%s) success", c.Key, cService.Name) + klog.Errorf("%s get stable service(%s) failed: %s", c.Key, trafficRouting.Service, err.Error()) + return false, err + } + // restore stable Service + verify, err := m.restoreStableService(c) + if err != nil || !verify { + return false, err } return true, nil } @@ -364,7 +504,7 @@ func (m *Manager) restoreStableService(c *TrafficRoutingContext) (bool, error) { klog.Errorf("%s patch stable service(%s) failed: %s", c.Key, trafficRouting.Service, err.Error()) return false, err } - klog.Infof("remove %s stable service(%s) pod revision selector, and wait a moment", c.Key, trafficRouting.Service) + klog.Infof("removed %s stable service(%s) pod revision selector, and wait %d seconds (log once)", c.Key, trafficRouting.Service, trafficRouting.GracePeriodSeconds) c.LastUpdateTime = &metav1.Time{Time: time.Now()} return false, nil } @@ -373,7 +513,7 @@ func (m *Manager) restoreStableService(c *TrafficRoutingContext) (bool, error) { } // After restore the stable service configuration, give network provider 3 seconds to react if verifyTime := c.LastUpdateTime.Add(time.Second * time.Duration(trafficRouting.GracePeriodSeconds)); verifyTime.After(time.Now()) { - klog.Infof("%s restoring stable 
service(%s), and wait a moment", c.Key, trafficRouting.Service) + klog.Infof("%s restoring stable service(%s), and wait %d seconds", c.Key, trafficRouting.Service, trafficRouting.GracePeriodSeconds) return false, nil } klog.Infof("%s doFinalising stable service(%s) success", c.Key, trafficRouting.Service) diff --git a/pkg/trafficrouting/manager_test.go b/pkg/trafficrouting/manager_test.go index 8ff7ce88..e5a377e6 100644 --- a/pkg/trafficrouting/manager_test.go +++ b/pkg/trafficrouting/manager_test.go @@ -774,6 +774,7 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) { getRollout: func() (*v1beta1.Rollout, *util.Workload) { obj := demoIstioRollout.DeepCopy() obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-10 * time.Second)} + obj.Spec.Strategy.Canary.TrafficRoutings[0].GracePeriodSeconds = 1 return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} }, expectUnstructureds: func() []*unstructured.Unstructured { @@ -803,7 +804,6 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) { objects = append(objects, u) return objects }, - // Rollout(/rollout-demo) is doing trafficRouting({"traffic":"5%"}), and wait a moment expectDone: true, }, { @@ -833,6 +833,7 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) { obj := demoIstioRollout.DeepCopy() // set DisableGenerateCanaryService as true obj.Spec.Strategy.Canary.DisableGenerateCanaryService = true + obj.Spec.Strategy.Canary.TrafficRoutings[0].GracePeriodSeconds = 1 obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-10 * time.Second)} return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} }, @@ -863,7 +864,6 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) { objects = append(objects, u) return objects }, - // Rollout(/rollout-demo) is doing trafficRouting({"traffic":"5%"}), and wait a moment expectDone: true, }, } @@ -897,11 +897,22 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) { if err != nil { t.Fatalf("InitializeTrafficRouting failed: %s", err) } + // now we need to wait at least 2x grace time to keep traffic stable: + // create the canary service -> grace time -> update the gateway -> grace time + // therefore, before both grace times are over, DoTrafficRouting should return false + // firstly, create the canary Service, before the grace time over, return false _, err = manager.DoTrafficRouting(c) if err != nil { t.Fatalf("DoTrafficRouting failed: %s", err) } - // may return false due to in the course of doing trafficRouting, let's do it again + time.Sleep(1 * time.Second) + // secondly, update the gateway, before the grace time over, return false + _, err = manager.DoTrafficRouting(c) + if err != nil { + t.Fatalf("DoTrafficRouting failed: %s", err) + } + time.Sleep(1 * time.Second) + // now, both grace times are over, it should be true done, err := manager.DoTrafficRouting(c) if err != nil { t.Fatalf("DoTrafficRouting failed: %s", err) @@ -920,12 +931,12 @@ func TestDoTrafficRoutingWithIstio(t *testing.T) { func TestFinalisingTrafficRouting(t *testing.T) { cases := []struct { - name string - getObj func() ([]*corev1.Service, []*netv1.Ingress) - getRollout func() (*v1beta1.Rollout, *util.Workload) - onlyRestoreStableService bool - expectObj func() ([]*corev1.Service, []*netv1.Ingress) - expectDone bool + name string + getObj func() ([]*corev1.Service, []*netv1.Ingress) + getRollout func() (*v1beta1.Rollout, *util.Workload) + onlyTrafficRouting bool + expectObj func() ([]*corev1.Service, []*netv1.Ingress) + expectDone bool }{ { 
name: "FinalisingTrafficRouting test1", @@ -950,7 +961,6 @@ func TestFinalisingTrafficRouting(t *testing.T) { obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-time.Hour)} return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} }, - onlyRestoreStableService: true, expectObj: func() ([]*corev1.Service, []*netv1.Ingress) { s1 := demoService.DeepCopy() s2 := demoService.DeepCopy() @@ -964,6 +974,8 @@ func TestFinalisingTrafficRouting(t *testing.T) { c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} }, + // the Serive has selector on it, which takes a grace time to remove + // that's why expectDone is false expectDone: false, }, { @@ -981,43 +993,6 @@ func TestFinalisingTrafficRouting(t *testing.T) { c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} }, - getRollout: func() (*v1beta1.Rollout, *util.Workload) { - obj := demoRollout.DeepCopy() - obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted - obj.Status.CanaryStatus.CurrentStepIndex = 4 - return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} - }, - onlyRestoreStableService: true, - expectObj: func() ([]*corev1.Service, []*netv1.Ingress) { - s1 := demoService.DeepCopy() - s2 := demoService.DeepCopy() - s2.Name = "echoserver-canary" - s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" - c1 := demoIngress.DeepCopy() - c2 := demoIngress.DeepCopy() - c2.Name = "echoserver-canary" - c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" - c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100" - c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" - return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} - }, - expectDone: false, - }, - { - name: "FinalisingTrafficRouting test3", - getObj: func() ([]*corev1.Service, []*netv1.Ingress) { - s1 := demoService.DeepCopy() - s2 := demoService.DeepCopy() - s2.Name = "echoserver-canary" - s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" - c1 := demoIngress.DeepCopy() - c2 := demoIngress.DeepCopy() - c2.Name = "echoserver-canary" - c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" - c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100" - c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" - return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} - }, getRollout: func() (*v1beta1.Rollout, *util.Workload) { obj := demoRollout.DeepCopy() obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted @@ -1025,83 +1000,57 @@ func TestFinalisingTrafficRouting(t *testing.T) { obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-10 * time.Second)} return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} }, - onlyRestoreStableService: true, expectObj: func() ([]*corev1.Service, []*netv1.Ingress) { s1 := demoService.DeepCopy() - s2 := demoService.DeepCopy() - s2.Name = "echoserver-canary" - s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" c1 := demoIngress.DeepCopy() - c2 := demoIngress.DeepCopy() - c2.Name = "echoserver-canary" - c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = 
"true" - c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100" - c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" - return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + return []*corev1.Service{s1}, []*netv1.Ingress{c1} }, + // the Serive has selector removed, therefore code will go on and return ture expectDone: true, }, { - name: "FinalisingTrafficRouting test4", + name: "canary Service already clear", getObj: func() ([]*corev1.Service, []*netv1.Ingress) { s1 := demoService.DeepCopy() - s2 := demoService.DeepCopy() - s2.Name = "echoserver-canary" - s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" + s1.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v1" c1 := demoIngress.DeepCopy() - c2 := demoIngress.DeepCopy() - c2.Name = "echoserver-canary" - c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" - c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100" - c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" - return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + return []*corev1.Service{s1}, []*netv1.Ingress{c1} }, getRollout: func() (*v1beta1.Rollout, *util.Workload) { obj := demoRollout.DeepCopy() obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted obj.Status.CanaryStatus.CurrentStepIndex = 4 - obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-3 * time.Second)} + obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-time.Hour)} return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} }, - onlyRestoreStableService: false, expectObj: func() ([]*corev1.Service, []*netv1.Ingress) { s1 := demoService.DeepCopy() - s2 := demoService.DeepCopy() - s2.Name = "echoserver-canary" - s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" + s1.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v1" c1 := demoIngress.DeepCopy() - c2 := demoIngress.DeepCopy() - c2.Name = "echoserver-canary" - c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" - c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "0" - c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" - return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + return []*corev1.Service{s1}, []*netv1.Ingress{c1} }, - expectDone: false, + expectDone: true, }, { - name: "FinalisingTrafficRouting test5", + name: "OnlyTrafficRouting true", getObj: func() ([]*corev1.Service, []*netv1.Ingress) { s1 := demoService.DeepCopy() - s2 := demoService.DeepCopy() - s2.Name = "echoserver-canary" - s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" c1 := demoIngress.DeepCopy() c2 := demoIngress.DeepCopy() c2.Name = "echoserver-canary" c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" - c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "0" + c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100" c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" - return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + return []*corev1.Service{s1}, []*netv1.Ingress{c1, c2} }, getRollout: func() (*v1beta1.Rollout, *util.Workload) { obj := demoRollout.DeepCopy() 
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted obj.Status.CanaryStatus.CurrentStepIndex = 4 - obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-3 * time.Second)} + obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-time.Hour)} return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} }, - onlyRestoreStableService: false, + onlyTrafficRouting: true, expectObj: func() ([]*corev1.Service, []*netv1.Ingress) { s1 := demoService.DeepCopy() c1 := demoIngress.DeepCopy() @@ -1125,18 +1074,19 @@ func TestFinalisingTrafficRouting(t *testing.T) { newStatus := rollout.Status.DeepCopy() currentStep := rollout.Spec.Strategy.Canary.Steps[newStatus.CanaryStatus.CurrentStepIndex-1] c := &TrafficRoutingContext{ - Key: fmt.Sprintf("Rollout(%s/%s)", rollout.Namespace, rollout.Name), - Namespace: rollout.Namespace, - ObjectRef: rollout.Spec.Strategy.Canary.TrafficRoutings, - Strategy: currentStep.TrafficRoutingStrategy, - OwnerRef: *metav1.NewControllerRef(rollout, v1beta1.SchemeGroupVersion.WithKind("Rollout")), - RevisionLabelKey: workload.RevisionLabelKey, - StableRevision: newStatus.CanaryStatus.StableRevision, - CanaryRevision: newStatus.CanaryStatus.PodTemplateHash, - LastUpdateTime: newStatus.CanaryStatus.LastUpdateTime, + Key: fmt.Sprintf("Rollout(%s/%s)", rollout.Namespace, rollout.Name), + Namespace: rollout.Namespace, + ObjectRef: rollout.Spec.Strategy.Canary.TrafficRoutings, + Strategy: currentStep.TrafficRoutingStrategy, + OwnerRef: *metav1.NewControllerRef(rollout, v1beta1.SchemeGroupVersion.WithKind("Rollout")), + RevisionLabelKey: workload.RevisionLabelKey, + StableRevision: newStatus.CanaryStatus.StableRevision, + CanaryRevision: newStatus.CanaryStatus.PodTemplateHash, + LastUpdateTime: newStatus.CanaryStatus.LastUpdateTime, + OnlyTrafficRouting: cs.onlyTrafficRouting, } manager := NewTrafficRoutingManager(client) - done, err := manager.FinalisingTrafficRouting(c, cs.onlyRestoreStableService) + done, err := manager.FinalisingTrafficRouting(c) if err != nil { t.Fatalf("DoTrafficRouting failed: %s", err) } diff --git a/pkg/trafficrouting/network/customNetworkProvider/custom_network_provider.go b/pkg/trafficrouting/network/customNetworkProvider/custom_network_provider.go index 5778e4a3..ff170772 100644 --- a/pkg/trafficrouting/network/customNetworkProvider/custom_network_provider.go +++ b/pkg/trafficrouting/network/customNetworkProvider/custom_network_provider.go @@ -112,9 +112,9 @@ func (r *customController) EnsureRoutes(ctx context.Context, strategy *v1beta1.T done := true // *strategy.Weight == 0 indicates traffic routing is doing finalising and tries to route whole traffic to stable service // then directly do finalising - if strategy.Traffic != nil && *strategy.Traffic == "0%" { - return true, nil - } + // if strategy.Traffic != nil && *strategy.Traffic == "0%" { + // return true, nil + // } var err error customNetworkRefList := make([]*unstructured.Unstructured, len(r.conf.TrafficConf))
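
The patch replaces the old single-pass finalising logic with a per-step state machine (restore the stable Service, then the gateway/ingress, then an extra grace wait for the canary Service step, then finalize and delete the BatchRelease), advancing at most one step per reconciliation and waiting a grace period around the traffic-sensitive transitions. Below is a minimal, self-contained Go sketch of that progression for reference only; the FinalisingStep type, the step constants, and the advance helper are illustrative placeholders, not the actual kruise-rollout API.

package main

import (
	"fmt"
	"time"
)

// FinalisingStep is an illustrative stand-in for v1beta1.FinalisingStepType.
type FinalisingStep string

const (
	stepStableService FinalisingStep = "RestoreStableService"
	stepGateway       FinalisingStep = "RestoreGateway"
	stepCanaryService FinalisingStep = "DeleteCanaryService"
	stepBatchRelease  FinalisingStep = "PatchBatchRelease"
	stepDeleteBR      FinalisingStep = "DeleteBatchRelease"
)

// advance mimics the shape of the new doCanaryFinalising switch: each call performs
// at most one finalising step, and a grace period separates the traffic-sensitive steps.
func advance(step FinalisingStep, lastUpdate time.Time, grace time.Duration) (FinalisingStep, bool) {
	waited := time.Since(lastUpdate) >= grace
	switch step {
	case stepStableService:
		// restore the stable Service selector so it selects pods of every revision,
		// then wait a grace time before touching the gateway
		if !waited {
			return step, false
		}
		return stepGateway, false
	case stepGateway:
		// restore the ingress/gateway configuration and remove the canary Service
		if !waited {
			return step, false
		}
		return stepCanaryService, false
	case stepCanaryService:
		// the canary Service is already gone; this step only adds one more grace period
		if !waited {
			return step, false
		}
		return stepBatchRelease, false
	case stepBatchRelease:
		// set workload.pause=false and workload.partition=0
		return stepDeleteBR, false
	case stepDeleteBR:
		// delete the BatchRelease CR; finalising is complete after this
		return step, true
	default:
		return stepStableService, false
	}
}

func main() {
	step := stepStableService
	done := false
	lastUpdate := time.Now().Add(-5 * time.Second) // pretend the grace period already elapsed
	for !done {
		step, done = advance(step, lastUpdate, 3*time.Second)
		fmt.Printf("finalising step: %s, done: %v\n", step, done)
	}
}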