From 6425b7704508ec6ae4e764e666f857c711b06c3e Mon Sep 17 00:00:00 2001
From: yunbo
Date: Fri, 25 Oct 2024 16:11:59 +0800
Subject: [PATCH] support bluegreen release: release logic

Signed-off-by: yunbo
---
 api/v1beta1/rollout_types.go                  |  23 +-
 pkg/controller/rollout/rollout_bluegreen.go   | 471 ++++++++++++++++++
 .../rollout/rollout_bluegreen_test.go         | 331 ++++++++++++
 pkg/controller/rollout/rollout_canary.go      | 241 ++-------
 pkg/controller/rollout/rollout_controller.go  |   6 +
 .../rollout/rollout_controller_test.go        |  67 +++
 pkg/controller/rollout/rollout_progressing.go |  28 +-
 .../rollout/rollout_progressing_test.go       |  20 +-
 .../rollout/rollout_releaseManager.go         | 165 +++++-
 pkg/trafficrouting/manager.go                 |  40 +-
 pkg/trafficrouting/manager_test.go            | 177 ++++++-
 pkg/util/controller_finder.go                 |  30 +-
 pkg/util/controller_finder_test.go            | 450 +++++++++++++++++
 .../rollout_create_update_handler.go          |   4 +
 14 files changed, 1793 insertions(+), 260 deletions(-)
 create mode 100644 pkg/controller/rollout/rollout_bluegreen.go
 create mode 100644 pkg/controller/rollout/rollout_bluegreen_test.go
 create mode 100644 pkg/util/controller_finder_test.go

diff --git a/api/v1beta1/rollout_types.go b/api/v1beta1/rollout_types.go
index b3896128..77f7f2df 100644
--- a/api/v1beta1/rollout_types.go
+++ b/api/v1beta1/rollout_types.go
@@ -561,27 +561,22 @@ type FinalisingStepType string
 
 //goland:noinspection GoUnusedConst
 const (
-	// Route all traffic to stable or new version based on FinaliseReason (for bluegreen)
-	FinalisingStepTypeRouteAllTraffic FinalisingStepType = "RouteAllTraffic"
+	// Route all traffic to new version (for bluegreen)
+	FinalisingStepRouteTrafficToNew FinalisingStepType = "RouteAllTraffic"
+	// Restore the GatewayAPI/Ingress/Istio
+	FinalisingStepRouteTrafficToStable FinalisingStepType = "RestoreGateway"
 	// Restore the stable Service, i.e. remove corresponding selector
-	FinalisingStepTypeStableService FinalisingStepType = "RestoreStableService"
-	// Remove the canary Service
-	FinalisingStepTypeRemoveCanaryService FinalisingStepType = "RemoveCanaryService"
+	FinalisingStepRestoreStableService FinalisingStepType = "RestoreStableService"
+	// Remove the Canary Service
+	FinalisingStepRemoveCanaryService FinalisingStepType = "RemoveCanaryService"
 	// Patch Batch Release to scale down (exception: the canary Deployment will be
 	// scaled down in FinalisingStepTypeDeleteBR step)
 	// For Both BlueGreenStrategy and CanaryStrategy:
 	// set workload.pause=false, set workload.partition=0
-	FinalisingStepTypeBatchRelease FinalisingStepType = "PatchBatchRelease"
-
-	// Execute the FinalisingTrafficRouting function
-	FinalisingStepTypeTrafficRouting FinalisingStepType = "FinalisingTrafficRouting"
-	// Restore the GatewayAPI/Ingress/Istio
-	FinalisingStepTypeGateway FinalisingStepType = "RestoreGateway"
-	// Delete Canary Service
-	FinalisingStepTypeDeleteCanaryService FinalisingStepType = "DeleteCanaryService"
+	FinalisingStepPatchBatchRelease FinalisingStepType = "PatchBatchRelease"
 
 	// Delete Batch Release
-	FinalisingStepTypeDeleteBR FinalisingStepType = "DeleteBatchRelease"
+	FinalisingStepRemoveBatchRelease FinalisingStepType = "RemoveBatchRelease"
 	// All needed work done
 	FinalisingStepTypeEnd FinalisingStepType = "END"
 )
diff --git a/pkg/controller/rollout/rollout_bluegreen.go b/pkg/controller/rollout/rollout_bluegreen.go
new file mode 100644
index 00000000..a0ab8963
--- /dev/null
+++ b/pkg/controller/rollout/rollout_bluegreen.go
@@ -0,0 +1,471 @@
+/*
+Copyright 2022 The Kruise Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package rollout
+
+import (
+	"context"
+	"fmt"
+	"reflect"
+	"time"
+
+	"github.com/openkruise/rollouts/api/v1alpha1"
+	"github.com/openkruise/rollouts/api/v1beta1"
+	"github.com/openkruise/rollouts/pkg/trafficrouting"
+	"github.com/openkruise/rollouts/pkg/util"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/tools/record"
+	"k8s.io/klog/v2"
+	utilpointer "k8s.io/utils/pointer"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+type blueGreenReleaseManager struct {
+	client.Client
+	trafficRoutingManager *trafficrouting.Manager
+	recorder              record.EventRecorder
+}
+
+func (m *blueGreenReleaseManager) runCanary(c *RolloutContext) error {
+	blueGreenStatus := c.NewStatus.BlueGreenStatus
+	if br, err := fetchBatchRelease(m.Client, c.Rollout.Namespace, c.Rollout.Name); err != nil && !errors.IsNotFound(err) {
+		klog.Errorf("rollout(%s/%s) fetch batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
+		return err
+	} else if err == nil {
+		// This call does two important things:
+		// - sync status from br to Rollout: for better observability;
+		// - sync rollout-id from Rollout to br: so that BatchRelease
+		//   relabels pods in the scene where only the rollout-id is changed.
+		if err = m.syncBatchRelease(br, blueGreenStatus); err != nil {
+			klog.Errorf("rollout(%s/%s) sync batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
+			return err
+		}
+	}
+	if blueGreenStatus.PodTemplateHash == "" {
+		blueGreenStatus.PodTemplateHash = c.Workload.PodTemplateHash
+	}
+
+	if m.doCanaryJump(c) {
+		klog.Infof("rollout(%s/%s) canary step jumped", c.Rollout.Namespace, c.Rollout.Name)
+		return nil
+	}
+	// When a step that configured traffic routing is followed by steps that only roll out the workload,
+	// we need to clean up the canary-related traffic resources first and then roll out the rest of the batches.
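+	// note: CurrentStepIndex is 1-based, so the current step is Steps[CurrentStepIndex-1]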
+	currentStep := c.Rollout.Spec.Strategy.BlueGreen.Steps[blueGreenStatus.CurrentStepIndex-1]
+	if currentStep.Traffic == nil && len(currentStep.Matches) == 0 {
+		tr := newTrafficRoutingContext(c)
+		done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
+		blueGreenStatus.LastUpdateTime = tr.LastUpdateTime
+		if err != nil {
+			return err
+		} else if !done {
+			klog.Infof("rollout(%s/%s) cleaning up canary-related resources", c.Rollout.Namespace, c.Rollout.Name)
+			expectedTime := time.Now().Add(tr.RecheckDuration)
+			c.RecheckTime = &expectedTime
+			return nil
+		}
+	}
+
+	switch blueGreenStatus.CurrentStepState {
+	// before CanaryStepStateUpgrade, handle some special cases, to prevent traffic loss
+	case v1beta1.CanaryStepStateInit:
+		klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateInit)
+		tr := newTrafficRoutingContext(c)
+		if currentStep.Traffic != nil || len(currentStep.Matches) > 0 {
+			//TODO - consider istio subsets
+			if blueGreenStatus.CurrentStepIndex == 1 {
+				klog.Infof("Before the first batch, rollout(%s/%s) patch stable Service", c.Rollout.Namespace, c.Rollout.Name)
+				retry, err := m.trafficRoutingManager.PatchStableService(tr)
+				if err != nil {
+					return err
+				} else if retry {
+					expectedTime := time.Now().Add(tr.RecheckDuration)
+					c.RecheckTime = &expectedTime
+					return nil
+				}
+			}
+		}
+
+		blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
+		blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
+		klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
+			blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateInit, blueGreenStatus.CurrentStepState)
+		fallthrough
+
+	case v1beta1.CanaryStepStateUpgrade:
+		klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateUpgrade)
+		done, err := m.doCanaryUpgrade(c)
+		if err != nil {
+			return err
+		} else if done {
+			blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting
+			blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
+			klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
+				blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateUpgrade, blueGreenStatus.CurrentStepState)
+		}
+
+	case v1beta1.CanaryStepStateTrafficRouting:
+		klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateTrafficRouting)
+		tr := newTrafficRoutingContext(c)
+		done, err := m.trafficRoutingManager.DoTrafficRouting(tr)
+		blueGreenStatus.LastUpdateTime = tr.LastUpdateTime
+		if err != nil {
+			return err
+		} else if done {
+			blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
+			blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateMetricsAnalysis
+			klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
+				blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateTrafficRouting, blueGreenStatus.CurrentStepState)
+		}
+		expectedTime := time.Now().Add(time.Duration(defaultGracePeriodSeconds) * time.Second)
+		c.RecheckTime = &expectedTime
+
+	case v1beta1.CanaryStepStateMetricsAnalysis:
+		klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateMetricsAnalysis)
+		done, err := m.doCanaryMetricsAnalysis(c)
+		if err != nil {
+			return err
+		} else if done {
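+			// analysis is done; move to the Paused state, which waits for manual confirmation or the configured pause duration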
+			blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStatePaused
+			klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
+				blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateMetricsAnalysis, blueGreenStatus.CurrentStepState)
+		}
+
+	case v1beta1.CanaryStepStatePaused:
+		klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStatePaused)
+		done, err := m.doCanaryPaused(c)
+		if err != nil {
+			return err
+		} else if done {
+			blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
+			blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateReady
+			klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
+				blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStatePaused, blueGreenStatus.CurrentStepState)
+		}
+
+	case v1beta1.CanaryStepStateReady:
+		klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateReady)
+		blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
+		// run next step
+		if len(c.Rollout.Spec.Strategy.BlueGreen.Steps) > int(blueGreenStatus.CurrentStepIndex) {
+			blueGreenStatus.CurrentStepIndex++
+			blueGreenStatus.NextStepIndex = util.NextBatchIndex(c.Rollout, blueGreenStatus.CurrentStepIndex)
+			blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateInit
+			klog.Infof("rollout(%s/%s) bluegreen step from(%d) -> to(%d)", c.Rollout.Namespace, c.Rollout.Name, blueGreenStatus.CurrentStepIndex-1, blueGreenStatus.CurrentStepIndex)
+			return nil
+		}
+		// completed
+		blueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted
+		klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s), run all steps", c.Rollout.Namespace, c.Rollout.Name,
+			blueGreenStatus.CurrentStepIndex, v1beta1.CanaryStepStateReady, blueGreenStatus.CurrentStepState)
+		fallthrough
+	// canary completed
+	case v1beta1.CanaryStepStateCompleted:
+		klog.Infof("rollout(%s/%s) run bluegreen strategy, and state(%s)", c.Rollout.Namespace, c.Rollout.Name, v1beta1.CanaryStepStateCompleted)
+	}
+
+	return nil
+}
+
+func (m *blueGreenReleaseManager) doCanaryUpgrade(c *RolloutContext) (bool, error) {
+	// verify whether batchRelease configuration is the latest
+	steps := len(c.Rollout.Spec.Strategy.BlueGreen.Steps)
+	blueGreenStatus := c.NewStatus.BlueGreenStatus
+	cond := util.GetRolloutCondition(*c.NewStatus, v1beta1.RolloutConditionProgressing)
+	cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and upgrade workload to new version", blueGreenStatus.CurrentStepIndex, steps)
+	c.NewStatus.Message = cond.Message
+	// run batch release to upgrade the workloads
+	done, br, err := runBatchRelease(m, c.Rollout, getRolloutID(c.Workload), blueGreenStatus.CurrentStepIndex, c.Workload.IsInRollback)
+	if err != nil {
+		return false, err
+	} else if !done {
+		return false, nil
+	}
+	if br.Status.ObservedReleasePlanHash != util.HashReleasePlanBatches(&br.Spec.ReleasePlan) ||
+		br.Generation != br.Status.ObservedGeneration {
+		klog.Infof("rollout(%s/%s) batchRelease status is inconsistent, and wait a moment", c.Rollout.Namespace, c.Rollout.Name)
+		return false, nil
+	}
+	// check whether batchRelease is ready (i.e. whether the new pods are ready)
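+	// note: CurrentBatch is 0-based while CurrentStepIndex is 1-based, hence the +1 in the comparison below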
+	if br.Status.CanaryStatus.CurrentBatchState != v1beta1.ReadyBatchState ||
+		br.Status.CanaryStatus.CurrentBatch+1 < blueGreenStatus.CurrentStepIndex {
+		klog.Infof("rollout(%s/%s) batchRelease status(%s) is not ready, and wait a moment", c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status))
+		return false, nil
+	}
+	m.recorder.Eventf(c.Rollout, corev1.EventTypeNormal, "Progressing", fmt.Sprintf("upgrade step(%d) canary pods with new versions done", blueGreenStatus.CurrentStepIndex))
+	klog.Infof("rollout(%s/%s) batch(%s) state(%s), and success",
+		c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status), br.Status.CanaryStatus.CurrentBatchState)
+	// set the latest PodTemplateHash to select the latest pods.
+	blueGreenStatus.PodTemplateHash = c.Workload.PodTemplateHash
+	return true, nil
+}
+
+func (m *blueGreenReleaseManager) doCanaryMetricsAnalysis(c *RolloutContext) (bool, error) {
+	// todo
+	return true, nil
+}
+
+func (m *blueGreenReleaseManager) doCanaryPaused(c *RolloutContext) (bool, error) {
+	blueGreenStatus := c.NewStatus.BlueGreenStatus
+	currentStep := c.Rollout.Spec.Strategy.BlueGreen.Steps[blueGreenStatus.CurrentStepIndex-1]
+	steps := len(c.Rollout.Spec.Strategy.BlueGreen.Steps)
+	cond := util.GetRolloutCondition(*c.NewStatus, v1beta1.RolloutConditionProgressing)
+	// need manual confirmation
+	if currentStep.Pause.Duration == nil {
+		klog.Infof("rollout(%s/%s) don't set pause duration, and need manual confirmation", c.Rollout.Namespace, c.Rollout.Name)
+		cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and you need manually confirm to enter the next step", blueGreenStatus.CurrentStepIndex, steps)
+		c.NewStatus.Message = cond.Message
+		return false, nil
+	}
+	cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and wait duration(%d seconds) to enter the next step", blueGreenStatus.CurrentStepIndex, steps, *currentStep.Pause.Duration)
+	c.NewStatus.Message = cond.Message
+	// wait duration time, then go to next step
+	duration := time.Second * time.Duration(*currentStep.Pause.Duration)
+	expectedTime := blueGreenStatus.LastUpdateTime.Add(duration)
+	if expectedTime.Before(time.Now()) {
+		klog.Infof("rollout(%s/%s) canary step(%d) paused duration(%d seconds), and go to the next step",
+			c.Rollout.Namespace, c.Rollout.Name, blueGreenStatus.CurrentStepIndex, *currentStep.Pause.Duration)
+		return true, nil
+	}
+	c.RecheckTime = &expectedTime
+	return false, nil
+}
+
+func (m *blueGreenReleaseManager) doCanaryJump(c *RolloutContext) (jumped bool) {
+	bluegreenStatus := c.NewStatus.BlueGreenStatus
+	// since we forbid adding or removing steps, currentStepIndex should always be valid
+	currentStep := c.Rollout.Spec.Strategy.BlueGreen.Steps[bluegreenStatus.CurrentStepIndex-1]
+	// nextIndex=-1 means the release is done, nextIndex=0 is not used
+	if nextIndex := bluegreenStatus.NextStepIndex; nextIndex != util.NextBatchIndex(c.Rollout, bluegreenStatus.CurrentStepIndex) && nextIndex > 0 {
+		currentIndexBackup := bluegreenStatus.CurrentStepIndex
+		currentStepStateBackup := bluegreenStatus.CurrentStepState
+		// update the current and next stepIndex
+		bluegreenStatus.CurrentStepIndex = nextIndex
+		bluegreenStatus.NextStepIndex = util.NextBatchIndex(c.Rollout, nextIndex)
+		nextStep := c.Rollout.Spec.Strategy.BlueGreen.Steps[nextIndex-1]
+		// compare the next step and the current step to decide which state we should go to
+		if reflect.DeepEqual(nextStep.Replicas, currentStep.Replicas) {
+			bluegreenStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting
+		} else {
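+			// the replicas of the two steps differ, so the workload needs to be scaled; start again from the Init state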
+			bluegreenStatus.CurrentStepState = v1beta1.CanaryStepStateInit
+		}
+		bluegreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
+		klog.Infof("rollout(%s/%s) step(%d->%d) state from(%s -> %s)",
+			c.Rollout.Namespace, c.Rollout.Name,
+			currentIndexBackup, bluegreenStatus.CurrentStepIndex,
+			currentStepStateBackup, bluegreenStatus.CurrentStepState)
+		return true
+	}
+	return false
+}
+
+// cleanup after rollout is completed or finished
+func (m *blueGreenReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, error) {
+	blueGreenStatus := c.NewStatus.BlueGreenStatus
+	if blueGreenStatus == nil {
+		return true, nil
+	}
+	// rollout progressing complete, remove rollout progressing annotation in workload
+	err := removeRolloutProgressingAnnotation(m.Client, c)
+	if err != nil {
+		return false, err
+	}
+
+	tr := newTrafficRoutingContext(c)
+	// execute steps based on the predefined order for each reason
+	nextStep := nextBlueGreenTask(c.FinalizeReason, blueGreenStatus.FinalisingStep)
+	// if the current step is empty, set it to the first step
+	// if the current step is end, we just return
+	if len(blueGreenStatus.FinalisingStep) == 0 {
+		blueGreenStatus.FinalisingStep = nextStep
+		blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
+	} else if blueGreenStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
+		klog.Infof("rollout(%s/%s) finalising process is already completed", c.Rollout.Namespace, c.Rollout.Name)
+		return true, nil
+	}
+	klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, blueGreenStatus.FinalisingStep)
+
+	var retry bool
+	// the order of the steps is maintained by calculating the nextStep
+	switch blueGreenStatus.FinalisingStep {
+	// set workload.pause=false; set workload.partition=0
+	case v1beta1.FinalisingStepPatchBatchRelease:
+		retry, err = finalizingBatchRelease(m.Client, c)
+	// delete batchRelease
+	case v1beta1.FinalisingStepRemoveBatchRelease:
+		retry, err = removeBatchRelease(m.Client, c)
+	// restore the gateway resources (ingress/gatewayAPI/Istio), that means
+	// only stable Service will accept the traffic
+	case v1beta1.FinalisingStepRouteTrafficToStable:
+		retry, err = m.trafficRoutingManager.RestoreGateway(tr)
+	// restore the stable service
+	case v1beta1.FinalisingStepRestoreStableService:
+		retry, err = m.trafficRoutingManager.RestoreStableService(tr)
+	// remove canary service
+	case v1beta1.FinalisingStepRemoveCanaryService:
+		retry, err = m.trafficRoutingManager.RemoveCanaryService(tr)
+	// route all traffic to new version
+	case v1beta1.FinalisingStepRouteTrafficToNew:
+		retry, err = m.trafficRoutingManager.RouteAllTrafficToNewVersion(tr)
+	default:
+		nextStep = nextBlueGreenTask(c.FinalizeReason, "")
+		klog.Warningf("unexpected finalising step, current step(%s), start from the first step(%s)", blueGreenStatus.FinalisingStep, nextStep)
+		blueGreenStatus.FinalisingStep = nextStep
+		return false, nil
+	}
+	if err != nil || retry {
+		return false, err
+	}
+	// current step is done, run the next step
+	blueGreenStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
+	blueGreenStatus.FinalisingStep = nextStep
+	if blueGreenStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
+		return true, nil
+	}
+
+	return false, nil
+}
+
+func (m *blueGreenReleaseManager) fetchClient() client.Client {
+	return m.Client
+}
+
+func (m *blueGreenReleaseManager) createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, isRollback bool) *v1beta1.BatchRelease {
+	var batches []v1beta1.ReleaseBatch
+	for _, step := range rollout.Spec.Strategy.BlueGreen.Steps {
+		batches = append(batches, v1beta1.ReleaseBatch{CanaryReplicas: *step.Replicas})
+	}
+	br := &v1beta1.BatchRelease{
+		ObjectMeta: metav1.ObjectMeta{
+			Namespace:       rollout.Namespace,
+			Name:            rollout.Name,
+			OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(rollout, rolloutControllerKind)},
+		},
+		Spec: v1beta1.BatchReleaseSpec{
+			WorkloadRef: v1beta1.ObjectRef{
+				APIVersion: rollout.Spec.WorkloadRef.APIVersion,
+				Kind:       rollout.Spec.WorkloadRef.Kind,
+				Name:       rollout.Spec.WorkloadRef.Name,
+			},
+			ReleasePlan: v1beta1.ReleasePlan{
+				Batches:          batches,
+				RolloutID:        rolloutID,
+				BatchPartition:   utilpointer.Int32Ptr(batch),
+				FailureThreshold: rollout.Spec.Strategy.BlueGreen.FailureThreshold,
+				RollingStyle:     v1beta1.BlueGreenRollingStyle,
+			},
+		},
+	}
+	annotations := map[string]string{}
+	if isRollback {
+		annotations[v1alpha1.RollbackInBatchAnnotation] = rollout.Annotations[v1alpha1.RollbackInBatchAnnotation]
+	}
+	if len(annotations) > 0 {
+		br.Annotations = annotations
+	}
+	return br
+}
+
+// syncBatchRelease syncs the status of br to blueGreenStatus, and syncs the rollout-id of blueGreenStatus to br.
+func (m *blueGreenReleaseManager) syncBatchRelease(br *v1beta1.BatchRelease, blueGreenStatus *v1beta1.BlueGreenStatus) error {
+	// sync from BatchRelease status to Rollout blueGreenStatus
+	blueGreenStatus.UpdatedReplicas = br.Status.CanaryStatus.UpdatedReplicas
+	blueGreenStatus.UpdatedReadyReplicas = br.Status.CanaryStatus.UpdatedReadyReplicas
+	// Do not remove this line for now, otherwise users will not be able to judge whether the BatchRelease works
+	// in the scene where only the rollout-id changed.
+	// TODO: optimize the logic to make it easier to understand
+	blueGreenStatus.Message = fmt.Sprintf("BatchRelease is at state %s, rollout-id %s, step %d",
+		br.Status.CanaryStatus.CurrentBatchState, br.Status.ObservedRolloutID, br.Status.CanaryStatus.CurrentBatch+1)
+
+	// sync rolloutId from blueGreenStatus to BatchRelease
+	if blueGreenStatus.ObservedRolloutID != br.Spec.ReleasePlan.RolloutID {
+		body := fmt.Sprintf(`{"spec":{"releasePlan":{"rolloutID":"%s"}}}`, blueGreenStatus.ObservedRolloutID)
+		return m.Patch(context.TODO(), br, client.RawPatch(types.MergePatchType, []byte(body)))
+	}
+	return nil
+}
+
+/*
+- Rollback Scenario:
+Why is the first step to restore the gateway (i.e. route all traffic to the stable version)?
+We cannot remove the selector of the stable service first, as the canary strategy does, because users are allowed to configure "0%" traffic
+in the bluegreen strategy. Consider the following example:
+  - replicas: 50% // step 1
+    traffic: 0%
+
+If the user is at step 1 and then attempts to roll back directly, Rollout should route all traffic to the stable service
+(actually keeping it unchanged). However, if we removed the selector of the stable service instead, we would inadvertently
+route some traffic to the new version for a period, which is undesirable.
+
+- Rollout Deletion and Disabling Scenario:
+If the Rollout is being deleted or disabled, it suggests that users want to release the new version using the workload's built-in strategy,
+such as rollingUpdate for Deployment, instead of blue-green or canary. Thus, we can simply remove
+the label selector of the stable service, routing traffic to both stable and updated pods.
+
+- Rollout Success Scenario:
+This indicates the rollout has completed its final batch and the user has confirmed to
+transition fully to the new version. We can simply route all traffic to the new version. Additionally, given that all
+traffic is routed to the canary Service, it is safe to remove the selector of the stable Service, which also works
+as a workaround for a bug in the ingress-nginx controller (see https://github.com/kubernetes/ingress-nginx/issues/9635)
+*/
+func nextBlueGreenTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.FinalisingStepType {
+	var taskSequence []v1beta1.FinalisingStepType
+	switch reason {
+	case v1beta1.FinaliseReasonSuccess: // success
+		taskSequence = []v1beta1.FinalisingStepType{
+			v1beta1.FinalisingStepRouteTrafficToNew,
+			v1beta1.FinalisingStepRestoreStableService,
+			v1beta1.FinalisingStepPatchBatchRelease,
+			v1beta1.FinalisingStepRouteTrafficToStable,
+
+			v1beta1.FinalisingStepRemoveCanaryService,
+			v1beta1.FinalisingStepRemoveBatchRelease,
+		}
+
+	case v1beta1.FinaliseReasonRollback: // rollback
+		taskSequence = []v1beta1.FinalisingStepType{
+			v1beta1.FinalisingStepRouteTrafficToStable, // route all traffic to stable version
+			v1beta1.FinalisingStepPatchBatchRelease,
+			v1beta1.FinalisingStepRestoreStableService,
+
+			v1beta1.FinalisingStepRemoveCanaryService,
+			v1beta1.FinalisingStepRemoveBatchRelease,
+		}
+	default: // others: disabled/deleting rollout
+		taskSequence = []v1beta1.FinalisingStepType{
+			v1beta1.FinalisingStepRestoreStableService,
+			v1beta1.FinalisingStepPatchBatchRelease, // scale up new, scale down old
+			v1beta1.FinalisingStepRouteTrafficToStable,
+
+			v1beta1.FinalisingStepRemoveCanaryService,
+			v1beta1.FinalisingStepRemoveBatchRelease,
+		}
+	}
+	// if currentTask is empty, return the first task
+	if len(currentTask) == 0 {
+		return taskSequence[0]
+	}
+	// find the next task
+	for i := range taskSequence {
+		if currentTask == taskSequence[i] && i < len(taskSequence)-1 {
+			return taskSequence[i+1]
+		}
+	}
+	return v1beta1.FinalisingStepTypeEnd
+}
diff --git a/pkg/controller/rollout/rollout_bluegreen_test.go b/pkg/controller/rollout/rollout_bluegreen_test.go
new file mode 100644
index 00000000..1666a700
--- /dev/null
+++ b/pkg/controller/rollout/rollout_bluegreen_test.go
@@ -0,0 +1,331 @@
+/*
+Copyright 2022 The Kruise Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ + +package rollout + +import ( + "context" + "reflect" + "testing" + + "github.com/openkruise/rollouts/api/v1alpha1" + "github.com/openkruise/rollouts/api/v1beta1" + "github.com/openkruise/rollouts/pkg/trafficrouting" + "github.com/openkruise/rollouts/pkg/util" + apps "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + netv1 "k8s.io/api/networking/v1" + + // metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/tools/record" + utilpointer "k8s.io/utils/pointer" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func TestBlueGreenRunCanary(t *testing.T) { + cases := []struct { + name string + getObj func() ([]*apps.Deployment, []*apps.ReplicaSet) + getNetwork func() ([]*corev1.Service, []*netv1.Ingress) + getRollout func() (*v1beta1.Rollout, *v1beta1.BatchRelease) + expectStatus func() *v1beta1.RolloutStatus + expectBr func() *v1beta1.BatchRelease + }{ + { + name: "run bluegreen upgrade1", + getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { + dep1 := deploymentDemo.DeepCopy() + rs1 := rsDemo.DeepCopy() + return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1} + }, + getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) { + return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()} + }, + getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease) { + obj := rolloutDemoBlueGreen.DeepCopy() + obj.Status.BlueGreenStatus.ObservedWorkloadGeneration = 2 + obj.Status.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.Status.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + obj.Status.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + obj.Status.BlueGreenStatus.CurrentStepIndex = 1 + obj.Status.BlueGreenStatus.NextStepIndex = 2 + obj.Status.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade + cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonInRolling + util.SetRolloutCondition(&obj.Status, *cond) + return obj, nil + }, + expectStatus: func() *v1beta1.RolloutStatus { + s := rolloutDemoBlueGreen.Status.DeepCopy() + s.BlueGreenStatus.ObservedWorkloadGeneration = 2 + s.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + s.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + s.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + s.BlueGreenStatus.CurrentStepIndex = 1 + s.BlueGreenStatus.NextStepIndex = 2 + s.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade + cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonInRolling + util.SetRolloutCondition(s, *cond) + return s + }, + expectBr: func() *v1beta1.BatchRelease { + br := batchDemo.DeepCopy() + br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{ + { + CanaryReplicas: intstr.FromString("50%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + } + br.Spec.ReleasePlan.BatchPartition = utilpointer.Int32(0) + br.Spec.ReleasePlan.RollingStyle = v1beta1.BlueGreenRollingStyle + return br + }, + }, + { + name: "run bluegreen traffic routing", + getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) { + dep1 := deploymentDemo.DeepCopy() + rs1 := rsDemo.DeepCopy() + rs2 := rsDemo.DeepCopy() + rs2.Name = "echoserver-canary" 
+ rs2.Labels["pod-template-hash"] = "pod-template-hash-v2" + rs2.Spec.Template.Spec.Containers[0].Image = "echoserver:v2" + return []*apps.Deployment{dep1}, []*apps.ReplicaSet{rs1, rs2} + }, + getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) { + return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()} + }, + getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease) { + obj := rolloutDemoBlueGreen.DeepCopy() + obj.Status.BlueGreenStatus.ObservedWorkloadGeneration = 2 + obj.Status.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.Status.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + obj.Status.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + obj.Status.BlueGreenStatus.CurrentStepIndex = 1 + obj.Status.BlueGreenStatus.NextStepIndex = 2 + obj.Status.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade + cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonInRolling + util.SetRolloutCondition(&obj.Status, *cond) + br := batchDemo.DeepCopy() + br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{ + { + CanaryReplicas: intstr.FromString("50%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + } + br.Spec.ReleasePlan.BatchPartition = utilpointer.Int32(0) + br.Spec.ReleasePlan.RollingStyle = v1beta1.BlueGreenRollingStyle + br.Status = v1beta1.BatchReleaseStatus{ + ObservedGeneration: 1, + ObservedReleasePlanHash: util.HashReleasePlanBatches(&br.Spec.ReleasePlan), + CanaryStatus: v1beta1.BatchReleaseCanaryStatus{ + CurrentBatchState: v1beta1.ReadyBatchState, + CurrentBatch: 0, + UpdatedReplicas: 1, + UpdatedReadyReplicas: 1, + }, + } + return obj, br + }, + expectStatus: func() *v1beta1.RolloutStatus { + s := rolloutDemoBlueGreen.Status.DeepCopy() + s.BlueGreenStatus.ObservedWorkloadGeneration = 2 + s.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + s.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + s.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + s.BlueGreenStatus.PodTemplateHash = "pod-template-hash-v2" + s.BlueGreenStatus.UpdatedReplicas = 1 + s.BlueGreenStatus.UpdatedReadyReplicas = 1 + s.BlueGreenStatus.CurrentStepIndex = 1 + s.BlueGreenStatus.NextStepIndex = 2 + s.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting + cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) + cond.Reason = v1alpha1.ProgressingReasonInRolling + util.SetRolloutCondition(s, *cond) + return s + }, + expectBr: func() *v1beta1.BatchRelease { + br := batchDemo.DeepCopy() + br.Spec.ReleasePlan.Batches = []v1beta1.ReleaseBatch{ + { + CanaryReplicas: intstr.FromString("50%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + { + CanaryReplicas: intstr.FromString("100%"), + }, + } + br.Spec.ReleasePlan.BatchPartition = utilpointer.Int32(0) + br.Spec.ReleasePlan.RollingStyle = v1beta1.BlueGreenRollingStyle + return br + }, + }, + } + + for _, cs := range cases { + t.Run(cs.name, func(t *testing.T) { + deps, rss := cs.getObj() + rollout, br := cs.getRollout() + fc := fake.NewClientBuilder().WithScheme(scheme).WithObjects(rollout).Build() + for _, rs := range rss { + _ = fc.Create(context.TODO(), rs) + } + for _, dep := range deps { + _ = 
fc.Create(context.TODO(), dep) + } + if br != nil { + _ = fc.Create(context.TODO(), br) + } + ss, in := cs.getNetwork() + _ = fc.Create(context.TODO(), ss[0]) + _ = fc.Create(context.TODO(), in[0]) + r := &RolloutReconciler{ + Client: fc, + Scheme: scheme, + Recorder: record.NewFakeRecorder(10), + finder: util.NewControllerFinder(fc), + trafficRoutingManager: trafficrouting.NewTrafficRoutingManager(fc), + } + r.blueGreenManager = &blueGreenReleaseManager{ + Client: fc, + trafficRoutingManager: r.trafficRoutingManager, + recorder: r.Recorder, + } + workload, err := r.finder.GetWorkloadForRef(rollout) + if err != nil { + t.Fatalf("GetWorkloadForRef failed: %s", err.Error()) + } + c := &RolloutContext{ + Rollout: rollout, + NewStatus: rollout.Status.DeepCopy(), + Workload: workload, + } + err = r.blueGreenManager.runCanary(c) + if err != nil { + t.Fatalf("reconcileRolloutProgressing failed: %s", err.Error()) + } + checkBatchReleaseEqual(fc, t, client.ObjectKey{Name: rollout.Name}, cs.expectBr()) + cStatus := c.NewStatus.DeepCopy() + cStatus.Message = "" + if cStatus.BlueGreenStatus != nil { + cStatus.BlueGreenStatus.LastUpdateTime = nil + cStatus.BlueGreenStatus.Message = "" + } + cond := util.GetRolloutCondition(*cStatus, v1beta1.RolloutConditionProgressing) + cond.Message = "" + util.SetRolloutCondition(cStatus, *cond) + expectStatus := cs.expectStatus() + if !reflect.DeepEqual(expectStatus, cStatus) { + t.Fatalf("expect(%s), but get(%s)", util.DumpJSON(cs.expectStatus()), util.DumpJSON(cStatus)) + } + }) + } +} + +func TestBlueGreenRunCanaryPaused(t *testing.T) { + cases := []struct { + name string + getRollout func() *v1beta1.Rollout + expectStatus func() *v1beta1.RolloutStatus + }{ + { + name: "paused, last step, 60% weight", + getRollout: func() *v1beta1.Rollout { + obj := rolloutDemoBlueGreen.DeepCopy() + obj.Status.BlueGreenStatus.ObservedWorkloadGeneration = 2 + obj.Status.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.Status.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + obj.Status.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + obj.Status.BlueGreenStatus.CurrentStepIndex = 3 + obj.Status.BlueGreenStatus.NextStepIndex = 4 + obj.Status.BlueGreenStatus.PodTemplateHash = "pod-template-hash-v2" + obj.Status.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStatePaused + return obj + }, + expectStatus: func() *v1beta1.RolloutStatus { + obj := rolloutDemoBlueGreen.Status.DeepCopy() + obj.BlueGreenStatus.ObservedWorkloadGeneration = 2 + obj.BlueGreenStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd" + obj.BlueGreenStatus.StableRevision = "pod-template-hash-v1" + obj.BlueGreenStatus.UpdatedRevision = "6f8cc56547" + obj.BlueGreenStatus.CurrentStepIndex = 3 + obj.BlueGreenStatus.NextStepIndex = 4 + obj.BlueGreenStatus.PodTemplateHash = "pod-template-hash-v2" + obj.BlueGreenStatus.CurrentStepState = v1beta1.CanaryStepStatePaused + return obj + }, + }, + } + + for _, cs := range cases { + t.Run(cs.name, func(t *testing.T) { + rollout := cs.getRollout() + fc := fake.NewClientBuilder().WithScheme(scheme).WithObjects(rollout).Build() + r := &RolloutReconciler{ + Client: fc, + Scheme: scheme, + Recorder: record.NewFakeRecorder(10), + finder: util.NewControllerFinder(fc), + trafficRoutingManager: trafficrouting.NewTrafficRoutingManager(fc), + } + r.blueGreenManager = &blueGreenReleaseManager{ + Client: fc, + trafficRoutingManager: r.trafficRoutingManager, + recorder: r.Recorder, + } + c := 
&RolloutContext{ + Rollout: rollout, + NewStatus: rollout.Status.DeepCopy(), + } + err := r.blueGreenManager.runCanary(c) + if err != nil { + t.Fatalf("reconcileRolloutProgressing failed: %s", err.Error()) + } + cStatus := c.NewStatus.DeepCopy() + cStatus.BlueGreenStatus.LastUpdateTime = nil + cStatus.BlueGreenStatus.Message = "" + cStatus.Message = "" + if !reflect.DeepEqual(cs.expectStatus(), cStatus) { + t.Fatalf("expect(%s), but get(%s)", util.DumpJSON(cs.expectStatus()), util.DumpJSON(cStatus)) + } + }) + } +} diff --git a/pkg/controller/rollout/rollout_canary.go b/pkg/controller/rollout/rollout_canary.go index 63c01703..0278adac 100644 --- a/pkg/controller/rollout/rollout_canary.go +++ b/pkg/controller/rollout/rollout_canary.go @@ -29,11 +29,9 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/client-go/tools/record" - "k8s.io/client-go/util/retry" "k8s.io/klog/v2" utilpointer "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" @@ -47,7 +45,7 @@ type canaryReleaseManager struct { func (m *canaryReleaseManager) runCanary(c *RolloutContext) error { canaryStatus := c.NewStatus.CanaryStatus - if br, err := m.fetchBatchRelease(c.Rollout.Namespace, c.Rollout.Name); err != nil && !errors.IsNotFound(err) { + if br, err := fetchBatchRelease(m.Client, c.Rollout.Namespace, c.Rollout.Name); err != nil && !errors.IsNotFound(err) { klog.Errorf("rollout(%s/%s) fetch batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) return err } else if err == nil { @@ -114,7 +112,7 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error { */ expectedReplicas, _ := intstr.GetScaledValueFromIntOrPercent(currentStep.Replicas, int(c.Workload.Replicas), true) if expectedReplicas >= int(c.Workload.Replicas) && v1beta1.IsRealPartition(c.Rollout) { - klog.Infof("special case detected: rollout(%s/%s) restore stable Service", c.Rollout.Namespace, c.Rollout.Name) + klog.Infof("Bypass the ingress-nginx bug for partition-style, rollout(%s/%s) restore stable Service", c.Rollout.Namespace, c.Rollout.Name) retry, err := m.trafficRoutingManager.RestoreStableService(tr) if err != nil { return err @@ -144,7 +142,7 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error { */ if canaryStatus.CurrentStepIndex == 1 { if !tr.DisableGenerateCanaryService { - klog.Infof("special case detected: rollout(%s/%s) patch stable Service", c.Rollout.Namespace, c.Rollout.Name) + klog.Infof("Before the first batch, rollout(%s/%s) patch stable Service", c.Rollout.Namespace, c.Rollout.Name) retry, err := m.trafficRoutingManager.PatchStableService(tr) if err != nil { return err @@ -194,13 +192,7 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error { klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, canaryStatus.CurrentStepIndex, v1beta1.CanaryStepStateTrafficRouting, canaryStatus.CurrentStepState) } - // in two cases, we should wait the default grace period - // - a period after CanaryStepStateUpgrade is just done (https://github.com/openkruise/rollouts/pull/185) - // - a period after CanaryStepStateTrafficRouting is just done - if tr.RecheckDuration <= 0 { - tr.RecheckDuration = time.Duration(trafficrouting.GetGraceSeconds(c.Rollout.Spec.Strategy.GetTrafficRouting(), defaultGracePeriodSeconds)) * time.Second - } - expectedTime := 
time.Now().Add(tr.RecheckDuration) + expectedTime := time.Now().Add(time.Duration(defaultGracePeriodSeconds) * time.Second) c.RecheckTime = &expectedTime case v1beta1.CanaryStepStateMetricsAnalysis: @@ -259,7 +251,7 @@ func (m *canaryReleaseManager) doCanaryUpgrade(c *RolloutContext) (bool, error) cond.Message = fmt.Sprintf("Rollout is in step(%d/%d), and upgrade workload to new version", canaryStatus.CurrentStepIndex, steps) c.NewStatus.Message = cond.Message // run batch release to upgrade the workloads - done, br, err := m.runBatchRelease(c.Rollout, getRolloutID(c.Workload), canaryStatus.CurrentStepIndex, c.Workload.IsInRollback) + done, br, err := runBatchRelease(m, c.Rollout, getRolloutID(c.Workload), canaryStatus.CurrentStepIndex, c.Workload.IsInRollback) if err != nil { return false, err } else if !done { @@ -323,39 +315,27 @@ func (m *canaryReleaseManager) doCanaryPaused(c *RolloutContext) (bool, error) { func (m *canaryReleaseManager) doCanaryJump(c *RolloutContext) (jumped bool) { canaryStatus := c.NewStatus.CanaryStatus - nextIndex := canaryStatus.NextStepIndex - /* - we set the CurrentStepIndex same as NextStepIndex to prevent currentStepIndex from out of range - for example, if we had a rollout with 4 steps and CurrentStepIndex was 2 - then, the user removed 3 steps from the plan, we can calculate NextStepIndex is 1 correctly, - but CurrentStepIndex remains 2, which could cause out of range. - */ - resetCurrentIndex := false - if int(canaryStatus.CurrentStepIndex) > len(c.Rollout.Spec.Strategy.Canary.Steps) { - canaryStatus.CurrentStepIndex = nextIndex - resetCurrentIndex = true - } + // since we forbid adding or removing steps, currentStepIndex should always be valid currentStep := c.Rollout.Spec.Strategy.Canary.Steps[canaryStatus.CurrentStepIndex-1] - if resetCurrentIndex || nextIndex != util.NextBatchIndex(c.Rollout, canaryStatus.CurrentStepIndex) && nextIndex > 0 { + // nextIndex=-1 means the release is done, nextIndex=0 is not used + if nextIndex := canaryStatus.NextStepIndex; nextIndex != util.NextBatchIndex(c.Rollout, canaryStatus.CurrentStepIndex) && nextIndex > 0 { currentIndexBackup := canaryStatus.CurrentStepIndex currentStepStateBackup := canaryStatus.CurrentStepState + // update the current and next stepIndex canaryStatus.CurrentStepIndex = nextIndex canaryStatus.NextStepIndex = util.NextBatchIndex(c.Rollout, nextIndex) nextStep := c.Rollout.Spec.Strategy.Canary.Steps[nextIndex-1] - // if the Replicas between currentStep and nextStep is same, we can jump to - // the TrafficRouting step; otherwise, we should start from the Init step - if reflect.DeepEqual(nextStep.Replicas, currentStep.Replicas) && !resetCurrentIndex { - canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + // compare next step and current step to decide the state we should go + if reflect.DeepEqual(nextStep.Replicas, currentStep.Replicas) { canaryStatus.CurrentStepState = v1beta1.CanaryStepStateTrafficRouting - klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, - canaryStatus.CurrentStepIndex, currentStepStateBackup, canaryStatus.CurrentStepState) } else { - canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} canaryStatus.CurrentStepState = v1beta1.CanaryStepStateInit - klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name, - canaryStatus.CurrentStepIndex, currentStepStateBackup, v1beta1.CanaryStepStateInit) } - klog.Infof("rollout(%s/%s) canary step from(%d) -> to(%d)", 
c.Rollout.Namespace, c.Rollout.Name, currentIndexBackup, canaryStatus.CurrentStepIndex) + canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} + klog.Infof("rollout(%s/%s) step(%d->%d) state from(%s -> %s)", + c.Rollout.Namespace, c.Rollout.Name, + currentIndexBackup, canaryStatus.CurrentStepIndex, + currentStepStateBackup, canaryStatus.CurrentStepState) return true } return false @@ -369,13 +349,13 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro return true, nil } // rollout progressing complete, remove rollout progressing annotation in workload - err := m.removeRolloutProgressingAnnotation(c) + err := removeRolloutProgressingAnnotation(m.Client, c) if err != nil { return false, err } tr := newTrafficRoutingContext(c) // execute steps based on the predefined order for each reason - nextStep := nextTask(c.FinalizeReason, canaryStatus.FinalisingStep) + nextStep := nextCanaryTask(c.FinalizeReason, canaryStatus.FinalisingStep) // if current step is empty, set it with the first step // if current step is end, we just return if len(canaryStatus.FinalisingStep) == 0 { @@ -391,24 +371,24 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro // the order of steps is maitained by calculating thenextStep switch canaryStatus.FinalisingStep { // set workload.pause=false; set workload.partition=0 - case v1beta1.FinalisingStepTypeBatchRelease: - retry, err = m.finalizingBatchRelease(c) + case v1beta1.FinalisingStepPatchBatchRelease: + retry, err = finalizingBatchRelease(m.Client, c) // delete batchRelease - case v1beta1.FinalisingStepTypeDeleteBR: - retry, err = m.removeBatchRelease(c) + case v1beta1.FinalisingStepRemoveBatchRelease: + retry, err = removeBatchRelease(m.Client, c) // restore the gateway resources (ingress/gatewayAPI/Istio), that means // only stable Service will accept the traffic - case v1beta1.FinalisingStepTypeGateway: + case v1beta1.FinalisingStepRouteTrafficToStable: retry, err = m.trafficRoutingManager.RestoreGateway(tr) // restore the stable service - case v1beta1.FinalisingStepTypeStableService: + case v1beta1.FinalisingStepRestoreStableService: retry, err = m.trafficRoutingManager.RestoreStableService(tr) // remove canary service - case v1beta1.FinalisingStepTypeRemoveCanaryService: + case v1beta1.FinalisingStepRemoveCanaryService: retry, err = m.trafficRoutingManager.RemoveCanaryService(tr) default: - nextStep = nextTask(c.FinalizeReason, "") + nextStep = nextCanaryTask(c.FinalizeReason, "") klog.Warningf("unexpected finalising step, current step(%s), start from the first step(%s)", canaryStatus.FinalisingStep, nextStep) canaryStatus.FinalisingStep = nextStep return false, nil @@ -425,75 +405,11 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro return false, nil } -func (m *canaryReleaseManager) removeRolloutProgressingAnnotation(c *RolloutContext) error { - if c.Workload == nil { - return nil - } - if _, ok := c.Workload.Annotations[util.InRolloutProgressingAnnotation]; !ok { - return nil - } - workloadRef := c.Rollout.Spec.WorkloadRef - workloadGVK := schema.FromAPIVersionAndKind(workloadRef.APIVersion, workloadRef.Kind) - obj := util.GetEmptyWorkloadObject(workloadGVK) - obj.SetNamespace(c.Workload.Namespace) - obj.SetName(c.Workload.Name) - body := fmt.Sprintf(`{"metadata":{"annotations":{"%s":null}}}`, util.InRolloutProgressingAnnotation) - if err := m.Patch(context.TODO(), obj, client.RawPatch(types.MergePatchType, []byte(body))); err != nil { - 
klog.Errorf("rollout(%s/%s) patch workload(%s) failed: %s", c.Rollout.Namespace, c.Rollout.Name, c.Workload.Name, err.Error()) - return err - } - klog.Infof("remove rollout(%s/%s) workload(%s) annotation[%s] success", c.Rollout.Namespace, c.Rollout.Name, c.Workload.Name, util.InRolloutProgressingAnnotation) - return nil +func (m *canaryReleaseManager) fetchClient() client.Client { + return m.Client } -func (m *canaryReleaseManager) runBatchRelease(rollout *v1beta1.Rollout, rolloutId string, batch int32, isRollback bool) (bool, *v1beta1.BatchRelease, error) { - batch = batch - 1 - br, err := m.fetchBatchRelease(rollout.Namespace, rollout.Name) - if errors.IsNotFound(err) { - // create new BatchRelease Crd - br = createBatchRelease(rollout, rolloutId, batch, isRollback) - if err = m.Create(context.TODO(), br); err != nil && !errors.IsAlreadyExists(err) { - klog.Errorf("rollout(%s/%s) create BatchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) - return false, nil, err - } - klog.Infof("rollout(%s/%s) create BatchRelease(%s) success", rollout.Namespace, rollout.Name, util.DumpJSON(br)) - return false, br, nil - } else if err != nil { - klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) - return false, nil, err - } - - // check whether batchRelease configuration is the latest - newBr := createBatchRelease(rollout, rolloutId, batch, isRollback) - if reflect.DeepEqual(br.Spec, newBr.Spec) && reflect.DeepEqual(br.Annotations, newBr.Annotations) { - klog.Infof("rollout(%s/%s) do batchRelease batch(%d) success", rollout.Namespace, rollout.Name, batch+1) - return true, br, nil - } - // update batchRelease to the latest version - if err = retry.RetryOnConflict(retry.DefaultBackoff, func() error { - if err = m.Get(context.TODO(), client.ObjectKey{Namespace: newBr.Namespace, Name: newBr.Name}, br); err != nil { - klog.Errorf("error getting BatchRelease(%s/%s) from client", newBr.Namespace, newBr.Name) - return err - } - br.Spec = newBr.Spec - br.Annotations = newBr.Annotations - return m.Client.Update(context.TODO(), br) - }); err != nil { - klog.Errorf("rollout(%s/%s) update batchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) - return false, nil, err - } - klog.Infof("rollout(%s/%s) update batchRelease(%s) configuration to latest", rollout.Namespace, rollout.Name, util.DumpJSON(br)) - return false, br, nil -} - -func (m *canaryReleaseManager) fetchBatchRelease(ns, name string) (*v1beta1.BatchRelease, error) { - br := &v1beta1.BatchRelease{} - // batchRelease.name is equal related rollout.name - err := m.Get(context.TODO(), client.ObjectKey{Namespace: ns, Name: name}, br) - return br, err -} - -func createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, isRollback bool) *v1beta1.BatchRelease { +func (m *canaryReleaseManager) createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, isRollback bool) *v1beta1.BatchRelease { var batches []v1beta1.ReleaseBatch for _, step := range rollout.Spec.Strategy.Canary.Steps { batches = append(batches, v1beta1.ReleaseBatch{CanaryReplicas: *step.Replicas}) @@ -531,83 +447,6 @@ func createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, return br } -// bool means if we need retry; if error is not nil, always retry -func (m *canaryReleaseManager) removeBatchRelease(c *RolloutContext) (bool, error) { - batch := &v1beta1.BatchRelease{} - err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Rollout.Namespace, Name: 
c.Rollout.Name}, batch) - if err != nil && errors.IsNotFound(err) { - return false, nil - } else if err != nil { - klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name) - return true, err - } - if !batch.DeletionTimestamp.IsZero() { - klog.Infof("rollout(%s/%s) BatchRelease is terminating, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) - return true, nil - } - - //delete batchRelease - err = m.Delete(context.TODO(), batch) - if err != nil { - klog.Errorf("rollout(%s/%s) delete BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) - return true, err - } - klog.Infof("rollout(%s/%s) deleting BatchRelease, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) - return true, nil -} - -// bool means if we need retry; if error is not nil, always retry -func (m *canaryReleaseManager) finalizingBatchRelease(c *RolloutContext) (bool, error) { - br, err := m.fetchBatchRelease(c.Rollout.Namespace, c.Rollout.Name) - if err != nil { - if errors.IsNotFound(err) { - return false, nil - } - return true, err - } - waitReady := c.WaitReady - // The Completed phase means batchRelease controller has processed all it - // should process. If BatchRelease phase is completed, we can do nothing. - if br.Spec.ReleasePlan.BatchPartition == nil && - br.Status.Phase == v1beta1.RolloutPhaseCompleted { - klog.Infof("rollout(%s/%s) finalizing batchRelease(%s) done", c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status)) - return false, nil - } - - // If BatchPartition is nil, BatchRelease will directly resume workload via: - // - * set workload Paused = false if it needs; - // - * set workload Partition = null if it needs. - if br.Spec.ReleasePlan.BatchPartition == nil { - // - If checkReady is true, finalizing policy must be "WaitResume"; - // - If checkReady is false, finalizing policy must be NOT "WaitResume"; - // Otherwise, we should correct it. - switch br.Spec.ReleasePlan.FinalizingPolicy { - case v1beta1.WaitResumeFinalizingPolicyType: - if waitReady { // no need to patch again - return true, nil - } - default: - if !waitReady { // no need to patch again - return true, nil - } - } - } - - // Correct finalizing policy. - policy := v1beta1.ImmediateFinalizingPolicyType - if waitReady { - policy = v1beta1.WaitResumeFinalizingPolicyType - } - - // Patch BatchPartition and FinalizingPolicy, BatchPartition always patch null here. - body := fmt.Sprintf(`{"spec":{"releasePlan":{"batchPartition":null,"finalizingPolicy":"%s"}}}`, policy) - if err = m.Patch(context.TODO(), br, client.RawPatch(types.MergePatchType, []byte(body))); err != nil { - return true, err - } - klog.Infof("rollout(%s/%s) patch batchRelease(%s) success", c.Rollout.Namespace, c.Rollout.Name, body) - return true, nil -} - // syncBatchRelease sync status of br to canaryStatus, and sync rollout-id of canaryStatus to br. func (m *canaryReleaseManager) syncBatchRelease(br *v1beta1.BatchRelease, canaryStatus *v1beta1.CanaryStatus) error { // sync from BatchRelease status to Rollout canaryStatus @@ -628,7 +467,7 @@ func (m *canaryReleaseManager) syncBatchRelease(br *v1beta1.BatchRelease, canary } // calculate next task -func nextTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.FinalisingStepType { +func nextCanaryTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.FinalisingStepType { var taskSequence []v1beta1.FinalisingStepType //REVIEW - should we consider more complex scenarios? 
// like, user rollbacks the workload and disables the Rollout at the same time? @@ -638,19 +477,19 @@ func nextTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.Fin // in the first step case v1beta1.FinaliseReasonRollback: // rollback taskSequence = []v1beta1.FinalisingStepType{ - v1beta1.FinalisingStepTypeGateway, // route all traffic to stable version - v1beta1.FinalisingStepTypeBatchRelease, // scale up old, scale down new - v1beta1.FinalisingStepTypeDeleteBR, - v1beta1.FinalisingStepTypeStableService, - v1beta1.FinalisingStepTypeRemoveCanaryService, + v1beta1.FinalisingStepRouteTrafficToStable, // route all traffic to stable version + v1beta1.FinalisingStepPatchBatchRelease, // scale up old, scale down new + v1beta1.FinalisingStepRemoveBatchRelease, + v1beta1.FinalisingStepRestoreStableService, + v1beta1.FinalisingStepRemoveCanaryService, } default: // others: success/disabled/deleting rollout taskSequence = []v1beta1.FinalisingStepType{ - v1beta1.FinalisingStepTypeStableService, - v1beta1.FinalisingStepTypeGateway, - v1beta1.FinalisingStepTypeRemoveCanaryService, - v1beta1.FinalisingStepTypeBatchRelease, // scale up new, scale down old - v1beta1.FinalisingStepTypeDeleteBR, + v1beta1.FinalisingStepRestoreStableService, + v1beta1.FinalisingStepRouteTrafficToStable, + v1beta1.FinalisingStepRemoveCanaryService, + v1beta1.FinalisingStepPatchBatchRelease, // scale up new, scale down old + v1beta1.FinalisingStepRemoveBatchRelease, } } // if currentTask is empty, return first task diff --git a/pkg/controller/rollout/rollout_controller.go b/pkg/controller/rollout/rollout_controller.go index a1fe5606..cccc82a9 100755 --- a/pkg/controller/rollout/rollout_controller.go +++ b/pkg/controller/rollout/rollout_controller.go @@ -66,6 +66,7 @@ type RolloutReconciler struct { finder *util.ControllerFinder trafficRoutingManager *trafficrouting.Manager canaryManager *canaryReleaseManager + blueGreenManager *blueGreenReleaseManager } //+kubebuilder:rbac:groups=rollouts.kruise.io,resources=rollouts,verbs=get;list;watch;create;update;patch;delete @@ -198,5 +199,10 @@ func (r *RolloutReconciler) SetupWithManager(mgr ctrl.Manager) error { trafficRoutingManager: r.trafficRoutingManager, recorder: r.Recorder, } + r.blueGreenManager = &blueGreenReleaseManager{ + Client: mgr.GetClient(), + trafficRoutingManager: r.trafficRoutingManager, + recorder: r.Recorder, + } return nil } diff --git a/pkg/controller/rollout/rollout_controller_test.go b/pkg/controller/rollout/rollout_controller_test.go index 02fec15e..3d29d379 100644 --- a/pkg/controller/rollout/rollout_controller_test.go +++ b/pkg/controller/rollout/rollout_controller_test.go @@ -106,6 +106,73 @@ var ( }, }, } + + rolloutDemoBlueGreen = &v1beta1.Rollout{ + ObjectMeta: metav1.ObjectMeta{ + Name: "rollout-demo", + Labels: map[string]string{}, + Annotations: map[string]string{ + util.RolloutHashAnnotation: "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd", + }, + }, + Spec: v1beta1.RolloutSpec{ + WorkloadRef: v1beta1.ObjectRef{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "echoserver", + }, + Strategy: v1beta1.RolloutStrategy{ + BlueGreen: &v1beta1.BlueGreenStrategy{ + Steps: []v1beta1.CanaryStep{ + { + TrafficRoutingStrategy: v1beta1.TrafficRoutingStrategy{ + Traffic: utilpointer.String("0%"), + }, + Replicas: &intstr.IntOrString{StrVal: "50%", Type: intstr.String}, + }, + { + TrafficRoutingStrategy: v1beta1.TrafficRoutingStrategy{ + Traffic: utilpointer.String("0%"), + }, + Replicas: &intstr.IntOrString{StrVal: 
"100%", Type: intstr.String}, + }, + { + TrafficRoutingStrategy: v1beta1.TrafficRoutingStrategy{ + Traffic: utilpointer.String("50%"), + }, + Replicas: &intstr.IntOrString{StrVal: "100%", Type: intstr.String}, + }, + { + TrafficRoutingStrategy: v1beta1.TrafficRoutingStrategy{ + Traffic: utilpointer.String("100%"), + }, + Replicas: &intstr.IntOrString{StrVal: "100%", Type: intstr.String}, + }, + }, + TrafficRoutings: []v1beta1.TrafficRoutingRef{ + { + Service: "echoserver", + Ingress: &v1beta1.IngressTrafficRouting{ + Name: "echoserver", + }, + GracePeriodSeconds: 0, // To facilitate testing, don't wait after traffic routing operation + }, + }, + }, + }, + }, + Status: v1beta1.RolloutStatus{ + Phase: v1beta1.RolloutPhaseProgressing, + BlueGreenStatus: &v1beta1.BlueGreenStatus{}, + Conditions: []v1beta1.RolloutCondition{ + { + Type: v1beta1.RolloutConditionProgressing, + Reason: v1alpha1.ProgressingReasonInitializing, + Status: corev1.ConditionTrue, + }, + }, + }, + } maxUnavailable = intstr.FromString("20%") deploymentDemo = &apps.Deployment{ TypeMeta: metav1.TypeMeta{ diff --git a/pkg/controller/rollout/rollout_progressing.go b/pkg/controller/rollout/rollout_progressing.go index a1214cbf..e303c617 100644 --- a/pkg/controller/rollout/rollout_progressing.go +++ b/pkg/controller/rollout/rollout_progressing.go @@ -378,8 +378,7 @@ func (r *RolloutReconciler) getReleaseManager(rollout *v1beta1.Rollout) (Release if rollout.Spec.Strategy.IsCanaryStragegy() { return r.canaryManager, nil } else if rollout.Spec.Strategy.IsBlueGreenRelease() { - // placeholder for upcoming PR - // return r.blueGreenManager, nil + return r.blueGreenManager, nil } return nil, fmt.Errorf("unknown rolling style: %s, and thus cannot call corresponding release manager", rollout.Spec.Strategy.GetRollingStyle()) } @@ -420,7 +419,7 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) } // if no trafficRouting exists, simply remove batchRelease if !c.Rollout.Spec.Strategy.HasTrafficRoutings() { - retry, err := releaseManager.removeBatchRelease(c) + retry, err := removeBatchRelease(releaseManager.fetchClient(), c) if err != nil { klog.Errorf("rollout(%s/%s) DoFinalising batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) return false, err @@ -440,11 +439,11 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) switch subStatus.FinalisingStep { default: // start from FinalisingStepTypeGateway - subStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway + subStatus.FinalisingStep = v1beta1.FinalisingStepRouteTrafficToStable fallthrough // firstly, restore the gateway resources (ingress/gatewayAPI/Istio), that means // only stable Service will accept the traffic - case v1beta1.FinalisingStepTypeGateway: + case v1beta1.FinalisingStepRouteTrafficToStable: retry, err := r.trafficRoutingManager.RestoreGateway(tr) if err != nil || retry { subStatus.LastUpdateTime = tr.LastUpdateTime @@ -452,13 +451,13 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) } klog.Infof("rollout(%s/%s) in step (%s), and success", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep) subStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} - subStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR + subStatus.FinalisingStep = v1beta1.FinalisingStepRemoveBatchRelease fallthrough // secondly, remove the batchRelease. 
For canary release, it means the immediate deletion of // canary deployment, for other release, the v2 pods won't be deleted immediately // in both cases, only the stable pods (v1) accept the traffic - case v1beta1.FinalisingStepTypeDeleteBR: - retry, err := releaseManager.removeBatchRelease(c) + case v1beta1.FinalisingStepRemoveBatchRelease: + retry, err := removeBatchRelease(releaseManager.fetchClient(), c) if err != nil { klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) return false, err @@ -467,7 +466,7 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) } klog.Infof("rollout(%s/%s) in step (%s), and success", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep) subStatus.LastUpdateTime = &metav1.Time{Time: time.Now()} - subStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteCanaryService + subStatus.FinalisingStep = v1beta1.FinalisingStepRemoveCanaryService fallthrough // finally, remove the canary service. This step can swap with the last step. /* @@ -478,7 +477,7 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error) stable service selector, then the traffic will route to both v1 and v2 before executing the first step of v3 release. */ - case v1beta1.FinalisingStepTypeDeleteCanaryService: + case v1beta1.FinalisingStepRemoveCanaryService: // ignore the grace period because it is the last step _, err := r.trafficRoutingManager.RemoveCanaryService(tr) if err != nil { @@ -495,7 +494,7 @@ func (r *RolloutReconciler) recalculateCanaryStep(c *RolloutContext) (int32, err if err != nil { return 0, err } - batch, err := releaseManager.fetchBatchRelease(c.Rollout.Namespace, c.Rollout.Name) + batch, err := fetchBatchRelease(releaseManager.fetchClient(), c.Rollout.Namespace, c.Rollout.Name) if errors.IsNotFound(err) { return 1, nil } else if err != nil { @@ -506,7 +505,12 @@ func (r *RolloutReconciler) recalculateCanaryStep(c *RolloutContext) (int32, err if c.NewStatus != nil { currentIndex = c.NewStatus.GetSubStatus().CurrentStepIndex - 1 } - steps := append([]int{}, int(currentIndex)) + steps := make([]int, 0) + // currentIndex may be greater than len(c.Rollout.Spec.Strategy.GetSteps()) if the user changed the release plan + // currentIndex should never be less than 0 theoretically unless the user changed it intentionally + if ci := int(currentIndex); ci >= 0 && ci < len(c.Rollout.Spec.Strategy.GetSteps()) { + steps = append(steps, ci) + } // we don't distinguish between the changes in Replicas and Traffic // Whatever the change is, we recalculate the step.
// we put the current step index first for retrieval, so that if Traffic is the only change, diff --git a/pkg/controller/rollout/rollout_progressing_test.go b/pkg/controller/rollout/rollout_progressing_test.go index d73eb466..0e0f1238 100644 --- a/pkg/controller/rollout/rollout_progressing_test.go +++ b/pkg/controller/rollout/rollout_progressing_test.go @@ -289,7 +289,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" s.CanaryStatus.CurrentStepIndex = 4 s.CanaryStatus.NextStepIndex = 0 - s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepRestoreStableService s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising @@ -334,7 +334,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted // given that the selector of stable Service is removed // we will go on it the next step, i.e. patch restoreGateway - obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeStableService + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepRestoreStableService cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising cond.Status = corev1.ConditionTrue @@ -358,7 +358,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" s.CanaryStatus.CurrentStepIndex = 4 s.CanaryStatus.NextStepIndex = 0 - s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepRouteTrafficToStable s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising @@ -403,7 +403,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted // because the batchRelease hasn't completed (ie. br.Status.Phase is not completed), // it will take more than one reconciles to go on to the next step - obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepPatchBatchRelease cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising cond.Status = corev1.ConditionTrue @@ -427,7 +427,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" s.CanaryStatus.CurrentStepIndex = 4 s.CanaryStatus.NextStepIndex = 0 - s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepPatchBatchRelease s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising @@ -471,7 +471,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted // the batchRelease has completed (ie. br.Status.Phase is completed), // we expect the finalizing step to be next step, i.e. 
deleteBatchRelease - obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeBatchRelease + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepPatchBatchRelease cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising cond.Status = corev1.ConditionTrue @@ -494,7 +494,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" s.CanaryStatus.CurrentStepIndex = 4 s.CanaryStatus.NextStepIndex = 0 - s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepRemoveBatchRelease s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted cond2 := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) cond2.Reason = v1alpha1.ProgressingReasonFinalising @@ -534,7 +534,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted // deleteBatchRelease is the last step, and it won't wait a grace time // after this step, this release should be succeeded - obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepRemoveBatchRelease cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonFinalising cond.Status = corev1.ConditionTrue @@ -740,7 +740,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { s.CanaryStatus.CurrentStepIndex = 3 s.CanaryStatus.CanaryReplicas = 5 s.CanaryStatus.CanaryReadyReplicas = 3 - s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeGateway + s.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepRouteTrafficToStable s.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" s.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade cond := util.GetRolloutCondition(*s, v1beta1.RolloutConditionProgressing) @@ -790,7 +790,7 @@ func TestReconcileRolloutProgressing(t *testing.T) { obj.Status.CanaryStatus.CanaryReadyReplicas = 3 obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2" obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade - obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR + obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepRemoveBatchRelease cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing) cond.Reason = v1alpha1.ProgressingReasonInRolling util.SetRolloutCondition(&obj.Status, *cond) diff --git a/pkg/controller/rollout/rollout_releaseManager.go b/pkg/controller/rollout/rollout_releaseManager.go index ebc82f3e..960c95f7 100644 --- a/pkg/controller/rollout/rollout_releaseManager.go +++ b/pkg/controller/rollout/rollout_releaseManager.go @@ -17,7 +17,18 @@ limitations under the License. 
package rollout import ( + "context" + "fmt" + "reflect" + "github.com/openkruise/rollouts/api/v1beta1" + "github.com/openkruise/rollouts/pkg/util" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/util/retry" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" ) type ReleaseManager interface { @@ -27,8 +38,154 @@ type ReleaseManager interface { doCanaryJump(c *RolloutContext) bool // called when user accomplishes a release / does a rollback, or disables/removes the Rollout Resource doCanaryFinalising(c *RolloutContext) (bool, error) - // fetch the BatchRelease object - fetchBatchRelease(ns, name string) (*v1beta1.BatchRelease, error) - // remove the BatchRelease object - removeBatchRelease(c *RolloutContext) (bool, error) + // create batch release + createBatchRelease(rollout *v1beta1.Rollout, rolloutID string, batch int32, isRollback bool) *v1beta1.BatchRelease + // return client + fetchClient() client.Client +} + +func fetchBatchRelease(cli client.Client, ns, name string) (*v1beta1.BatchRelease, error) { + br := &v1beta1.BatchRelease{} + // batchRelease.name is equal to the related rollout.name + err := cli.Get(context.TODO(), client.ObjectKey{Namespace: ns, Name: name}, br) + return br, err +} + +func removeRolloutProgressingAnnotation(cli client.Client, c *RolloutContext) error { + if c.Workload == nil { + return nil + } + if _, ok := c.Workload.Annotations[util.InRolloutProgressingAnnotation]; !ok { + return nil + } + workloadRef := c.Rollout.Spec.WorkloadRef + workloadGVK := schema.FromAPIVersionAndKind(workloadRef.APIVersion, workloadRef.Kind) + obj := util.GetEmptyWorkloadObject(workloadGVK) + obj.SetNamespace(c.Workload.Namespace) + obj.SetName(c.Workload.Name) + body := fmt.Sprintf(`{"metadata":{"annotations":{"%s":null}}}`, util.InRolloutProgressingAnnotation) + if err := cli.Patch(context.TODO(), obj, client.RawPatch(types.MergePatchType, []byte(body))); err != nil { + klog.Errorf("rollout(%s/%s) patch workload(%s) failed: %s", c.Rollout.Namespace, c.Rollout.Name, c.Workload.Name, err.Error()) + return err + } + klog.Infof("remove rollout(%s/%s) workload(%s) annotation[%s] success", c.Rollout.Namespace, c.Rollout.Name, c.Workload.Name, util.InRolloutProgressingAnnotation) + return nil +} + +// bool means if we need retry; if error is not nil, always retry +func removeBatchRelease(cli client.Client, c *RolloutContext) (bool, error) { + batch := &v1beta1.BatchRelease{} + err := cli.Get(context.TODO(), client.ObjectKey{Namespace: c.Rollout.Namespace, Name: c.Rollout.Name}, batch) + if err != nil && errors.IsNotFound(err) { + return false, nil + } else if err != nil { + klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) + return true, err + } + if !batch.DeletionTimestamp.IsZero() { + klog.Infof("rollout(%s/%s) BatchRelease is terminating, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) + return true, nil + } + + // delete batchRelease + err = cli.Delete(context.TODO(), batch) + if err != nil { + klog.Errorf("rollout(%s/%s) delete BatchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error()) + return true, err + } + klog.Infof("rollout(%s/%s) deleting BatchRelease, and wait a moment", c.Rollout.Namespace, c.Rollout.Name) + return true, nil +} + +// bool means if we need retry; if error is not nil, always retry +func finalizingBatchRelease(cli client.Client, c *RolloutContext) (bool, error) { + br, err :=
fetchBatchRelease(cli, c.Rollout.Namespace, c.Rollout.Name) + if err != nil { + if errors.IsNotFound(err) { + return false, nil + } + return true, err + } + waitReady := c.WaitReady + // The Completed phase means batchRelease controller has processed all it + // should process. If BatchRelease phase is completed, we can do nothing. + if br.Spec.ReleasePlan.BatchPartition == nil && + br.Status.Phase == v1beta1.RolloutPhaseCompleted { + klog.Infof("rollout(%s/%s) finalizing batchRelease(%s) done", c.Rollout.Namespace, c.Rollout.Name, util.DumpJSON(br.Status)) + return false, nil + } + + // If BatchPartition is nil, BatchRelease will directly resume workload via: + // - * set workload Paused = false if it needs; + // - * set workload Partition = null if it needs. + if br.Spec.ReleasePlan.BatchPartition == nil { + // - If checkReady is true, finalizing policy must be "WaitResume"; + // - If checkReady is false, finalizing policy must be NOT "WaitResume"; + // Otherwise, we should correct it. + switch br.Spec.ReleasePlan.FinalizingPolicy { + case v1beta1.WaitResumeFinalizingPolicyType: + if waitReady { // no need to patch again + return true, nil + } + default: + if !waitReady { // no need to patch again + return true, nil + } + } + } + + // Correct finalizing policy. + policy := v1beta1.ImmediateFinalizingPolicyType + if waitReady { + policy = v1beta1.WaitResumeFinalizingPolicyType + } + + // Patch BatchPartition and FinalizingPolicy, BatchPartition always patch null here. + body := fmt.Sprintf(`{"spec":{"releasePlan":{"batchPartition":null,"finalizingPolicy":"%s"}}}`, policy) + if err = cli.Patch(context.TODO(), br, client.RawPatch(types.MergePatchType, []byte(body))); err != nil { + return true, err + } + klog.Infof("rollout(%s/%s) patch batchRelease(%s) success", c.Rollout.Namespace, c.Rollout.Name, body) + return true, nil +} + +func runBatchRelease(m ReleaseManager, rollout *v1beta1.Rollout, rolloutId string, batch int32, isRollback bool) (bool, *v1beta1.BatchRelease, error) { + cli := m.fetchClient() + batch = batch - 1 + br, err := fetchBatchRelease(cli, rollout.Namespace, rollout.Name) + if errors.IsNotFound(err) { + // create new BatchRelease Crd + br = m.createBatchRelease(rollout, rolloutId, batch, isRollback) + if err = cli.Create(context.TODO(), br); err != nil && !errors.IsAlreadyExists(err) { + klog.Errorf("rollout(%s/%s) create BatchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) + return false, nil, err + } + klog.Infof("rollout(%s/%s) create BatchRelease(%s) success", rollout.Namespace, rollout.Name, util.DumpJSON(br)) + return false, br, nil + } else if err != nil { + klog.Errorf("rollout(%s/%s) fetch BatchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) + return false, nil, err + } + + // check whether batchRelease configuration is the latest + newBr := m.createBatchRelease(rollout, rolloutId, batch, isRollback) + if reflect.DeepEqual(br.Spec, newBr.Spec) && reflect.DeepEqual(br.Annotations, newBr.Annotations) { + klog.Infof("rollout(%s/%s) do batchRelease batch(%d) success", rollout.Namespace, rollout.Name, batch+1) + return true, br, nil + } + // update batchRelease to the latest version + if err = retry.RetryOnConflict(retry.DefaultBackoff, func() error { + if err = cli.Get(context.TODO(), client.ObjectKey{Namespace: newBr.Namespace, Name: newBr.Name}, br); err != nil { + klog.Errorf("error getting BatchRelease(%s/%s) from client", newBr.Namespace, newBr.Name) + return err + } + br.Spec = newBr.Spec + br.Annotations = 
newBr.Annotations + return cli.Update(context.TODO(), br) + }); err != nil { + klog.Errorf("rollout(%s/%s) update batchRelease failed: %s", rollout.Namespace, rollout.Name, err.Error()) + return false, nil, err + } + klog.Infof("rollout(%s/%s) update batchRelease(%s) configuration to latest", rollout.Namespace, rollout.Name, util.DumpJSON(br)) + return false, br, nil } diff --git a/pkg/trafficrouting/manager.go b/pkg/trafficrouting/manager.go index 113f6433..243c5999 100644 --- a/pkg/trafficrouting/manager.go +++ b/pkg/trafficrouting/manager.go @@ -34,6 +34,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" "k8s.io/utils/integer" + utilpointer "k8s.io/utils/pointer" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -223,6 +224,41 @@ func (m *Manager) FinalisingTrafficRouting(c *TrafficRoutingContext) (bool, erro return true, nil } +// returns: +// - if error is not nil, usually we need to retry later. Only if error is nil, we consider the bool. +// - The bool value indicates whether retry is needed. If true, it usually means +// gateway resources have been updated and we need to wait for `graceSeconds`. +// +// only if error is nil AND retry is false, this calling can be considered as completed +func (m *Manager) RouteAllTrafficToNewVersion(c *TrafficRoutingContext) (bool, error) { + klog.InfoS("route all traffic to new version", "rollout", c.Key) + if len(c.ObjectRef) == 0 { + return false, nil + } + // build up the network provider + stableService := c.ObjectRef[0].Service + cServiceName := getCanaryServiceName(stableService, c.OnlyTrafficRouting, c.DisableGenerateCanaryService) + trController, err := newNetworkProvider(m.Client, c, stableService, cServiceName) + if err != nil { + klog.Errorf("%s newTrafficRoutingController failed: %s", c.Key, err.Error()) + return false, err + } + graceSeconds := GetGraceSeconds(c.ObjectRef, defaultGracePeriodSeconds) + retry, remaining, err := grace.RunWithGraceSeconds(string(c.OwnerRef.UID), "updateRoute", graceSeconds, func() (bool, error) { + // route all traffic to new version + c.Strategy.Matches = nil + c.Strategy.Traffic = utilpointer.StringPtr("100%") + //NOTE - This return value "verified" has the opposite semantics with "modified" + verified, err := trController.EnsureRoutes(context.TODO(), &c.Strategy) + if !verified { + c.LastUpdateTime = &metav1.Time{Time: time.Now()} + } + return !verified, err + }) + UpdateRecheckDuration(c, remaining) + return retry, err +} + // returns: // - if error is not nil, usually we need to retry later. Only if error is nil, we consider the bool. // - The bool value indicates whether retry is needed. 
If true, it usually means @@ -437,11 +473,11 @@ func (m *Manager) RestoreStableService(c *TrafficRoutingContext) (bool, error) { serviceName := c.ObjectRef[0].Service err := m.Get(context.TODO(), client.ObjectKey{Namespace: c.Namespace, Name: serviceName}, stableService) if errors.IsNotFound(err) { - return true, nil + return false, nil } if err != nil { klog.Errorf("%s get stable service(%s) failed: %s", c.Key, serviceName, err.Error()) - return false, err + return true, err } // restore stable Service diff --git a/pkg/trafficrouting/manager_test.go b/pkg/trafficrouting/manager_test.go index 2ce96fd3..354416e9 100644 --- a/pkg/trafficrouting/manager_test.go +++ b/pkg/trafficrouting/manager_test.go @@ -1293,8 +1293,10 @@ func TestRestoreGateway(t *testing.T) { // the second call, it should be no error and no retry time.Sleep(1 * time.Second) retry, err = manager.RestoreGateway(c) - if err != nil || retry { - t.Fatalf("RestoreGateway failed: %s", err) + if err != nil { + t.Fatalf("RestoreGateway failed: %s", err.Error()) + } else if retry { + t.Fatalf("RestoreGateway failed: retry should be false") } }) } @@ -1403,8 +1405,175 @@ func TestRemoveCanaryService(t *testing.T) { // the second call, it should be no error and no retry time.Sleep(1 * time.Second) retry, err = manager.RemoveCanaryService(c) - if err != nil || retry { - t.Fatalf("RemoveCanaryService failed: %s", err) + if err != nil { + t.Fatalf("RemoveCanaryService failed: %s", err.Error()) + } else if retry { + t.Fatalf("RemoveCanaryService failed: retry should be false") + } + }) + } +} + +func TestRouteAllTrafficToNewVersion(t *testing.T) { + cases := []struct { + name string + getObj func() ([]*corev1.Service, []*netv1.Ingress) + getRollout func() (*v1beta1.Rollout, *util.Workload) + onlyTrafficRouting bool + expectObj func() ([]*corev1.Service, []*netv1.Ingress) + expectNotFound func() ([]*corev1.Service, []*netv1.Ingress) + retry bool + }{ + { + name: "Route all traffic test1", + getObj: func() ([]*corev1.Service, []*netv1.Ingress) { + s1 := demoService.DeepCopy() + s2 := demoService.DeepCopy() + s2.Name = "echoserver-canary" + s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" + c1 := demoIngress.DeepCopy() + c2 := demoIngress.DeepCopy() + c2.Name = "echoserver-canary" + c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" + c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100" + c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" + return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + }, + getRollout: func() (*v1beta1.Rollout, *util.Workload) { + obj := demoRollout.DeepCopy() + obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + obj.Status.CanaryStatus.CurrentStepIndex = 4 + obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-time.Hour)} + return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} + }, + expectObj: func() ([]*corev1.Service, []*netv1.Ingress) { + // service and ingress remain unchanged + s1 := demoService.DeepCopy() + s2 := demoService.DeepCopy() + s2.Name = "echoserver-canary" + s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" + c1 := demoIngress.DeepCopy() + c2 := demoIngress.DeepCopy() + c2.Name = "echoserver-canary" + c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" + c2.Annotations[fmt.Sprintf("%s/canary-weight", 
nginxIngressAnnotationDefaultPrefix)] = "100" + c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" + return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + }, + expectNotFound: func() ([]*corev1.Service, []*netv1.Ingress) { + return nil, nil + }, + retry: false, + }, + { + name: "Route all traffic test2", + getObj: func() ([]*corev1.Service, []*netv1.Ingress) { + s1 := demoService.DeepCopy() + s2 := demoService.DeepCopy() + s2.Name = "echoserver-canary" + s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" + c1 := demoIngress.DeepCopy() + c2 := demoIngress.DeepCopy() + c2.Name = "echoserver-canary" + c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" + c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "50" + c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" + return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + }, + getRollout: func() (*v1beta1.Rollout, *util.Workload) { + obj := demoRollout.DeepCopy() + obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateCompleted + obj.Status.CanaryStatus.CurrentStepIndex = 4 + obj.Status.CanaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now().Add(-time.Hour)} + return obj, &util.Workload{RevisionLabelKey: apps.DefaultDeploymentUniqueLabelKey} + }, + expectObj: func() ([]*corev1.Service, []*netv1.Ingress) { + // services remain unchanged; the canary ingress weight is expected to be updated to 100 + s1 := demoService.DeepCopy() + s2 := demoService.DeepCopy() + s2.Name = "echoserver-canary" + s2.Spec.Selector[apps.DefaultDeploymentUniqueLabelKey] = "podtemplatehash-v2" + c1 := demoIngress.DeepCopy() + c2 := demoIngress.DeepCopy() + c2.Name = "echoserver-canary" + c2.Annotations[fmt.Sprintf("%s/canary", nginxIngressAnnotationDefaultPrefix)] = "true" + c2.Annotations[fmt.Sprintf("%s/canary-weight", nginxIngressAnnotationDefaultPrefix)] = "100" + c2.Spec.Rules[0].HTTP.Paths[0].Backend.Service.Name = "echoserver-canary" + return []*corev1.Service{s1, s2}, []*netv1.Ingress{c1, c2} + }, + expectNotFound: func() ([]*corev1.Service, []*netv1.Ingress) { + return nil, nil + }, + retry: true, + }, + } + + for _, cs := range cases { + t.Run(cs.name, func(t *testing.T) { + ss, ig := cs.getObj() + cli := fake.NewClientBuilder().WithScheme(scheme).WithObjects(ig[0], ss[0], demoConf.DeepCopy()).Build() + if len(ss) == 2 { + _ = cli.Create(context.TODO(), ss[1]) + } + if len(ig) == 2 { + _ = cli.Create(context.TODO(), ig[1]) + } + rollout, workload := cs.getRollout() + newStatus := rollout.Status.DeepCopy() + currentStep := rollout.Spec.Strategy.Canary.Steps[newStatus.CanaryStatus.CurrentStepIndex-1] + c := &TrafficRoutingContext{ + Key: fmt.Sprintf("Rollout(%s/%s)", rollout.Namespace, rollout.Name), + Namespace: rollout.Namespace, + ObjectRef: rollout.Spec.Strategy.Canary.TrafficRoutings, + Strategy: currentStep.TrafficRoutingStrategy, + OwnerRef: *metav1.NewControllerRef(rollout, v1beta1.SchemeGroupVersion.WithKind("Rollout")), + RevisionLabelKey: workload.RevisionLabelKey, + StableRevision: newStatus.CanaryStatus.StableRevision, + CanaryRevision: newStatus.CanaryStatus.PodTemplateHash, + LastUpdateTime: newStatus.CanaryStatus.LastUpdateTime, + OnlyTrafficRouting: cs.onlyTrafficRouting, + } + manager := NewTrafficRoutingManager(cli) + retry, err := manager.RouteAllTrafficToNewVersion(c) + if err != nil { + t.Fatalf("RouteAllTrafficToNewVersion first failed: %s", err.Error()) + } + if retry != cs.retry { + t.Fatalf("RouteAllTrafficToNewVersion
expect(%v), but get(%v)", cs.retry, retry) + } + ss, ig = cs.expectObj() + for _, obj := range ss { + checkObjEqual(cli, t, obj) + } + for _, obj := range ig { + checkObjEqual(cli, t, obj) + } + + ss, ig = cs.expectNotFound() + for _, obj := range ss { + checkNotFound(cli, t, obj) + } + for _, obj := range ig { + checkNotFound(cli, t, obj) + } + // if done, no need check again + if !cs.retry { + return + } + // the second call, it should be no error and no retry + time.Sleep(1 * time.Second) + retry, err = manager.RouteAllTrafficToNewVersion(c) + if err != nil { + t.Fatalf("RouteAllTrafficToNewVersion failed: %s", err.Error()) + } else if retry { + t.Fatalf("RouteAllTrafficToNewVersion failed: retry should be false") + } + ss, ig = cs.expectObj() + for _, obj := range ss { + checkObjEqual(cli, t, obj) + } + for _, obj := range ig { + checkObjEqual(cli, t, obj) } }) } diff --git a/pkg/util/controller_finder.go b/pkg/util/controller_finder.go index 886b8191..50030561 100644 --- a/pkg/util/controller_finder.go +++ b/pkg/util/controller_finder.go @@ -87,19 +87,19 @@ func NewControllerFinder(c client.Client) *ControllerFinder { func (r *ControllerFinder) GetWorkloadForRef(rollout *rolloutv1beta1.Rollout) (*Workload, error) { workloadRef := rollout.Spec.WorkloadRef - if rollout.Spec.Strategy.GetRollingStyle() == rolloutv1beta1.CanaryRollingStyle { - for _, finder := range append(r.canaryStyleFinders(), r.partitionStyleFinders()...) { - workload, err := finder(rollout.Namespace, &workloadRef) - if workload != nil || err != nil { - return workload, err - } - } - } else { - for _, finder := range r.partitionStyleFinders() { - workload, err := finder(rollout.Namespace, &workloadRef) - if workload != nil || err != nil { - return workload, err - } + var finders []ControllerFinderFunc + switch rollout.Spec.Strategy.GetRollingStyle() { + case rolloutv1beta1.CanaryRollingStyle: + finders = append(r.canaryStyleFinders(), r.partitionStyleFinders()...) 
+ case rolloutv1beta1.BlueGreenRollingStyle: + finders = r.bluegreenStyleFinders() + default: + finders = r.partitionStyleFinders() + } + for _, finder := range finders { + workload, err := finder(rollout.Namespace, &workloadRef) + if workload != nil || err != nil { + return workload, err } } @@ -115,6 +115,10 @@ func (r *ControllerFinder) partitionStyleFinders() []ControllerFinderFunc { return []ControllerFinderFunc{r.getKruiseCloneSet, r.getAdvancedDeployment, r.getStatefulSetLikeWorkload, r.getKruiseDaemonSet} } +func (r *ControllerFinder) bluegreenStyleFinders() []ControllerFinderFunc { + return []ControllerFinderFunc{r.getKruiseCloneSet, r.getAdvancedDeployment} +} + var ( ControllerKindRS = apps.SchemeGroupVersion.WithKind("ReplicaSet") ControllerKindDep = apps.SchemeGroupVersion.WithKind("Deployment") diff --git a/pkg/util/controller_finder_test.go b/pkg/util/controller_finder_test.go new file mode 100644 index 00000000..6c875f34 --- /dev/null +++ b/pkg/util/controller_finder_test.go @@ -0,0 +1,450 @@ +package util + +import ( + "context" + "fmt" + + // "reflect" + "strconv" + "testing" + + "math/rand" + + appsv1alpha1 "github.com/openkruise/kruise-api/apps/v1alpha1" + rolloutv1alpha1 "github.com/openkruise/rollouts/api/v1alpha1" + rolloutv1beta1 "github.com/openkruise/rollouts/api/v1beta1" + apps "k8s.io/api/apps/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +var namespace string = "unit-test" +var demoRollout rolloutv1beta1.Rollout = rolloutv1beta1.Rollout{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Name: "rollout-demo", + Labels: map[string]string{}, + Annotations: map[string]string{}, + }, + Spec: rolloutv1beta1.RolloutSpec{ + WorkloadRef: rolloutv1beta1.ObjectRef{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "deployment-demo", + }, + Strategy: rolloutv1beta1.RolloutStrategy{ + BlueGreen: &rolloutv1beta1.BlueGreenStrategy{}, + }, + }, + Status: rolloutv1beta1.RolloutStatus{}, +} + +func TestGetWorkloadForRef(t *testing.T) { + cases := []struct { + name string + getRollout func() *rolloutv1beta1.Rollout + getWorkload func() (*apps.Deployment, *apps.ReplicaSet, *appsv1alpha1.CloneSet) + expectWorkload func() *Workload + err error + }{ + { + name: "cloneset, not in rollout progress", + getRollout: func() *rolloutv1beta1.Rollout { + rollout := demoRollout.DeepCopy() + rollout.Spec.WorkloadRef = rolloutv1beta1.ObjectRef{ + APIVersion: "apps.kruise.io/v1alpha1", + Kind: "CloneSet", + Name: "cloneset-demo", + } + return rollout + }, + getWorkload: func() (*apps.Deployment, *apps.ReplicaSet, *appsv1alpha1.CloneSet) { + return nil, nil, cloneset.DeepCopy() + }, + expectWorkload: func() *Workload { + return &Workload{ + TypeMeta: metav1.TypeMeta{ + Kind: "CloneSet", + APIVersion: "apps.kruise.io/v1alpha1", + }, + ObjectMeta: metav1.ObjectMeta{ + Namespace: "unit-test", + Name: "cloneset-demo", + Annotations: map[string]string{ + "rollouts.kruise.io/unit-test-anno": "true", + }, + Labels: map[string]string{ + "rollouts.kruise.io/unit-test-label": "true", + "rollouts.kruise.io/stable-revision": "stable", + }, + }, + Replicas: 10, + StableRevision: "version1", + CanaryRevision: "version2", + PodTemplateHash: "version2", + RevisionLabelKey: "pod-template-hash", + IsStatusConsistent: true, + } + }, + }, + { + name: "cloneset,in rollout progress", + getRollout: func() *rolloutv1beta1.Rollout { + rollout := demoRollout.DeepCopy() + rollout.Spec.WorkloadRef = 
rolloutv1beta1.ObjectRef{ + APIVersion: "apps.kruise.io/v1alpha1", + Kind: "CloneSet", + Name: "cloneset-demo", + } + return rollout + }, + getWorkload: func() (*apps.Deployment, *apps.ReplicaSet, *appsv1alpha1.CloneSet) { + cs := cloneset.DeepCopy() + cs.Annotations[InRolloutProgressingAnnotation] = "true" + return nil, nil, cs + }, + expectWorkload: func() *Workload { + return &Workload{ + TypeMeta: metav1.TypeMeta{ + Kind: "CloneSet", + APIVersion: "apps.kruise.io/v1alpha1", + }, + ObjectMeta: metav1.ObjectMeta{ + Namespace: "unit-test", + Name: "cloneset-demo", + Annotations: map[string]string{ + "rollouts.kruise.io/unit-test-anno": "true", + }, + Labels: map[string]string{ + "rollouts.kruise.io/unit-test-label": "true", + "rollouts.kruise.io/stable-revision": "stable", + }, + }, + Replicas: 10, + StableRevision: "version1", + CanaryRevision: "version2", + PodTemplateHash: "version2", + RevisionLabelKey: "pod-template-hash", + IsStatusConsistent: true, + InRolloutProgressing: true, + } + }, + }, + { + name: "in rollback progress", + getRollout: func() *rolloutv1beta1.Rollout { + rollout := demoRollout.DeepCopy() + rollout.Spec.WorkloadRef = rolloutv1beta1.ObjectRef{ + APIVersion: "apps.kruise.io/v1alpha1", + Kind: "CloneSet", + Name: "cloneset-demo", + } + return rollout + }, + getWorkload: func() (*apps.Deployment, *apps.ReplicaSet, *appsv1alpha1.CloneSet) { + cs := cloneset.DeepCopy() + cs.Annotations[InRolloutProgressingAnnotation] = "true" + cs.Status.CurrentRevision = "version2" + cs.Status.UpdateRevision = "version2" + cs.Status.UpdatedReplicas = 5 + return nil, nil, cs + }, + expectWorkload: func() *Workload { + return &Workload{ + TypeMeta: metav1.TypeMeta{ + Kind: "CloneSet", + APIVersion: "apps.kruise.io/v1alpha1", + }, + ObjectMeta: metav1.ObjectMeta{ + Namespace: "unit-test", + Name: "cloneset-demo", + Annotations: map[string]string{ + "rollouts.kruise.io/unit-test-anno": "true", + }, + Labels: map[string]string{ + "rollouts.kruise.io/unit-test-label": "true", + "rollouts.kruise.io/stable-revision": "stable", + }, + }, + Replicas: 10, + StableRevision: "version2", + CanaryRevision: "version2", + PodTemplateHash: "version2", + RevisionLabelKey: "pod-template-hash", + IsStatusConsistent: true, + InRolloutProgressing: true, + IsInRollback: true, + } + }, + }, + { + name: "deployment: not in rollout progress", + getRollout: func() *rolloutv1beta1.Rollout { + rollout := demoRollout.DeepCopy() + rollout.Spec.WorkloadRef = rolloutv1beta1.ObjectRef{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "deployment-demo", + } + return rollout + }, + getWorkload: func() (*apps.Deployment, *apps.ReplicaSet, *appsv1alpha1.CloneSet) { + dep := deployment.DeepCopy() + dep.Labels[rolloutv1alpha1.DeploymentStableRevisionLabel] = "stable" + rs := generateRS(*dep) + rs.Namespace = namespace + rs.Spec.Replicas = dep.Spec.Replicas + rs.Labels[apps.DefaultDeploymentUniqueLabelKey] = "5cf7c88b4" + return dep, &rs, nil + }, + expectWorkload: func() *Workload { + return &Workload{ + TypeMeta: metav1.TypeMeta{ + Kind: "Deployment", + APIVersion: "apps/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Namespace: "unit-test", + Name: "deployment-demo", + Annotations: map[string]string{ + "rollouts.kruise.io/unit-test-anno": "true", + }, + Labels: map[string]string{ + "rollouts.kruise.io/unit-test-label": "true", + "rollouts.kruise.io/stable-revision": "stable", + }, + }, + Replicas: 10, + StableRevision: "stable", + CanaryRevision: "5cf7c88b4", + PodTemplateHash: "", + RevisionLabelKey: 
"pod-template-hash", + IsStatusConsistent: true, + } + }, + }, + { + name: "in rollout progress", + getRollout: func() *rolloutv1beta1.Rollout { + rollout := demoRollout.DeepCopy() + rollout.Spec.WorkloadRef = rolloutv1beta1.ObjectRef{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "deployment-demo", + } + return rollout + }, + getWorkload: func() (*apps.Deployment, *apps.ReplicaSet, *appsv1alpha1.CloneSet) { + dep := deployment.DeepCopy() + dep.Labels[rolloutv1alpha1.DeploymentStableRevisionLabel] = "stable" + dep.Annotations[InRolloutProgressingAnnotation] = "true" + rs := generateRS(*dep) + rs.Namespace = namespace + rs.Spec.Replicas = dep.Spec.Replicas + rs.Labels[apps.DefaultDeploymentUniqueLabelKey] = "5cf7c88b4" + return dep, &rs, nil + }, + expectWorkload: func() *Workload { + return &Workload{ + TypeMeta: metav1.TypeMeta{ + Kind: "Deployment", + APIVersion: "apps/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Namespace: "unit-test", + Name: "deployment-demo", + }, + Replicas: 10, + StableRevision: "stable", + CanaryRevision: "5cf7c88b4", + PodTemplateHash: "5cf7c88b4", + RevisionLabelKey: "pod-template-hash", + IsStatusConsistent: true, + InRolloutProgressing: true, + } + }, + }, + { + name: "in rollback", + getRollout: func() *rolloutv1beta1.Rollout { + rollout := demoRollout.DeepCopy() + rollout.Spec.WorkloadRef = rolloutv1beta1.ObjectRef{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "deployment-demo", + } + return rollout + }, + getWorkload: func() (*apps.Deployment, *apps.ReplicaSet, *appsv1alpha1.CloneSet) { + dep := deployment.DeepCopy() + dep.Labels[rolloutv1alpha1.DeploymentStableRevisionLabel] = "stable" + dep.Annotations[InRolloutProgressingAnnotation] = "true" + rs := generateRS(*dep) + rs.Namespace = namespace + rs.Spec.Replicas = dep.Spec.Replicas + // the newst revision is stable + rs.Labels[apps.DefaultDeploymentUniqueLabelKey] = "stable" + return dep, &rs, nil + }, + expectWorkload: func() *Workload { + return &Workload{ + TypeMeta: metav1.TypeMeta{ + Kind: "Deployment", + APIVersion: "apps/v1", + }, + ObjectMeta: metav1.ObjectMeta{ + Namespace: "unit-test", + Name: "deployment-demo", + }, + Replicas: 10, + StableRevision: "stable", + CanaryRevision: "5cf7c88b4", + PodTemplateHash: "stable", + RevisionLabelKey: "pod-template-hash", + IsStatusConsistent: true, + InRolloutProgressing: true, + IsInRollback: true, + } + }, + }, + { + name: "not consistent", + getRollout: func() *rolloutv1beta1.Rollout { + rollout := demoRollout.DeepCopy() + rollout.Spec.WorkloadRef = rolloutv1beta1.ObjectRef{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "deployment-demo", + } + return rollout + }, + getWorkload: func() (*apps.Deployment, *apps.ReplicaSet, *appsv1alpha1.CloneSet) { + dep := deployment.DeepCopy() + // modify generation + dep.Generation = 12 + dep.Labels[rolloutv1alpha1.DeploymentStableRevisionLabel] = "stable" + dep.Annotations[InRolloutProgressingAnnotation] = "true" + rs := generateRS(*dep) + rs.Namespace = namespace + rs.Spec.Replicas = dep.Spec.Replicas + rs.Labels[apps.DefaultDeploymentUniqueLabelKey] = "5cf7c88b4" + return dep, &rs, nil + }, + expectWorkload: func() *Workload { + return &Workload{ + IsStatusConsistent: false, + } + }, + }, + } + for _, cs := range cases { + t.Run(cs.name, func(t *testing.T) { + rollout := cs.getRollout() + dp, rs, cloneset := cs.getWorkload() + cli := fake.NewClientBuilder().WithScheme(scheme).WithObjects(rollout).Build() + if dp != nil { + _ = cli.Create(context.TODO(), rs) + } + if rs != nil { + _ = 
cli.Create(context.TODO(), dp) + } + if cloneset != nil { + _ = cli.Create(context.TODO(), cloneset) + } + finder := NewControllerFinder(cli) + workload, err := finder.GetWorkloadForRef(rollout) + if !checkWorkloadEqual(workload, cs.expectWorkload()) { + t.Fatal("expected workload does not equal the actual workload") + } + if res := checkErrorEqual(err, cs.err); res != "" { + t.Fatal(res) + } + }) + } +} + +func checkErrorEqual(g, e error) string { + gotError, expectedError := "none", "none" + if g != nil { + gotError = g.Error() + } + if e != nil { + expectedError = e.Error() + } + if gotError != expectedError { + return fmt.Sprintf("expected error %s, but got error %s", expectedError, gotError) + } + return "" +} + +// checkWorkloadEqual compares two Workload pointers for equality. +func checkWorkloadEqual(a, b *Workload) bool { + if a == nil && b == nil { + return true + } + if a == nil || b == nil { + return false + } + + // Compare TypeMeta + if a.Kind != b.Kind || a.APIVersion != b.APIVersion { + return false + } + + // Compare ObjectMeta for specified fields (Namespace, Name) + if !objectMetaEqual(a.ObjectMeta, b.ObjectMeta) { + return false + } + + // Compare other fields + return a.Replicas == b.Replicas && + a.StableRevision == b.StableRevision && + a.CanaryRevision == b.CanaryRevision && + a.PodTemplateHash == b.PodTemplateHash && + a.RevisionLabelKey == b.RevisionLabelKey && + a.IsInRollback == b.IsInRollback && + a.InRolloutProgressing == b.InRolloutProgressing && + a.IsStatusConsistent == b.IsStatusConsistent +} + +// objectMetaEqual compares the specified fields of ObjectMeta. +func objectMetaEqual(a, b metav1.ObjectMeta) bool { + return a.Namespace == b.Namespace && + a.Name == b.Name +} + +func generateRS(deployment apps.Deployment) apps.ReplicaSet { + template := deployment.Spec.Template.DeepCopy() + return apps.ReplicaSet{ + ObjectMeta: metav1.ObjectMeta{ + UID: randomUID(), + Name: randomName("replicaset"), + Labels: template.Labels, + OwnerReferences: []metav1.OwnerReference{*newDControllerRef(&deployment)}, + }, + Spec: apps.ReplicaSetSpec{ + Replicas: new(int32), + Template: *template, + Selector: &metav1.LabelSelector{MatchLabels: template.Labels}, + }, + } +} +func randomUID() types.UID { + return types.UID(strconv.FormatInt(rand.Int63(), 10)) +} + +func randomName(prefix string) string { + return fmt.Sprintf("%s-%s", prefix, strconv.FormatInt(5, 10)) +} + +func newDControllerRef(d *apps.Deployment) *metav1.OwnerReference { + isController := true + return &metav1.OwnerReference{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: d.GetName(), + UID: d.GetUID(), + Controller: &isController, + } +} diff --git a/pkg/webhook/rollout/validating/rollout_create_update_handler.go b/pkg/webhook/rollout/validating/rollout_create_update_handler.go index 653b0bf6..49b727cf 100644 --- a/pkg/webhook/rollout/validating/rollout_create_update_handler.go +++ b/pkg/webhook/rollout/validating/rollout_create_update_handler.go @@ -160,6 +160,10 @@ func (h *RolloutCreateUpdateHandler) validateRolloutUpdate(oldObj, newObj *appsv if oldObj.Spec.Strategy.GetRollingStyle() != newObj.Spec.Strategy.GetRollingStyle() { return field.ErrorList{field.Forbidden(field.NewPath("Spec.Strategy.Canary|BlueGreen"), "Rollout style and enableExtraWorkloadForCanary are immutable")} } + // forbid adding or removing steps during rollout so that the code can be simpler + if len(oldObj.Spec.Strategy.GetSteps()) != len(newObj.Spec.Strategy.GetSteps()) { + return
field.ErrorList{field.Forbidden(field.NewPath("Spec.Strategy.Canary|BlueGreen"), "The number of Rollout steps is immutable")} + } } /*if newObj.Status.CanaryStatus != nil && newObj.Status.CanaryStatus.CurrentStepState == appsv1beta1.CanaryStepStateReady {
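For context, a minimal sketch (not part of the patch) of how a release manager is expected to consume the shared helpers added in pkg/controller/rollout/rollout_releaseManager.go. The wrapper name patchBatchReleaseStep and the stepDone result are illustrative assumptions; ReleaseManager, RolloutContext, fetchClient and finalizingBatchRelease come from the code above:

func patchBatchReleaseStep(m ReleaseManager, c *RolloutContext) (stepDone bool, err error) {
	// finalizingBatchRelease patches batchPartition=null plus a finalizing policy
	// derived from c.WaitReady, then reports whether another reconcile is needed.
	retry, err := finalizingBatchRelease(m.fetchClient(), c)
	if err != nil {
		return false, err // error: always requeue and retry
	}
	if retry {
		return false, nil // just patched or still resuming: requeue after the grace period
	}
	// Completed with BatchPartition == nil (or already deleted): move on to the next FinalisingStepType
	return true, nil
}

The same (needRetry bool, err error) convention is used by removeBatchRelease and by the traffic-routing helpers (RestoreGateway, RestoreStableService, RemoveCanaryService, RouteAllTrafficToNewVersion), so each finalising case in the reconciler only falls through to the next step once both values report completion.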