Skip to content

Commit

Permalink
improve finalising logic for canary release
Browse files Browse the repository at this point in the history
Signed-off-by: yunbo <[email protected]>
  • Loading branch information
Funinu committed Aug 29, 2024
1 parent 5378dc2 commit 15b589d
Show file tree
Hide file tree
Showing 9 changed files with 284 additions and 57 deletions.
2 changes: 0 additions & 2 deletions api/v1alpha1/rollout_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,6 @@ type RolloutStatus struct {
// Conditions a list of conditions a rollout can have.
// +optional
Conditions []RolloutCondition `json:"conditions,omitempty"`
// +optional
//BlueGreenStatus *BlueGreenStatus `json:"blueGreenStatus,omitempty"`
// Phase is the rollout phase.
Phase RolloutPhase `json:"phase,omitempty"`
// Message provides details on why the rollout is in its current phase
Expand Down
24 changes: 12 additions & 12 deletions api/v1beta1/rollout_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ type CanaryStatus struct {
// BlueGreenStatus status fields that only pertain to the blueGreen rollout
type BlueGreenStatus struct {
CommonStatus `json:",inline"`
// CanaryRevision is calculated by rollout based on podTemplateHash, and the internal logic flow uses
// UpdatedRevision is calculated by rollout based on podTemplateHash, and the internal logic flow uses
// It may be different from rs podTemplateHash in different k8s versions, so it cannot be used as service selector label
UpdatedRevision string `json:"updatedRevision"`
// UpdatedReplicas the numbers of updated pods
Expand Down Expand Up @@ -558,29 +558,29 @@ const (
type FinalisingStepType string

const (
// some work that should be done before pod scaling down.
// For BlueGreenStrategy:
// we route all traffic to stable or new version based on FinaliseReason
// For CanaryStrategy:
// we remove the selector of stable service
FinalisingStepTypePreparing FinalisingStepType = "Preparing"
// Route all traffic to stable or new version based on FinaliseReason (for bluegreen)
FinalisingStepTypeRouteAllTraffic FinalisingStepType = "RouteAllTraffic"
// Restore the stable Service, i.e. remove corresponding selector
FinalisingStepTypeStableService FinalisingStepType = "RestoreStableService"
// Remove the canary Service
FinalisingStepTypeRemoveCanaryService FinalisingStepType = "RemoveCanaryService"

// Patch Batch Release to scale down (exception: the canary Deployment will be
// scaled down in FinalisingStepTypeDeleteBR step)
// For Both BlueGreenStrategy and CanaryStrategy:
// set workload.pause=false, set workload.partition=0
FinalisingStepTypeBatchRelease FinalisingStepType = "PatchBatchRelease"
//TODO - Currently, the next three steps are in the same function, FinalisingTrafficRouting
// we should try to separate the FinalisingStepTypeGateway and FinalisingStepTypeCanaryService
// with graceful time to prevent some potential issues

// Restore the stable Service (i.e. remove corresponding selector)
FinalisingStepTypeStableService FinalisingStepType = "RestoreStableService"
// Execute the FinalisingTrafficRouting function
FinalisingStepTypeTrafficRouting FinalisingStepType = "FinalisingTrafficRouting"
// Restore the GatewayAPI/Ingress/Istio
FinalisingStepTypeGateway FinalisingStepType = "RestoreGateway"
// Delete Canary Service
FinalisingStepTypeDeleteCanaryService FinalisingStepType = "DeleteCanaryService"
// Delete Batch Release
FinalisingStepTypeDeleteBR FinalisingStepType = "DeleteBatchRelease"
// All needed work done
FinalisingStepTypeEnd FinalisingStepType = "END"
)

// +genclient
Expand Down
5 changes: 2 additions & 3 deletions config/crd/bases/rollouts.kruise.io_rollouts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -534,8 +534,7 @@ spec:
format: int64
type: integer
phase:
description: BlueGreenStatus *BlueGreenStatus `json:"blueGreenStatus,omitempty"`
Phase is the rollout phase.
description: Phase is the rollout phase.
type: string
type: object
type: object
Expand Down Expand Up @@ -1475,7 +1474,7 @@ spec:
format: int32
type: integer
updatedRevision:
description: CanaryRevision is calculated by rollout based on
description: UpdatedRevision is calculated by rollout based on
podTemplateHash, and the internal logic flow uses It may be
different from rs podTemplateHash in different k8s versions,
so it cannot be used as service selector label
Expand Down
132 changes: 104 additions & 28 deletions pkg/controller/rollout/rollout_canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -363,43 +363,84 @@ func (m *canaryReleaseManager) doCanaryJump(c *RolloutContext) (jumped bool) {

// cleanup after rollout is completed or finished
func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, error) {
canaryStatus := c.NewStatus.CanaryStatus
// when CanaryStatus is nil, which means canary action hasn't started yet, don't need doing cleanup
if c.NewStatus.CanaryStatus == nil {
if canaryStatus == nil {
return true, nil
}
// 1. rollout progressing complete, remove rollout progressing annotation in workload
// rollout progressing complete, remove rollout progressing annotation in workload
err := m.removeRolloutProgressingAnnotation(c)
if err != nil {
return false, err
}
tr := newTrafficRoutingContext(c)
// 2. remove stable service the pod revision selector, so stable service will be selector all version pods.
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil || !done {
return done, err
}
// 3. set workload.pause=false; set workload.partition=0
done, err = m.finalizingBatchRelease(c)
if err != nil || !done {
return done, err
}
// 4. modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods.
done, err = m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil || !done {
return done, err
}
// 5. delete batchRelease crd
done, err = m.removeBatchRelease(c)
if err != nil {
klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
} else if !done {
return false, nil
// execute steps based on the predefined order for each reason
nextStep := nextTask(c.FinalizeReason, canaryStatus.FinalisingStep)
// if current step is empty, set it with the first step
// if current step is end, we just return
if len(canaryStatus.FinalisingStep) == 0 {
canaryStatus.FinalisingStep = nextStep
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
} else if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
klog.Infof("rollout(%s/%s) finalising process is already completed", c.Rollout.Namespace, c.Rollout.Name)
return true, nil
}
klog.Infof("rollout(%s/%s) doCanaryFinalising success", c.Rollout.Namespace, c.Rollout.Name)
return true, nil
klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, canaryStatus.FinalisingStep)
// the order of the steps is maintained by nextTask
switch canaryStatus.FinalisingStep {
// call the FinalisingTrafficRouting function to:
// 1.restore stable service selector to select all pods
// 2.restore network api(ingress/ gateway api/ istio) configuration
// 3.delete canary service
case v1beta1.FinalisingStepTypeTrafficRouting:
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
if err != nil || !done {
canaryStatus.LastUpdateTime = tr.LastUpdateTime
return done, err
}

// set workload.pause=false; set workload.partition=0
case v1beta1.FinalisingStepTypeBatchRelease:
done, err := m.finalizingBatchRelease(c)
if err != nil || !done {
return done, err
}
// delete batchRelease
case v1beta1.FinalisingStepTypeDeleteBR:
done, err := m.removeBatchRelease(c)
if err != nil {
klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
} else if !done {
return false, nil
}
// restore the gateway resources (ingress/gatewayAPI/Istio), that means
// only stable Service will accept the traffic
case v1beta1.FinalisingStepTypeGateway:
retry, err := m.trafficRoutingManager.RestoreGateway(tr)
if err != nil || retry {
return false, err
}
// restore the stable service
case v1beta1.FinalisingStepTypeStableService:
retry, err := m.trafficRoutingManager.RestoreStableService(tr)
if err != nil || retry {
return false, err
}
// remove canary service
case v1beta1.FinalisingStepTypeRemoveCanaryService:
retry, err := m.trafficRoutingManager.RemoveCanaryService(tr)
if err != nil || retry {
return false, err
}
}
// current step is done, run the next step
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
canaryStatus.FinalisingStep = nextStep
if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
return true, nil
}
return false, nil
}

func (m *canaryReleaseManager) removeRolloutProgressingAnnotation(c *RolloutContext) error {
Expand Down Expand Up @@ -601,3 +642,38 @@ func (m *canaryReleaseManager) syncBatchRelease(br *v1beta1.BatchRelease, canary
}
return nil
}

// nextTask returns the finalising step that follows currentTask in the
// ordered step list selected by reason.
//
//   - An empty currentTask yields the first step of the list.
//   - A currentTask that is the last step of the list, or that does not
//     appear in the list at all, yields FinalisingStepTypeEnd.
//
// REVIEW: more complex scenarios are not handled here, e.g. the user rolls
// back the workload and disables the Rollout at the same time.
func nextTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.FinalisingStepType {
	// Order used for every reason other than rollback
	// (success / disabled / deleting rollout).
	steps := []v1beta1.FinalisingStepType{
		v1beta1.FinalisingStepTypeTrafficRouting, // remove selector of stable Service
		v1beta1.FinalisingStepTypeBatchRelease,   // scale up new, scale down old
		v1beta1.FinalisingStepTypeDeleteBR,
	}
	if reason == v1beta1.FinaliseReasonRollback {
		// Rollback uses the fine-grained steps instead of the combined
		// FinalisingStepTypeTrafficRouting step.
		steps = []v1beta1.FinalisingStepType{
			v1beta1.FinalisingStepTypeGateway,      // route all traffic to stable version
			v1beta1.FinalisingStepTypeBatchRelease, // scale up old, scale down new
			v1beta1.FinalisingStepTypeDeleteBR,
			v1beta1.FinalisingStepTypeStableService,
			v1beta1.FinalisingStepTypeRemoveCanaryService,
		}
	}

	// Nothing has run yet: start from the head of the list.
	if currentTask == "" {
		return steps[0]
	}

	// Only positions that have a successor are inspected, so a match on the
	// final step falls through to End just like an unknown step does.
	for idx := 0; idx+1 < len(steps); idx++ {
		if steps[idx] == currentTask {
			return steps[idx+1]
		}
	}
	return v1beta1.FinalisingStepTypeEnd
}
4 changes: 4 additions & 0 deletions pkg/controller/rollout/rollout_progressing.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ type RolloutContext struct {
RecheckTime *time.Time
// wait stable workload pods ready
WaitReady bool
// finalising reason
FinalizeReason string
}

// parameter1 retryReconcile, parameter2 error
Expand Down Expand Up @@ -116,6 +118,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout
klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason)
var done bool
rolloutContext.WaitReady = true
rolloutContext.FinalizeReason = v1beta1.FinaliseReasonSuccess
done, err = r.doFinalising(rolloutContext)
if err != nil {
return nil, err
Expand All @@ -140,6 +143,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout
case v1alpha1.ProgressingReasonCancelling:
klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason)
var done bool
rolloutContext.FinalizeReason = v1beta1.FinaliseReasonRollback
done, err = r.doFinalising(rolloutContext)
if err != nil {
return nil, err
Expand Down
Loading

0 comments on commit 15b589d

Please sign in to comment.