Skip to content

Commit

Permalink
refactor the grace system
Browse files Browse the repository at this point in the history
Signed-off-by: yunbo <[email protected]>
  • Loading branch information
Funinu committed Aug 7, 2024
1 parent 78273c2 commit c62d09a
Show file tree
Hide file tree
Showing 18 changed files with 890 additions and 346 deletions.
23 changes: 11 additions & 12 deletions pkg/controller/rollout/rollout_canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,19 +76,18 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
klog.Infof("rollout(%s/%s) canary step jumped", c.Rollout.Namespace, c.Rollout.Name)
return nil
}
gracePeriodSeconds := util.GracePeriodSecondsOrDefault(c.Rollout.Spec.Strategy.GetTrafficRouting(), defaultGracePeriodSeconds)
// When the first batch is trafficRouting rolling and the next steps are rolling release,
// We need to clean up the canary-related resources first and then rollout the rest of the batch.
currentStep := c.Rollout.Spec.Strategy.Canary.Steps[canaryStatus.CurrentStepIndex-1]
if currentStep.Traffic == nil && len(currentStep.Matches) == 0 {
tr := newTrafficRoutingContext(c)
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr, false)
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil {
return err
} else if !done {
klog.Infof("rollout(%s/%s) cleaning up canary-related resources", c.Rollout.Namespace, c.Rollout.Name)
expectedTime := time.Now().Add(time.Duration(gracePeriodSeconds) * time.Second)
expectedTime := time.Now().Add(tr.RecheckDuration)
c.RecheckTime = &expectedTime
return nil
}
Expand Down Expand Up @@ -116,11 +115,11 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
expectedReplicas, _ := intstr.GetScaledValueFromIntOrPercent(currentStep.Replicas, int(c.Workload.Replicas), true)
if expectedReplicas >= int(c.Workload.Replicas) && v1beta1.IsRealPartition(c.Rollout) {
klog.Infof("special case detected: rollout(%s/%s) restore stable Service", c.Rollout.Namespace, c.Rollout.Name)
done, err := m.trafficRoutingManager.RestoreStableService(tr)
retry, err := m.trafficRoutingManager.RestoreStableService(tr)
if err != nil {
return err
} else if !done {
expectedTime := time.Now().Add(time.Duration(gracePeriodSeconds) * time.Second)
} else if retry {
expectedTime := time.Now().Add(tr.RecheckDuration)
c.RecheckTime = &expectedTime
return nil
}
Expand All @@ -146,11 +145,11 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
if canaryStatus.CurrentStepIndex == 1 {
if !tr.DisableGenerateCanaryService {
klog.Infof("special case detected: rollout(%s/%s) patch stable Service", c.Rollout.Namespace, c.Rollout.Name)
done, err := m.trafficRoutingManager.PatchStableService(tr)
retry, err := m.trafficRoutingManager.PatchStableService(tr)
if err != nil {
return err
} else if !done {
expectedTime := time.Now().Add(time.Duration(gracePeriodSeconds) * time.Second)
} else if retry {
expectedTime := time.Now().Add(tr.RecheckDuration)
c.RecheckTime = &expectedTime
return nil
}
Expand Down Expand Up @@ -195,7 +194,7 @@ func (m *canaryReleaseManager) runCanary(c *RolloutContext) error {
klog.Infof("rollout(%s/%s) step(%d) state from(%s) -> to(%s)", c.Rollout.Namespace, c.Rollout.Name,
canaryStatus.CurrentStepIndex, v1beta1.CanaryStepStateTrafficRouting, canaryStatus.CurrentStepState)
}
expectedTime := time.Now().Add(time.Duration(gracePeriodSeconds) * time.Second)
expectedTime := time.Now().Add(tr.RecheckDuration)
c.RecheckTime = &expectedTime

case v1beta1.CanaryStepStateMetricsAnalysis:
Expand Down Expand Up @@ -369,7 +368,7 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro
}
tr := newTrafficRoutingContext(c)
// 2. remove stable service the pod revision selector, so stable service will be selector all version pods.
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr, true)
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil || !done {
return done, err
Expand All @@ -380,7 +379,7 @@ func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, erro
return done, err
}
// 4. modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods.
done, err = m.trafficRoutingManager.FinalisingTrafficRouting(tr, false)
done, err = m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil || !done {
return done, err
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/rollout/rollout_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ var (
Ingress: &v1beta1.IngressTrafficRouting{
Name: "echoserver",
},
GracePeriodSeconds: 0, // To facilitate testing, don't wait after traffic routing operation
},
},
},
Expand Down
22 changes: 4 additions & 18 deletions pkg/controller/rollout/rollout_progressing.go
Original file line number Diff line number Diff line change
Expand Up @@ -431,11 +431,6 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error)
if subStatus == nil {
return true, nil
}
gracePeriodSeconds := c.Rollout.Spec.Strategy.GetTrafficRouting()[0].GracePeriodSeconds
// To ensure respect for graceful time between these steps, we set start timer before the first step
if len(subStatus.FinalisingStep) == 0 {
subStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
}
tr := newTrafficRoutingContext(c)
klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep)
switch subStatus.FinalisingStep {
Expand All @@ -446,21 +441,11 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error)
// firstly, restore the gateway resources (ingress/gatewayAPI/Istio), that means
// only stable Service will accept the traffic
case v1beta1.FinalisingStepTypeGateway:
//TODO - RestoreGateway returns (bool, error) pair instead of error only.
// return (fasle, nil): gateway is patched successfully, but we need time to observe; recheck later
// return (true, nil): gateway is patched successfully, and accepts the update successfully; go to next step then
// return (false, error): gateway encounters error when patched, or the update is not accepted; recheck later
err := r.trafficRoutingManager.RestoreGateway(tr)
if err != nil {
retry, err := r.trafficRoutingManager.RestoreGateway(tr)
if err != nil || retry {
subStatus.LastUpdateTime = tr.LastUpdateTime
return false, err
}
// usually, GracePeriodSeconds means duration to wait after an operation is done,
// we use defaultGracePeriodSeconds+1 here because the timer started before the RestoreGateway step
if subStatus.LastUpdateTime != nil && time.Since(subStatus.LastUpdateTime.Time) < time.Second*time.Duration(gracePeriodSeconds+1) {
klog.Infof("rollout(%s/%s) in step (%s), and wait %d seconds", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep, gracePeriodSeconds+1)
return false, nil
}
klog.Infof("rollout(%s/%s) in step (%s), and success", c.Rollout.Namespace, c.Rollout.Name, subStatus.FinalisingStep)
subStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
subStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR
Expand Down Expand Up @@ -490,7 +475,8 @@ func (r *RolloutReconciler) doProgressingReset(c *RolloutContext) (bool, error)
first step of v3 release.
*/
case v1beta1.FinalisingStepTypeDeleteCanaryService:
err := r.trafficRoutingManager.RemoveCanaryService(tr)
// ignore the grace period because it is the last step
_, err := r.trafficRoutingManager.RemoveCanaryService(tr)
if err != nil {
subStatus.LastUpdateTime = tr.LastUpdateTime
return false, err
Expand Down
10 changes: 7 additions & 3 deletions pkg/controller/rollout/rollout_progressing_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -533,7 +533,7 @@ func TestReconcileRolloutProgressing(t *testing.T) {
},
},
{
name: "ReconcileRolloutProgressing rolling -> continueRelease1",
name: "ReconcileRolloutProgressing rolling -> continueRelease1", // add grace time to test the first step: restoring the gateway
getObj: func() ([]*apps.Deployment, []*apps.ReplicaSet) {
dep1 := deploymentDemo.DeepCopy()
dep1.Spec.Template.Spec.Containers[0].Image = "echoserver:v3"
Expand All @@ -558,10 +558,14 @@ func TestReconcileRolloutProgressing(t *testing.T) {
return []*apps.Deployment{dep1, dep2}, []*apps.ReplicaSet{rs1, rs2}
},
getNetwork: func() ([]*corev1.Service, []*netv1.Ingress) {
return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{demoIngress.DeepCopy()}
c1 := demoIngress.DeepCopy()
c2 := demoIngress.DeepCopy()
c2.Name = c2.Name + "-canary"
return []*corev1.Service{demoService.DeepCopy()}, []*netv1.Ingress{c1, c2}
},
getRollout: func() (*v1beta1.Rollout, *v1beta1.BatchRelease, *v1alpha1.TrafficRouting) {
obj := rolloutDemo.DeepCopy()
obj.Spec.Strategy.Canary.TrafficRoutings[0].GracePeriodSeconds = 1 // add grace time to test fine step in continuous logic
obj.Status.CanaryStatus.ObservedWorkloadGeneration = 2
obj.Status.CanaryStatus.RolloutHash = "f55bvd874d5f2fzvw46bv966x4bwbdv4wx6bd9f7b46ww788954b8z8w29b7wxfd"
obj.Status.CanaryStatus.StableRevision = "pod-template-hash-v1"
Expand Down Expand Up @@ -635,7 +639,7 @@ func TestReconcileRolloutProgressing(t *testing.T) {
obj.Status.CanaryStatus.CanaryReadyReplicas = 3
obj.Status.CanaryStatus.PodTemplateHash = "pod-template-hash-v2"
obj.Status.CanaryStatus.CurrentStepState = v1beta1.CanaryStepStateUpgrade
obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteCanaryService
obj.Status.CanaryStatus.FinalisingStep = v1beta1.FinalisingStepTypeDeleteBR
cond := util.GetRolloutCondition(obj.Status, v1beta1.RolloutConditionProgressing)
cond.Reason = v1alpha1.ProgressingReasonInRolling
util.SetRolloutCondition(&obj.Status, *cond)
Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/trafficrouting/trafficrouting_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,13 @@ func (r *TrafficRoutingReconciler) Reconcile(ctx context.Context, req ctrl.Reque
done, err = r.trafficRoutingManager.DoTrafficRouting(newTrafficRoutingContext(tr))
}
case v1alpha1.TrafficRoutingPhaseFinalizing:
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr), false)
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr))
if done {
newStatus.Phase = v1alpha1.TrafficRoutingPhaseHealthy
newStatus.Message = "TrafficRouting is Healthy"
}
case v1alpha1.TrafficRoutingPhaseTerminating:
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr), false)
done, err = r.trafficRoutingManager.FinalisingTrafficRouting(newTrafficRoutingContext(tr))
if done {
// remove trafficRouting finalizer
err = r.handleFinalizer(tr)
Expand Down
Loading

0 comments on commit c62d09a

Please sign in to comment.