Skip to content

Commit

Permalink
allow set tolerate failed pods for each step
Browse files Browse the repository at this point in the history
Signed-off-by: mingzhou.swx <[email protected]>
  • Loading branch information
mingzhou.swx committed Nov 3, 2022
1 parent 5924c72 commit 8650229
Show file tree
Hide file tree
Showing 13 changed files with 336 additions and 45 deletions.
16 changes: 16 additions & 0 deletions api/v1alpha1/batchrelease_plan_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ type ReleasePlan struct {
BatchPartition *int32 `json:"batchPartition,omitempty"`
// RolloutID indicates an id for each rollout progress
RolloutID string `json:"rolloutID,omitempty"`
// FailureThreshold indicates how many failed pods can be tolerated in all upgraded pods.
// Only when both batch FailureThreshold and Canary.FailureThreshold are satisfied, Rollout
// can enter ready state.
// If FailureThreshold is nil, Rollout will use the MaxUnavailable of workload as its
// FailureThreshold.
// Defaults to nil.
FailureThreshold *intstr.IntOrString `json:"failureThreshold,omitempty"`
}

// ReleaseBatch is used to describe how each batch release should be
Expand All @@ -52,6 +59,15 @@ type ReleaseBatch struct {
// it can be an absolute number (ex: 5) or a percentage of workload replicas.
// batches[i].canaryReplicas should be less than or equal to batches[j].canaryReplicas if i < j.
CanaryReplicas intstr.IntOrString `json:"canaryReplicas"`
// FailureThreshold indicates how many failed pods can be tolerated in the pods upgraded in current batch.
// For example, assume that 10 pods are upgraded *in this batch*, and FailureThreshold=20%,
// this batch can enter ready state if at least 8 upgraded pods are ready.
// Only when both batch FailureThreshold and canary.FailureThreshold are satisfied
// can Rollout enter ready state and proceed to the next batch.
// This FailureThreshold will take effect only when RolloutID is set.
// Defaults to nil.
// +optional
FailureThreshold *intstr.IntOrString `json:"failureThreshold,omitempty"`
}

// BatchReleaseStatus defines the observed state of a release plan
Expand Down
19 changes: 17 additions & 2 deletions api/v1alpha1/rollout_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,19 @@ type CanaryStrategy struct {
// TrafficRoutings hosts all the supported service meshes to enable more fine-grained traffic routing
// TODO: currently only one TrafficRouting is supported
TrafficRoutings []*TrafficRouting `json:"trafficRoutings,omitempty"`
// FailureThreshold indicates how many failed pods can be tolerated in all upgraded pods.
// Only when both batch FailureThreshold and Canary.FailureThreshold are satisfied, Rollout
// can enter ready state.
// If FailureThreshold is nil, Rollout will use the MaxUnavailable of workload as its
// FailureThreshold.
// Defaults to nil.
FailureThreshold *intstr.IntOrString `json:"failureThreshold,omitempty"`
// MetricsAnalysis *MetricsAnalysisBackground `json:"metricsAnalysis,omitempty"`
}

// CanaryStep defines a step of a canary workload.
type CanaryStep struct {
// SetWeight sets what percentage of the canary pods should receive

// Weight indicates the percentage of traffic the canary pods should receive
// +optional
Weight *int32 `json:"weight,omitempty"`
// Replicas is the number of expected canary pods in this batch
Expand All @@ -111,6 +117,15 @@ type CanaryStep struct {
// Pause defines a pause stage for a rollout, manual or auto
// +optional
Pause RolloutPause `json:"pause,omitempty"`
// FailureThreshold indicates how many failed pods can be tolerated in the pods upgraded in current batch.
// For example, assume that 10 pods are upgraded *in this batch*, and FailureThreshold=20%,
// this batch can enter ready state if at least 8 upgraded pods are ready.
// Only when both batch FailureThreshold and canary.FailureThreshold are satisfied
// can Rollout enter ready state and proceed to the next batch.
// This FailureThreshold will take effect only when RolloutID is set.
// Defaults to nil.
// +optional
FailureThreshold *intstr.IntOrString `json:"failureThreshold,omitempty"`
// MetricsAnalysis *RolloutAnalysis `json:"metricsAnalysis,omitempty"`
}

Expand Down
24 changes: 23 additions & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions config/crd/bases/rollouts.kruise.io_batchreleases.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,35 @@ spec:
should less than or equal to batches[j].canaryReplicas
if i < j.'
x-kubernetes-int-or-string: true
failureThreshold:
anyOf:
- type: integer
- type: string
description: FailureThreshold indicates how many failed
pods can be tolerated in the pods upgraded in current
batch. For example, assume that 10 pods are upgraded *in
this batch*, and FailureThreshold=20%, this batch can
enter ready state if at least 8 upgraded pods are ready.
Only when both batch FailureThreshold and canary.FailureThreshold
are satisfied, Rollout can enter ready state the next
batch. This FailureThreshold will take effect only when
RolloutID is set. Default to nil.
x-kubernetes-int-or-string: true
required:
- canaryReplicas
type: object
type: array
failureThreshold:
anyOf:
- type: integer
- type: string
description: FailureThreshold indicates how many failed pods can
be tolerated in all upgraded pods. Only when both batch FailureThreshold
and Canary.FailureThreshold are satisfied, Rollout can enter
ready state. If FailureThreshold is nil, Rollout will use the
MaxUnavailable of workload as its FailureThreshold. Defaults
to nil.
x-kubernetes-int-or-string: true
rolloutID:
description: RolloutID indicates an id for each rollout progress
type: string
Expand Down
28 changes: 28 additions & 0 deletions config/crd/bases/rollouts.kruise.io_rollouts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,38 @@ spec:
description: CanaryStrategy defines parameters for a Replica Based
Canary
properties:
failureThreshold:
anyOf:
- type: integer
- type: string
description: FailureThreshold indicates how many failed pods
can be tolerated in all upgraded pods. Only when both batch
FailureThreshold and Canary.FailureThreshold are satisfied,
Rollout can enter ready state. If FailureThreshold is nil,
Rollout will use the MaxUnavailable of workload as its FailureThreshold.
Defaults to nil.
x-kubernetes-int-or-string: true
steps:
description: Steps define the order of phases to execute release
in batches(20%, 40%, 60%, 80%, 100%)
items:
description: CanaryStep defines a step of a canary workload.
properties:
failureThreshold:
anyOf:
- type: integer
- type: string
description: FailureThreshold indicates how many failed
pods can be tolerated in the pods upgraded in current
batch. For example, assume that 10 pods are upgraded
*in this batch*, and FailureThreshold=20%, this batch
can enter ready state if at least 8 upgraded pods
are ready. Only when both batch FailureThreshold and
canary.FailureThreshold are satisfied, Rollout can
enter ready state the next batch. This FailureThreshold
will take effect only when RolloutID is set. Default
to nil.
x-kubernetes-int-or-string: true
pause:
description: Pause defines a pause stage for a rollout,
manual or auto
Expand All @@ -117,6 +143,8 @@ spec:
5) or a percentage of total pods.'
x-kubernetes-int-or-string: true
weight:
description: Weight indicates the percentage of
traffic the canary pods should receive
format: int32
type: integer
type: object
Expand Down
49 changes: 36 additions & 13 deletions pkg/controller/batchrelease/workloads/cloneset_control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,14 @@ func (c *CloneSetRolloutController) UpgradeOneBatch() (bool, error) {
return false, nil
}

pods, err := util.ListOwnedPods(c.client, c.clone)
if err != nil {
klog.Errorf("Failed to list pods for CloneSet %v", c.targetNamespacedName)
return false, err
var err error
var pods []*v1.Pod
if c.release.Spec.ReleasePlan.RolloutID != "" {
pods, err = util.ListOwnedPods(c.client, c.clone)
if err != nil {
klog.Errorf("Failed to list pods for CloneSet %v", c.targetNamespacedName)
return false, err
}
}

var noNeedRollbackReplicas int32
Expand Down Expand Up @@ -228,9 +232,21 @@ func (c *CloneSetRolloutController) CheckOneBatchReady() (bool, error) {
return false, nil
}

rolloutID := c.release.Spec.ReleasePlan.RolloutID

var err error
var pods []*v1.Pod
if rolloutID != "" {
pods, err = util.ListOwnedPods(c.client, c.clone)
if err != nil {
return false, err
}
}

var noNeedRollbackReplicas int32
if c.newStatus.CanaryStatus.NoNeedUpdateReplicas != nil {
noNeedRollbackReplicas = *c.newStatus.CanaryStatus.NoNeedUpdateReplicas
noNeedRollbackReplicas = countNoNeedRollbackReplicas(pods, c.newStatus.UpdateRevision, c.release.Spec.ReleasePlan.RolloutID)
c.newStatus.CanaryStatus.NoNeedUpdateReplicas = pointer.Int32(noNeedRollbackReplicas)
}

replicas := *c.clone.Spec.Replicas
Expand All @@ -241,6 +257,8 @@ func (c *CloneSetRolloutController) CheckOneBatchReady() (bool, error) {

// current batch id
currentBatch := c.newStatus.CanaryStatus.CurrentBatch
// the number of canary pods should have in current batch in plan
plannedBatchCanaryReplicas := c.calculateCurrentCanary(c.newStatus.ObservedWorkloadReplicas)
// the number of pods will be partitioned by cloneSet
partitionedStableReplicas, _ := intstr.GetValueFromIntOrPercent(c.clone.Spec.UpdateStrategy.Partition, int(replicas), true)
// the number of canary pods that consider rollback context and other real-world situations
Expand All @@ -249,25 +267,30 @@ func (c *CloneSetRolloutController) CheckOneBatchReady() (bool, error) {
expectedBatchStableReplicas := replicas - noNeedRollbackReplicas - expectedBatchCanaryReplicas
// the number of canary pods that cloneSet will be upgraded
realNeedUpgradeCanaryReplicas := CalculateRealCanaryReplicasGoal(expectedBatchStableReplicas, replicas, &c.release.Spec.ReleasePlan.Batches[currentBatch].CanaryReplicas)

var maxUnavailableReplicas int
if c.clone.Spec.UpdateStrategy.MaxUnavailable != nil {
maxUnavailableReplicas, _ = intstr.GetValueFromIntOrPercent(c.clone.Spec.UpdateStrategy.MaxUnavailable, int(realNeedUpgradeCanaryReplicas), true)
}
// the number of not-ready pods that can be tolerated.
failureThreshold := util.FailureThreshold(
c.release.Spec.ReleasePlan.FailureThreshold,
c.release.Spec.ReleasePlan.Batches[currentBatch].FailureThreshold,
c.clone.Spec.UpdateStrategy.MaxUnavailable,
c.clone.Status.UpdatedReplicas, currentBatch, rolloutID, pods)

klog.V(3).InfoS("check one batch, current info:",
"BatchRelease", c.releasePlanKey,
"currentBatch", currentBatch,
"replicas", replicas,
"updatedReplicas", updatedReplicas,
"failureThreshold", failureThreshold,
"noNeedRollbackReplicas", noNeedRollbackReplicas,
"maxUnavailableReplicas", maxUnavailableReplicas,
"partitionedStableReplicas", partitionedStableReplicas,
"expectedBatchCanaryReplicas", expectedBatchCanaryReplicas,
"expectedBatchStableReplicas", expectedBatchStableReplicas)

if !util.IsPodBatchLabelSatisfied(pods, rolloutID, plannedBatchCanaryReplicas) {
return false, nil
}

currentBatchIsReady := updatedReplicas >= realNeedUpgradeCanaryReplicas && // 1.the number of upgrade pods achieved the goal
updatedReadyReplicas+int32(maxUnavailableReplicas) >= realNeedUpgradeCanaryReplicas && // 2.the number of upgraded available pods achieved the goal
updatedReadyReplicas+failureThreshold >= realNeedUpgradeCanaryReplicas && // 2.the number of upgraded available pods achieved the goal
(realNeedUpgradeCanaryReplicas == 0 || updatedReadyReplicas >= 1) // 3.make sure that at least one upgrade pod is available

if !currentBatchIsReady {
Expand Down Expand Up @@ -380,7 +403,7 @@ func (c *CloneSetRolloutController) recordCloneSetRevisionAndReplicas() {

func (c *CloneSetRolloutController) patchPodBatchLabel(pods []*v1.Pod, plannedBatchCanaryReplicas, expectedBatchStableReplicas int32) (bool, error) {
rolloutID := c.release.Spec.ReleasePlan.RolloutID
if rolloutID == "" {
if rolloutID == "" || len(pods) == 0 {
return true, nil
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,12 @@ func (c *DeploymentsRolloutController) CheckOneBatchReady() (bool, error) {
// canary goal that should have in current batch
canaryGoal := c.calculateCurrentCanary(c.newStatus.ObservedWorkloadReplicas)
// max unavailable allowed replicas
maxUnavailable := 0
if c.canary.Spec.Strategy.RollingUpdate != nil &&
failureThreshold := 0
if c.release.Spec.ReleasePlan.FailureThreshold != nil {
failureThreshold, _ = intstr.GetScaledValueFromIntOrPercent(c.release.Spec.ReleasePlan.FailureThreshold, int(*c.canary.Spec.Replicas), true)
} else if c.canary.Spec.Strategy.RollingUpdate != nil &&
c.canary.Spec.Strategy.RollingUpdate.MaxUnavailable != nil {
maxUnavailable, _ = intstr.GetScaledValueFromIntOrPercent(c.canary.Spec.Strategy.RollingUpdate.MaxUnavailable, int(*c.canary.Spec.Replicas), true)
failureThreshold, _ = intstr.GetScaledValueFromIntOrPercent(c.canary.Spec.Strategy.RollingUpdate.MaxUnavailable, int(*c.canary.Spec.Replicas), true)
}

klog.InfoS("checking the batch releasing progress",
Expand All @@ -156,13 +158,13 @@ func (c *DeploymentsRolloutController) CheckOneBatchReady() (bool, error) {
"canary-goal", canaryGoal,
"canary-available-pod-count", availableCanaryPodCount,
"stable-pod-status-replicas", c.stable.Status.Replicas,
"maxUnavailable", maxUnavailable)
"failureThreshold", failureThreshold)

currentBatchIsNotReadyYet := func() bool {
// the number of upgrade pods does not achieve the goal
return canaryPodCount < canaryGoal ||
// the number of upgraded available pods does not achieve the goal
availableCanaryPodCount+int32(maxUnavailable) < canaryGoal ||
availableCanaryPodCount+int32(failureThreshold) < canaryGoal ||
// make sure that at least one upgrade pod is available
(canaryGoal > 0 && availableCanaryPodCount == 0)
}
Expand Down
Loading

0 comments on commit 8650229

Please sign in to comment.