Skip to content

Commit

Permalink
add failure threshold (#101)
Browse files Browse the repository at this point in the history
Signed-off-by: mingzhou.swx <[email protected]>

Signed-off-by: mingzhou.swx <[email protected]>
Co-authored-by: mingzhou.swx <[email protected]>
  • Loading branch information
veophi and mingzhou.swx authored Nov 16, 2022
1 parent 5924c72 commit 113527e
Show file tree
Hide file tree
Showing 14 changed files with 421 additions and 118 deletions.
6 changes: 6 additions & 0 deletions api/v1alpha1/batchrelease_plan_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ type ReleasePlan struct {
BatchPartition *int32 `json:"batchPartition,omitempty"`
// RolloutID indicates an id for each rollout progress
RolloutID string `json:"rolloutID,omitempty"`
// FailureThreshold indicates how many failed pods can be tolerated in all upgraded pods.
// Only when FailureThreshold is satisfied can Rollout enter the ready state.
// If FailureThreshold is nil, Rollout will use the MaxUnavailable of workload as its
// FailureThreshold.
// Defaults to nil.
FailureThreshold *intstr.IntOrString `json:"failureThreshold,omitempty"`
}

// ReleaseBatch is used to describe how each batch release should be
Expand Down
9 changes: 7 additions & 2 deletions api/v1alpha1/rollout_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,18 @@ type CanaryStrategy struct {
// TrafficRoutings hosts all the supported service meshes supported to enable more fine-grained traffic routing
// todo current only support one TrafficRouting
TrafficRoutings []*TrafficRouting `json:"trafficRoutings,omitempty"`
// FailureThreshold indicates how many failed pods can be tolerated in all upgraded pods.
// Only when FailureThreshold is satisfied can Rollout enter the ready state.
// If FailureThreshold is nil, Rollout will use the MaxUnavailable of workload as its
// FailureThreshold.
// Defaults to nil.
FailureThreshold *intstr.IntOrString `json:"failureThreshold,omitempty"`
// MetricsAnalysis *MetricsAnalysisBackground `json:"metricsAnalysis,omitempty"`
}

// CanaryStep defines a step of a canary workload.
type CanaryStep struct {
// SetWeight sets what percentage of the canary pods should receive

// Weight indicates what percentage of traffic the canary pods should receive
// +optional
Weight *int32 `json:"weight,omitempty"`
// Replicas is the number of expected canary pods in this batch
Expand Down
10 changes: 10 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 10 additions & 0 deletions config/crd/bases/rollouts.kruise.io_batchreleases.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,16 @@ spec:
- canaryReplicas
type: object
type: array
failureThreshold:
anyOf:
- type: integer
- type: string
description: FailureThreshold indicates how many failed pods can
be tolerated in all upgraded pods. Only when FailureThreshold
are satisfied, Rollout can enter ready state. If FailureThreshold
is nil, Rollout will use the MaxUnavailable of workload as its
FailureThreshold. Defaults to nil.
x-kubernetes-int-or-string: true
rolloutID:
description: RolloutID indicates an id for each rollout progress
type: string
Expand Down
12 changes: 12 additions & 0 deletions config/crd/bases/rollouts.kruise.io_rollouts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,16 @@ spec:
description: CanaryStrategy defines parameters for a Replica Based
Canary
properties:
failureThreshold:
anyOf:
- type: integer
- type: string
description: FailureThreshold indicates how many failed pods
can be tolerated in all upgraded pods. Only when FailureThreshold
are satisfied, Rollout can enter ready state. If FailureThreshold
is nil, Rollout will use the MaxUnavailable of workload
as its FailureThreshold. Defaults to nil.
x-kubernetes-int-or-string: true
steps:
description: Steps define the order of phases to execute release
in batches(20%, 40%, 60%, 80%, 100%)
Expand All @@ -117,6 +127,8 @@ spec:
5) or a percentage of total pods.'
x-kubernetes-int-or-string: true
weight:
description: Weight indicate how many percentage of
traffic the canary pods should receive
format: int32
type: integer
type: object
Expand Down
60 changes: 36 additions & 24 deletions pkg/controller/batchrelease/workloads/cloneset_control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,10 +158,14 @@ func (c *CloneSetRolloutController) UpgradeOneBatch() (bool, error) {
return false, nil
}

pods, err := util.ListOwnedPods(c.client, c.clone)
if err != nil {
klog.Errorf("Failed to list pods for CloneSet %v", c.targetNamespacedName)
return false, err
var err error
var pods []*v1.Pod
if c.release.Spec.ReleasePlan.RolloutID != "" {
pods, err = util.ListOwnedPods(c.client, c.clone)
if err != nil {
klog.Errorf("Failed to list pods for CloneSet %v", c.targetNamespacedName)
return false, err
}
}

var noNeedRollbackReplicas int32
Expand Down Expand Up @@ -228,9 +232,23 @@ func (c *CloneSetRolloutController) CheckOneBatchReady() (bool, error) {
return false, nil
}

rolloutID := c.release.Spec.ReleasePlan.RolloutID

var err error
var pods []*v1.Pod
// if rolloutID is not set, no need to list pods,
// because we cannot patch correct batch label to pod.
if rolloutID != "" {
pods, err = util.ListOwnedPods(c.client, c.clone)
if err != nil {
return false, err
}
}

var noNeedRollbackReplicas int32
if c.newStatus.CanaryStatus.NoNeedUpdateReplicas != nil {
noNeedRollbackReplicas = *c.newStatus.CanaryStatus.NoNeedUpdateReplicas
noNeedRollbackReplicas = countNoNeedRollbackReplicas(pods, c.newStatus.UpdateRevision, c.release.Spec.ReleasePlan.RolloutID)
c.newStatus.CanaryStatus.NoNeedUpdateReplicas = pointer.Int32(noNeedRollbackReplicas)
}

replicas := *c.clone.Spec.Replicas
Expand All @@ -241,41 +259,35 @@ func (c *CloneSetRolloutController) CheckOneBatchReady() (bool, error) {

// current batch id
currentBatch := c.newStatus.CanaryStatus.CurrentBatch
// the number of canary pods should have in current batch in plan
plannedUpdatedReplicas := c.calculateCurrentCanary(c.newStatus.ObservedWorkloadReplicas)
// the number of pods will be partitioned by cloneSet
partitionedStableReplicas, _ := intstr.GetValueFromIntOrPercent(c.clone.Spec.UpdateStrategy.Partition, int(replicas), true)
// the number of canary pods that consider rollback context and other real-world situations
expectedBatchCanaryReplicas := c.calculateCurrentCanary(replicas - noNeedRollbackReplicas)
expectedUpdatedReplicas := c.calculateCurrentCanary(replicas - noNeedRollbackReplicas)
// the number of stable pods that consider rollback context and other real-world situations
expectedBatchStableReplicas := replicas - noNeedRollbackReplicas - expectedBatchCanaryReplicas
expectedStableReplicas := replicas - noNeedRollbackReplicas - expectedUpdatedReplicas
// the number of canary pods that cloneSet will be upgraded
realNeedUpgradeCanaryReplicas := CalculateRealCanaryReplicasGoal(expectedBatchStableReplicas, replicas, &c.release.Spec.ReleasePlan.Batches[currentBatch].CanaryReplicas)

var maxUnavailableReplicas int
if c.clone.Spec.UpdateStrategy.MaxUnavailable != nil {
maxUnavailableReplicas, _ = intstr.GetValueFromIntOrPercent(c.clone.Spec.UpdateStrategy.MaxUnavailable, int(realNeedUpgradeCanaryReplicas), true)
}
realDesiredUpdatedReplicas := CalculateRealCanaryReplicasGoal(expectedStableReplicas, replicas, &c.release.Spec.ReleasePlan.Batches[currentBatch].CanaryReplicas)

klog.V(3).InfoS("check one batch, current info:",
"BatchRelease", c.releasePlanKey,
"currentBatch", currentBatch,
"replicas", replicas,
"updatedReplicas", updatedReplicas,
"noNeedRollbackReplicas", noNeedRollbackReplicas,
"maxUnavailableReplicas", maxUnavailableReplicas,
"partitionedStableReplicas", partitionedStableReplicas,
"expectedBatchCanaryReplicas", expectedBatchCanaryReplicas,
"expectedBatchStableReplicas", expectedBatchStableReplicas)
"expectedUpdatedReplicas", expectedUpdatedReplicas,
"realDesiredUpdatedReplicas", realDesiredUpdatedReplicas,
"expectedStableReplicas", expectedStableReplicas)

currentBatchIsReady := updatedReplicas >= realNeedUpgradeCanaryReplicas && // 1.the number of upgrade pods achieved the goal
updatedReadyReplicas+int32(maxUnavailableReplicas) >= realNeedUpgradeCanaryReplicas && // 2.the number of upgraded available pods achieved the goal
(realNeedUpgradeCanaryReplicas == 0 || updatedReadyReplicas >= 1) // 3.make sure that at least one upgrade pod is available

if !currentBatchIsReady {
klog.InfoS("the batch is not ready yet", "BatchRelease", c.releasePlanKey, "current-batch", c.newStatus.CanaryStatus.CurrentBatch)
if !isBatchReady(c.release, pods, c.clone.Spec.UpdateStrategy.MaxUnavailable,
plannedUpdatedReplicas, realDesiredUpdatedReplicas, updatedReplicas, updatedReadyReplicas) {
klog.Infof("BatchRelease(%v) batch is not ready yet, current batch=%d", klog.KObj(c.release), currentBatch)
return false, nil
}

c.recorder.Eventf(c.release, v1.EventTypeNormal, "BatchAvailable", "Batch %d is available", c.newStatus.CanaryStatus.CurrentBatch)
klog.Infof("BatchRelease(%v) batch is ready, current batch=%d", klog.KObj(c.release), currentBatch)
return true, nil
}

Expand Down Expand Up @@ -380,7 +392,7 @@ func (c *CloneSetRolloutController) recordCloneSetRevisionAndReplicas() {

func (c *CloneSetRolloutController) patchPodBatchLabel(pods []*v1.Pod, plannedBatchCanaryReplicas, expectedBatchStableReplicas int32) (bool, error) {
rolloutID := c.release.Spec.ReleasePlan.RolloutID
if rolloutID == "" {
if rolloutID == "" || len(pods) == 0 {
return true, nil
}

Expand Down
40 changes: 40 additions & 0 deletions pkg/controller/batchrelease/workloads/commons.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,43 @@ func getPodOrdinal(pod *corev1.Pod) int {
ord, _ := strconv.Atoi(pod.Name[strings.LastIndex(pod.Name, "-")+1:])
return ord
}

// failureThreshold resolves the number of failed pods that may be tolerated.
//
// The explicit threshold wins when it is set; otherwise maxUnavailable is used.
// When both are nil the result is 0. The chosen value is scaled against replicas
// via intstr.GetScaledValueFromIntOrPercent, and the result is clamped so it is
// never negative.
func failureThreshold(threshold, maxUnavailable *intstr.IntOrString, replicas int32) int32 {
	// Pick the value to scale: explicit threshold first, maxUnavailable as fallback.
	source := threshold
	if source == nil {
		source = maxUnavailable
	}
	if source == nil {
		return 0
	}

	// A parse error is deliberately ignored: the helper's returned value is
	// used as-is and then clamped to be non-negative.
	tolerated, _ := intstr.GetScaledValueFromIntOrPercent(source, int(replicas), true)
	return int32(integer.IntMax(0, tolerated))
}

// isBatchReady reports whether the current batch can be considered ready.
//
// A batch is ready when all of the following hold:
//  1. updated has reached desired;
//  2. updatedReady plus the tolerated failure count has reached desired;
//  3. desired is zero, or at least one updated pod is ready;
//  4. the pod batch-label check against labelDesired passes.
//
// The tolerated failure count comes from the release plan's FailureThreshold,
// falling back to maxUnavailable, scaled against updated.
func isBatchReady(release *v1alpha1.BatchRelease, pods []*corev1.Pod, maxUnavailable *intstr.IntOrString, labelDesired, desired, updated, updatedReady int32) bool {
	revision := release.Status.UpdateRevision

	// Some workloads, such as StatefulSet, may not report an updated-ready
	// count; in that case derive it from the listed pods.
	if updatedReady <= 0 {
		isUpdatedAndReady := func(pod *corev1.Pod) bool {
			return pod.DeletionTimestamp.IsZero() &&
				util.IsConsistentWithRevision(pod, revision) &&
				util.IsPodReady(pod)
		}
		updatedReady = int32(util.WrappedPodCount(pods, isUpdatedAndReady))
	}

	// 1. Enough pods have been upgraded.
	if updated < desired {
		return false
	}

	// 2. Enough upgraded pods are ready once tolerated failures are counted.
	tolerated := failureThreshold(release.Spec.ReleasePlan.FailureThreshold, maxUnavailable, updated)
	if updatedReady+tolerated < desired {
		return false
	}

	// 3. When anything is desired, at least one upgraded pod must be ready.
	if desired != 0 && updatedReady <= 0 {
		return false
	}

	// 4. Pods carry the expected batch label.
	return isPodBatchLabelSatisfied(pods, release.Spec.ReleasePlan.RolloutID, labelDesired)
}

// isPodBatchLabelSatisfied reports whether at least targetCount pods carry the
// given rolloutID in their util.RolloutIDLabel label.
//
// Pods with a non-zero deletion timestamp are not counted. The check is
// trivially satisfied when rolloutID is empty or no pods were supplied.
func isPodBatchLabelSatisfied(pods []*corev1.Pod, rolloutID string, targetCount int32) bool {
	if rolloutID == "" || len(pods) == 0 {
		return true
	}

	var labeled int32
	for i := range pods {
		pod := pods[i]
		if pod.DeletionTimestamp.IsZero() && pod.Labels[util.RolloutIDLabel] == rolloutID {
			labeled++
		}
	}
	return labeled >= targetCount
}
Loading

0 comments on commit 113527e

Please sign in to comment.