Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: rollback to stable with dynamicStableScale could overwhelm stable pods #3077

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions rollout/bluegreen.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,10 +220,9 @@
annotationedRSs := int32(0)
rolloutReplicas := defaults.GetReplicasOrDefault(c.rollout.Spec.Replicas)
for _, targetRS := range oldRSs {
if replicasetutil.IsStillReferenced(c.rollout.Status, targetRS) {
// We should technically never get here because we shouldn't be passing a replicaset list
// which includes referenced ReplicaSets. But we check just in case
c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name)
if c.isReplicaSetReferenced(targetRS) {
// We might get here if user interrupted an update in order to move back to stable.
c.log.Infof("Skip scale down of older RS '%s': still referenced", targetRS.Name)

Check warning on line 225 in rollout/bluegreen.go

View check run for this annotation

Codecov / codecov/patch

rollout/bluegreen.go#L224-L225

Added lines #L224 - L225 were not covered by tests
continue
}
if *targetRS.Spec.Replicas == 0 {
Expand Down
49 changes: 24 additions & 25 deletions rollout/canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,9 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli

annotationedRSs := int32(0)
for _, targetRS := range oldRSs {
if replicasetutil.IsStillReferenced(c.rollout.Status, targetRS) {
// We should technically never get here because we shouldn't be passing a replicaset list
// which includes referenced ReplicaSets. But we check just in case
c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name)
if c.isReplicaSetReferenced(targetRS) {
// We might get here if user interrupted an update in order to move back to stable.
c.log.Infof("Skip scale down of older RS '%s': still referenced", targetRS.Name)
continue
}
if maxScaleDown <= 0 {
Expand Down Expand Up @@ -220,15 +219,8 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli
// and doesn't yet have scale down deadline. This happens when a user changes their
// mind in the middle of an V1 -> V2 update, and then applies a V3. We are deciding
// what to do with the defunct, intermediate V2 ReplicaSet right now.
if !c.replicaSetReferencedByCanaryTraffic(targetRS) {
// It is safe to scale the intermediate RS down, if no traffic is directed to it.
c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name)
} else {
c.log.Infof("Skip scaling down intermediate RS '%s': still referenced by service", targetRS.Name)
// This ReplicaSet is still referenced by the service. It is not safe to scale
// this down.
continue
}
// It is safe to scale the intermediate RS down, since no traffic is directed to it.
c.log.Infof("scaling down intermediate RS '%s'", targetRS.Name)
}
}
if *targetRS.Spec.Replicas == desiredReplicaCount {
Expand All @@ -248,19 +240,26 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli
return totalScaledDown, nil
}

func (c *rolloutContext) replicaSetReferencedByCanaryTraffic(rs *appsv1.ReplicaSet) bool {
rsPodHash := replicasetutil.GetPodTemplateHash(rs)
ro := c.rollout

if ro.Status.Canary.Weights == nil {
return false
}

if ro.Status.Canary.Weights.Canary.PodTemplateHash == rsPodHash || ro.Status.Canary.Weights.Stable.PodTemplateHash == rsPodHash {
return true
// isDynamicallyRollingBackToStable returns true if we were in the middle of a canary update with
// dynamic stable scaling, but were interrupted and are now rolling back to the stable RS. This is
// similar to, but different than, aborting. With abort, desired hash != stable hash and so we know
// the two hashes to balance traffic against. But when dynamically rolling back to stable, the
// desired hash == stable hash, and so we must use the *previous* desired hash and balance traffic
// between previous desired vs. stable hash, in order to safely shift traffic back to stable.
// This function also returns the previous desired hash (where traffic is currently weighted to).
func isDynamicallyRollingBackToStable(ro *v1alpha1.Rollout, desiredRS *appsv1.ReplicaSet) (bool, string) {
	// Only relevant when fully promoted (desired == stable) with traffic routing and
	// dynamic stable scaling enabled.
	if !rolloututil.IsFullyPromoted(ro) || ro.Spec.Strategy.Canary.TrafficRouting == nil || !ro.Spec.Strategy.Canary.DynamicStableScale {
		return false, ""
	}
	// Without recorded traffic weights there is no previous desired hash to shift away from.
	if ro.Status.Canary.Weights == nil {
		return false, ""
	}
	currSelector := ro.Status.Canary.Weights.Canary.PodTemplateHash
	desiredSelector := replicasetutil.GetPodTemplateHash(desiredRS)
	if currSelector == desiredSelector {
		// Traffic is already pointed at the desired (stable) ReplicaSet.
		return false, ""
	}
	// Traffic is still weighted to the previous desired RS. Only treat this as a dynamic
	// rollback while the desired (stable) RS is not yet fully available; once it is, traffic
	// can be shifted to it entirely.
	if desiredRS.Status.AvailableReplicas >= *ro.Spec.Replicas {
		return false, ""
	}
	return true, currSelector
}

// canProceedWithScaleDownAnnotation returns whether or not it is safe to proceed with annotating
Expand Down
118 changes: 118 additions & 0 deletions rollout/canary_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"time"

"github.com/stretchr/testify/assert"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -1890,3 +1891,120 @@ func TestHandleCanaryAbort(t *testing.T) {
assert.JSONEq(t, calculatePatch(r1, fmt.Sprintf(expectedPatch, newConditions)), patch)
})
}

func TestIsDynamicallyRollingBackToStable(t *testing.T) {
	// newRSWithHashAndReplicas builds a minimal ReplicaSet carrying only the fields
	// isDynamicallyRollingBackToStable reads: the pod-template-hash label and
	// Status.AvailableReplicas.
	newRSWithHashAndReplicas := func(hash string, available int32) *appsv1.ReplicaSet {
		return &appsv1.ReplicaSet{
			ObjectMeta: metav1.ObjectMeta{
				Labels: map[string]string{
					v1alpha1.DefaultRolloutUniqueLabelKey: hash,
				},
			},
			Status: v1.ReplicaSetStatus{
				AvailableReplicas: available,
			},
		}
	}

	testCases := []struct {
		name                         string
		status                       v1alpha1.RolloutStatus
		trafficRoutingDisabled       bool
		dynamicStableScalingDisabled bool
		rsHash                       string
		rsAvailableReplicas          *int32 // if nil, will set to rollout replicas (fully available)
		trafficWeights               *v1alpha1.TrafficWeights
		expectedResult               bool
	}{
		{
			name:           "desired RS != stable RS",
			status:         v1alpha1.RolloutStatus{CurrentPodHash: "abc123", StableRS: "def456"},
			rsHash:         "",
			expectedResult: false,
		},
		{
			name:                   "not using traffic routing",
			trafficRoutingDisabled: true,
			status:                 v1alpha1.RolloutStatus{CurrentPodHash: "abc123", StableRS: "abc123"},
			rsHash:                 "",
			expectedResult:         false,
		},
		{
			name:                         "not using dynamicStableScaling",
			dynamicStableScalingDisabled: true,
			status:                       v1alpha1.RolloutStatus{CurrentPodHash: "abc123", StableRS: "abc123"},
			rsHash:                       "",
			expectedResult:               false,
		},
		{
			name: "weighted selector == desired RS",
			status: v1alpha1.RolloutStatus{
				CurrentPodHash: "abc123",
				StableRS:       "abc123",
				Canary: v1alpha1.CanaryStatus{
					Weights: &v1alpha1.TrafficWeights{
						Canary: v1alpha1.WeightDestination{
							PodTemplateHash: "abc123",
						},
					},
				},
			},
			rsHash:         "abc123",
			expectedResult: false,
		},
		{
			name: "weighted selector != desired RS, desired not fully available",
			status: v1alpha1.RolloutStatus{
				CurrentPodHash: "abc123",
				StableRS:       "abc123",
				Canary: v1alpha1.CanaryStatus{
					Weights: &v1alpha1.TrafficWeights{
						Canary: v1alpha1.WeightDestination{
							PodTemplateHash: "def456",
						},
					},
				},
			},
			rsHash:              "abc123",
			rsAvailableReplicas: pointer.Int32(1),
			expectedResult:      true,
		},
		{
			// A fully available desired RS means traffic can be shifted to it entirely,
			// so this is not a dynamic rollback.
			name: "weighted selector != desired RS, desired RS is fully available",
			status: v1alpha1.RolloutStatus{
				CurrentPodHash: "abc123",
				StableRS:       "abc123",
				Canary: v1alpha1.CanaryStatus{
					Weights: &v1alpha1.TrafficWeights{
						Canary: v1alpha1.WeightDestination{
							PodTemplateHash: "def456",
						},
					},
				},
			},
			rsHash:         "abc123",
			expectedResult: false,
		},
	}
	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			ro := newCanaryRollout("test", 10, nil, nil, nil, intstr.FromInt(0), intstr.FromInt(1))
			if !tc.trafficRoutingDisabled {
				ro.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{}
			}
			if !tc.dynamicStableScalingDisabled {
				ro.Spec.Strategy.Canary.DynamicStableScale = true
			}
			ro.Status = tc.status

			desiredRS := newRSWithHashAndReplicas(tc.rsHash, 1)
			if tc.rsAvailableReplicas != nil {
				desiredRS.Status.AvailableReplicas = *tc.rsAvailableReplicas
			} else {
				// Default to a fully available RS, per the field comment above.
				desiredRS.Status.AvailableReplicas = *ro.Spec.Replicas
			}

			rbToStable, _ := isDynamicallyRollingBackToStable(ro, desiredRS)

			assert.Equal(t, tc.expectedResult, rbToStable)
		})
	}
}
55 changes: 55 additions & 0 deletions rollout/replicaset.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
"time"

appsv1 "k8s.io/api/apps/v1"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
patchtypes "k8s.io/apimachinery/pkg/types"
Expand All @@ -15,6 +16,7 @@
"github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1"
"github.com/argoproj/argo-rollouts/utils/defaults"
replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset"
serviceutil "github.com/argoproj/argo-rollouts/utils/service"
timeutil "github.com/argoproj/argo-rollouts/utils/time"
)

Expand Down Expand Up @@ -296,3 +298,56 @@

return annotationedRSs, desiredReplicaCount, nil
}

// isReplicaSetReferenced returns if the given ReplicaSet is still being referenced by any of
// the current, stable, blue-green services. Used to determine if the ReplicaSet can
// safely be scaled to zero, or deleted.
func (c *rolloutContext) isReplicaSetReferenced(rs *appsv1.ReplicaSet) bool {
rsPodHash := replicasetutil.GetPodTemplateHash(rs)
if rsPodHash == "" {
return false
}
ro := c.rollout
referencesToCheck := []string{
ro.Status.StableRS,
ro.Status.CurrentPodHash,
ro.Status.BlueGreen.ActiveSelector,
ro.Status.BlueGreen.PreviewSelector,
}
if ro.Status.Canary.Weights != nil {
referencesToCheck = append(referencesToCheck, ro.Status.Canary.Weights.Canary.PodTemplateHash, ro.Status.Canary.Weights.Stable.PodTemplateHash)
}
for _, ref := range referencesToCheck {
if ref == rsPodHash {
return true
}
}

// The above are static, lightweight checks to see if the selectors we record in our status are
// still referencing the ReplicaSet in question. Those checks aren't always enough. Next, we do
// a deeper check to look up the actual service objects, and see if they are still referencing
// the ReplicaSet. If so, we cannot scale it down.
var servicesToCheck []string
if ro.Spec.Strategy.Canary != nil {
servicesToCheck = []string{ro.Spec.Strategy.Canary.CanaryService, ro.Spec.Strategy.Canary.StableService}
} else {
servicesToCheck = []string{ro.Spec.Strategy.BlueGreen.ActiveService, ro.Spec.Strategy.BlueGreen.PreviewService}
}
for _, svcName := range servicesToCheck {
if svcName == "" {
continue
}
svc, err := c.servicesLister.Services(c.rollout.Namespace).Get(svcName)
if err != nil {
if k8serrors.IsNotFound(err) {
// service doesn't exist
continue
}
return true

Check warning on line 346 in rollout/replicaset.go

View check run for this annotation

Codecov / codecov/patch

rollout/replicaset.go#L346

Added line #L346 was not covered by tests
}
if serviceutil.GetRolloutSelectorLabel(svc) == rsPodHash {
return true
}
}
return false
}
Loading
Loading