Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: reset the progress condition when a pod is restarted #1649

Merged
54 changes: 54 additions & 0 deletions rollout/bluegreen_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,60 @@ func newBlueGreenRollout(name string, replicas int, revisionHistoryLimit *int32,
return rollout
}

func TestBlueGreenComplateRolloutRestart(t *testing.T) {
f := newFixture(t)
defer f.Close()

r := newBlueGreenRollout("foo", 1, nil, "active", "preview")
r.Status.Conditions = []v1alpha1.RolloutCondition{}

completedCond := conditions.NewRolloutCondition(v1alpha1.RolloutCompleted, corev1.ConditionTrue, conditions.RolloutCompletedReason, conditions.RolloutCompletedReason)
conditions.SetRolloutCondition(&r.Status, *completedCond)

f.rolloutLister = append(f.rolloutLister, r)
f.objects = append(f.objects, r)
previewSvc := newService("preview", 80, nil, r)
activeSvc := newService("active", 80, nil, r)
f.kubeobjects = append(f.kubeobjects, previewSvc, activeSvc)
f.serviceLister = append(f.serviceLister, activeSvc, previewSvc)

rs := newReplicaSet(r, 1)
rsPodHash := rs.Labels[v1alpha1.DefaultRolloutUniqueLabelKey]
generatedConditions := generateConditionsPatchWithComplete(false, conditions.ReplicaSetNotAvailableReason, rs, false, "", false)

f.expectCreateReplicaSetAction(rs)
servicePatchIndex := f.expectPatchServiceAction(previewSvc, rsPodHash)
f.expectUpdateReplicaSetAction(rs) // scale up RS
updatedRolloutIndex := f.expectUpdateRolloutStatusAction(r)
expectedPatchWithoutSubs := `{
"status":{
"blueGreen" : {
"previewSelector": "%s"
},
"conditions": %s,
"selector": "foo=bar",
"stableRS": "%s",
"phase": "Progressing",
"message": "more replicas need to be updated"
}
}`
expectedPatch := calculatePatch(r, fmt.Sprintf(expectedPatchWithoutSubs, rsPodHash, generatedConditions, rsPodHash))
patchRolloutIndex := f.expectPatchRolloutActionWithPatch(r, expectedPatch)
f.run(getKey(r, t))

f.verifyPatchedService(servicePatchIndex, rsPodHash, "")

updatedRollout := f.getUpdatedRollout(updatedRolloutIndex)
updatedProgressingCondition := conditions.GetRolloutCondition(updatedRollout.Status, v1alpha1.RolloutProgressing)
assert.NotNil(t, updatedProgressingCondition)
assert.Equal(t, conditions.NewReplicaSetReason, updatedProgressingCondition.Reason)
assert.Equal(t, corev1.ConditionTrue, updatedProgressingCondition.Status)
assert.Equal(t, fmt.Sprintf(conditions.NewReplicaSetMessage, rs.Name), updatedProgressingCondition.Message)

patch := f.getPatchedRollout(patchRolloutIndex)
assert.Equal(t, expectedPatch, patch)
}

func TestBlueGreenCreatesReplicaSet(t *testing.T) {
f := newFixture(t)
defer f.Close()
Expand Down
3 changes: 3 additions & 0 deletions rollout/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,9 @@ func newProgressingCondition(reason string, resourceObj runtime.Object, optional
if reason == conditions.NewRSAvailableReason {
msg = fmt.Sprintf(conditions.ReplicaSetCompletedMessage, resource.Name)
}
if reason == conditions.ReplicaSetNotAvailableReason {
msg = conditions.NotAvailableMessage
}
case *v1alpha1.Rollout:
if reason == conditions.ReplicaSetUpdatedReason {
msg = fmt.Sprintf(conditions.RolloutProgressingMessage, resource.Name)
Expand Down
19 changes: 16 additions & 3 deletions rollout/sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -577,14 +577,15 @@ func (c *rolloutContext) calculateRolloutConditions(newStatus v1alpha1.RolloutSt
isPaused := len(c.rollout.Status.PauseConditions) > 0 || c.rollout.Spec.Paused
isAborted := c.pauseContext.IsAborted()

var becameIncomplete bool // remember if we transitioned from completed
completeCond := conditions.GetRolloutCondition(c.rollout.Status, v1alpha1.RolloutCompleted)
if !isPaused && conditions.RolloutComplete(c.rollout, &newStatus) {
updateCompletedCond := conditions.NewRolloutCondition(v1alpha1.RolloutCompleted, corev1.ConditionTrue, conditions.RolloutCompletedReason, conditions.RolloutCompletedReason)
conditions.SetRolloutCondition(&newStatus, *updateCompletedCond)
} else {
if completeCond != nil {
updateCompletedCond := conditions.NewRolloutCondition(v1alpha1.RolloutCompleted, corev1.ConditionFalse, conditions.RolloutCompletedReason, conditions.RolloutCompletedReason)
conditions.SetRolloutCondition(&newStatus, *updateCompletedCond)
becameIncomplete = conditions.SetRolloutCondition(&newStatus, *updateCompletedCond)
}
}

Expand Down Expand Up @@ -619,14 +620,26 @@ func (c *rolloutContext) calculateRolloutConditions(newStatus v1alpha1.RolloutSt
msg := fmt.Sprintf(conditions.ReplicaSetCompletedMessage, rsName)
progressingCondition := conditions.NewRolloutCondition(v1alpha1.RolloutProgressing, corev1.ConditionTrue, conditions.NewRSAvailableReason, msg)
conditions.SetRolloutCondition(&newStatus, *progressingCondition)
case conditions.RolloutProgressing(c.rollout, &newStatus):
case conditions.RolloutProgressing(c.rollout, &newStatus) || becameIncomplete:
// If there is any progress made, continue by not checking if the rollout failed. This
// behavior emulates the rolling updater progressDeadline check.
msg := fmt.Sprintf(conditions.RolloutProgressingMessage, c.rollout.Name)
if c.newRS != nil {
msg = fmt.Sprintf(conditions.ReplicaSetProgressingMessage, c.newRS.Name)
}
condition := conditions.NewRolloutCondition(v1alpha1.RolloutProgressing, corev1.ConditionTrue, conditions.ReplicaSetUpdatedReason, msg)

var reason string
if newStatus.StableRS == newStatus.CurrentPodHash && becameIncomplete {
// When a fully promoted rollout becomes Incomplete, e.g., due to the ReplicaSet status changes like
// pod restarts, evicted -> recreated, we'll need to reset the rollout's condition to `PROGRESSING` to
// avoid any timeouts.
reason = conditions.ReplicaSetNotAvailableReason
msg = conditions.NotAvailableMessage
} else {
reason = conditions.ReplicaSetUpdatedReason
}
condition := conditions.NewRolloutCondition(v1alpha1.RolloutProgressing, corev1.ConditionTrue, reason, msg)

// Update the current Progressing condition or add a new one if it doesn't exist.
// If a Progressing condition with status=true already exists, we should update
// everything but lastTransitionTime. SetRolloutCondition already does that but
Expand Down
37 changes: 37 additions & 0 deletions test/e2e/functional_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1074,6 +1074,43 @@ spec:
ExpectActiveRevision("2")
}

func (s *FunctionalSuite) TestCompleteRolloutRestart() {
s.Given().
HealthyRollout(`
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
name: rollout-restart
spec:
progressDeadlineAbort: true
progressDeadlineSeconds: 15
replicas: 2
selector:
matchLabels:
app: ollout-restart
template:
metadata:
labels:
app: ollout-restart
spec:
containers:
- name: ollout-restart
image: nginx:1.19-alpine
imagePullPolicy: Always
strategy:
canary:
steps:
- setWeight: 20
`).
When().
WatchRolloutStatus("Healthy").
Sleep(16 * time.Second). // give it enough time to pass the progressDeadlineSeconds
Then().
When().
RestartRollout().
WatchRolloutStatus("Healthy")
}

func (s *FunctionalSuite) TestKubectlWaitForCompleted() {
s.Given().
HealthyRollout(`
Expand Down
6 changes: 6 additions & 0 deletions utils/conditions/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ const (
// estimated once a rollout is paused.
RolloutPausedMessage = "Rollout is paused"

// ReplicaSetNotAvailableReason is added when the replicaset of an rollout is not available.
// This could happen when a fully promoted rollout becomes incomplete, e.g.,
// due to pod restarts, evicted -> recreated. In this case, we'll need to reset the rollout's
// condition to `PROGRESSING` to avoid any timeouts.
ReplicaSetNotAvailableReason = "ReplicaSetNotAvailable"

// RolloutResumedReason is added in a rollout when it is resumed. Useful for not failing accidentally
// rollout that paused amidst a rollout and are bounded by a deadline.
RolloutResumedReason = "RolloutResumed"
Expand Down