Skip to content

Commit

Permalink
Reset checks on every eviction
Browse files Browse the repository at this point in the history
  • Loading branch information
PBundyra committed Oct 28, 2024
1 parent 8848223 commit 0e6dab2
Show file tree
Hide file tree
Showing 6 changed files with 17 additions and 15 deletions.
11 changes: 5 additions & 6 deletions pkg/controller/core/workload_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (c
reason += dtCond.Reason
message = fmt.Sprintf("%s due to %s", message, dtCond.Message)
}
workload.SetEvictedCondition(&wl, reason, message)
workload.SetEvictedConditionAndResetChecks(&wl, reason, message)
updated = true
evicted = true
}
Expand Down Expand Up @@ -379,8 +379,7 @@ func (r *WorkloadReconciler) reconcileCheckBasedEviction(ctx context.Context, wl
}
// at this point we know a Workload has at least one Retry AdmissionCheck
message := "At least one admission check is false"
workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByAdmissionCheck, message)
workload.ResetChecksOnEviction(wl)
workload.SetEvictedConditionAndResetChecks(wl, kueue.WorkloadEvictedByAdmissionCheck, message)
if err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true); err != nil {
return false, client.IgnoreNotFound(err)
}
Expand Down Expand Up @@ -416,7 +415,7 @@ func (r *WorkloadReconciler) reconcileOnLocalQueueActiveState(ctx context.Contex
return false, nil
}
log.V(3).Info("Workload is evicted because the LocalQueue is stopped", "localQueue", klog.KRef(wl.Namespace, wl.Spec.QueueName))
workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByLocalQueueStopped, "The LocalQueue is stopped")
workload.SetEvictedConditionAndResetChecks(wl, kueue.WorkloadEvictedByLocalQueueStopped, "The LocalQueue is stopped")
err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
if err == nil {
cqName := string(lq.Spec.ClusterQueue)
Expand Down Expand Up @@ -463,7 +462,7 @@ func (r *WorkloadReconciler) reconcileOnClusterQueueActiveState(ctx context.Cont
}
log.V(3).Info("Workload is evicted because the ClusterQueue is stopped", "clusterQueue", klog.KRef("", cqName))
message := "The ClusterQueue is stopped"
workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByClusterQueueStopped, message)
workload.SetEvictedConditionAndResetChecks(wl, kueue.WorkloadEvictedByClusterQueueStopped, message)
err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
if err == nil {
workload.ReportEvictedWorkload(r.recorder, wl, cqName, kueue.WorkloadEvictedByClusterQueueStopped, message)
Expand Down Expand Up @@ -541,7 +540,7 @@ func (r *WorkloadReconciler) reconcileNotReadyTimeout(ctx context.Context, req c
return 0, client.IgnoreNotFound(err)
}
message := fmt.Sprintf("Exceeded the PodsReady timeout %s", req.NamespacedName.String())
workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByPodsReadyTimeout, message)
workload.SetEvictedConditionAndResetChecks(wl, kueue.WorkloadEvictedByPodsReadyTimeout, message)
err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
if err == nil {
cqName, _ := r.queues.ClusterQueueForWorkload(wl)
Expand Down
10 changes: 6 additions & 4 deletions pkg/controller/core/workload_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -596,8 +596,9 @@ func TestReconcile(t *testing.T) {
ReserveQuota(utiltesting.MakeAdmission("q1").Obj()).
Admitted(true).
AdmissionCheck(kueue.AdmissionCheckState{
Name: "check",
State: kueue.CheckStateReady,
Name: "check",
State: kueue.CheckStatePending,
Message: "Reset to Pending after eviction. Previously: Ready",
}).
Generation(1).
Condition(metav1.Condition{
Expand Down Expand Up @@ -690,8 +691,9 @@ func TestReconcile(t *testing.T) {
ReserveQuota(utiltesting.MakeAdmission("q1").Obj()).
Admitted(true).
AdmissionCheck(kueue.AdmissionCheckState{
Name: "check",
State: kueue.CheckStateReady,
Name: "check",
State: kueue.CheckStatePending,
Message: "Reset to Pending after eviction. Previously: Ready",
}).
Generation(1).
Condition(metav1.Condition{
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/preemption/preemption.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ func (p *Preemptor) IssuePreemptions(ctx context.Context, preemptor *workload.In

func (p *Preemptor) applyPreemptionWithSSA(ctx context.Context, w *kueue.Workload, reason, message string) error {
w = w.DeepCopy()
workload.SetEvictedCondition(w, kueue.WorkloadEvictedByPreemption, message)
workload.SetEvictedConditionAndResetChecks(w, kueue.WorkloadEvictedByPreemption, message)
workload.SetPreemptedCondition(w, reason, message)
return workload.ApplyAdmissionStatus(ctx, p.client, w, true)
}
Expand Down
3 changes: 2 additions & 1 deletion pkg/workload/workload.go
Original file line number Diff line number Diff line change
Expand Up @@ -570,7 +570,7 @@ func SetDeactivationTarget(w *kueue.Workload, reason string, message string) {
apimeta.SetStatusCondition(&w.Status.Conditions, condition)
}

func SetEvictedCondition(w *kueue.Workload, reason string, message string) {
func SetEvictedConditionAndResetChecks(w *kueue.Workload, reason string, message string) {
condition := metav1.Condition{
Type: kueue.WorkloadEvicted,
Status: metav1.ConditionTrue,
Expand All @@ -579,6 +579,7 @@ func SetEvictedCondition(w *kueue.Workload, reason string, message string) {
ObservedGeneration: w.Generation,
}
apimeta.SetStatusCondition(&w.Status.Conditions, condition)
ResetChecksOnEviction(w)
}

// PropagateResourceRequests synchronizes w.Status.ResourceRequests to
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ var _ = ginkgo.Describe("Workload controller", ginkgo.Ordered, ginkgo.ContinueOn
ginkgo.By("evicting the workload, the accumulated admission time is updated", func() {
gomega.Eventually(func(g gomega.Gomega) {
g.Expect(k8sClient.Get(ctx, key, wl)).To(gomega.Succeed())
workload.SetEvictedCondition(wl, "ByTest", "by test")
workload.SetEvictedConditionAndResetChecks(wl, "ByTest", "by test")
g.Expect(workload.ApplyAdmissionStatus(ctx, k8sClient, wl, false)).To(gomega.Succeed())
}, util.Timeout, util.Interval).Should(gomega.Succeed())
util.FinishEvictionForWorkloads(ctx, k8sClient, wl)
Expand Down
4 changes: 2 additions & 2 deletions test/integration/controller/jobs/pod/pod_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu
ginkgo.By("checking that the Pods get a deletion timestamp when the workload is evicted", func() {
gomega.Expect(func() error {
w := createdWorkload.DeepCopy()
workload.SetEvictedCondition(w, "ByTest", "by test")
workload.SetEvictedConditionAndResetChecks(w, "ByTest", "by test")
return workload.ApplyAdmissionStatus(ctx, k8sClient, w, false)
}()).Should(gomega.Succeed())

Expand Down Expand Up @@ -1321,7 +1321,7 @@ var _ = ginkgo.Describe("Pod controller", ginkgo.Ordered, ginkgo.ContinueOnFailu
})

ginkgo.By("setting evicted condition to true", func() {
workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByPreemption, "By test")
workload.SetEvictedConditionAndResetChecks(wl, kueue.WorkloadEvictedByPreemption, "By test")
gomega.Expect(
workload.ApplyAdmissionStatus(ctx, k8sClient, wl, false),
).Should(gomega.Succeed())
Expand Down

0 comments on commit 0e6dab2

Please sign in to comment.