diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index 23c1c3c260..8a8c357915 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -148,6 +148,19 @@ The label 'reason' can have the following values: }, []string{"cluster_queue", "reason"}, ) + PreemptedWorkloadsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Subsystem: constants.KueueName, + Name: "preempted_workloads_total", + Help: `The number of preempted workloads per 'preemptor_cluster_queue', +The label 'reason' can have the following values: +- "preeemption.InClusterQueueReason" +- "preemption.InCohortReclamationReason" +- "preemption.InCohortFairSharingReason" +- "preemption.InCohortReclaimWhileBorrowingReason"`, + }, []string{"preemptor_cluster_queue", "reason"}, + ) + // Metrics tied to the cache. ReservingActiveWorkloads = prometheus.NewGaugeVec( @@ -262,6 +275,10 @@ func ReportEvictedWorkloads(cqName, reason string) { EvictedWorkloadsTotal.WithLabelValues(cqName, reason).Inc() } +func ReportPreemptedWorkloads(preemptorCqName, reason string) { + PreemptedWorkloadsTotal.WithLabelValues(preemptorCqName, reason).Inc() +} + func ClearQueueSystemMetrics(cqName string) { PendingWorkloads.DeleteLabelValues(cqName, PendingStatusActive) PendingWorkloads.DeleteLabelValues(cqName, PendingStatusInadmissible) @@ -271,6 +288,7 @@ func ClearQueueSystemMetrics(cqName string) { admissionWaitTime.DeleteLabelValues(cqName) admissionChecksWaitTime.DeleteLabelValues(cqName) EvictedWorkloadsTotal.DeletePartialMatch(prometheus.Labels{"cluster_queue": cqName}) + PreemptedWorkloadsTotal.DeletePartialMatch(prometheus.Labels{"preemptor_cluster_queue": cqName}) } func ReportClusterQueueStatus(cqName string, cqStatus ClusterQueueStatus) { @@ -378,6 +396,7 @@ func Register() { quotaReservedWaitTime, AdmittedWorkloadsTotal, EvictedWorkloadsTotal, + PreemptedWorkloadsTotal, admissionWaitTime, admissionChecksWaitTime, ClusterQueueResourceUsage, diff --git a/pkg/scheduler/preemption/preemption.go b/pkg/scheduler/preemption/preemption.go index 91d72b00c5..a8af7c5a58 100644 --- a/pkg/scheduler/preemption/preemption.go +++ b/pkg/scheduler/preemption/preemption.go @@ -162,6 +162,26 @@ func canBorrowWithinCohort(cq *cache.ClusterQueue, wl *kueue.Workload) (bool, *i return true, &threshold } +// Preemption origins +const ( + // ClusterQueueOrigin indicates that preemption originated from cluster queue + ClusterQueueOrigin = "ClusterQueue" + // CohortOrigin indicates that preemption originated from cohort + CohortOrigin = "Cohort" +) + +// Reasons of ClusterQueueOrigin +const ( + InClusterQueueReason = "InClusterQueue" +) + +// Reasons of CohortOrigin +const ( + InCohortReclamationReason = "InCohortReclamation" + InCohortFairSharingReason = "InCohortFairSharing" + InCohortReclaimWhileBorrowingReason = "InCohortReclaimWhileBorrowing" +) + // IssuePreemptions marks the target workloads as evicted. func (p *Preemptor) IssuePreemptions(ctx context.Context, preemptor *workload.Info, targets []*workload.Info, cq *cache.ClusterQueue) (int, error) { log := ctrl.LoggerFrom(ctx) @@ -187,6 +207,8 @@ func (p *Preemptor) IssuePreemptions(ctx context.Context, preemptor *workload.In } log.V(3).Info("Preempted", "targetWorkload", klog.KObj(target.Obj), "reason", reason, "message", message) + metrics.ReportPreemptedWorkloads(preemptor.ClusterQueue, reason) + p.recorder.Eventf(target.Obj, corev1.EventTypeNormal, "Preempted", message) metrics.ReportEvictedWorkloads(target.ClusterQueue, kueue.WorkloadEvictedByPreemption) } else {