Skip to content

Commit

Permalink
Add metrics for admission attempts count and duration
Browse files Browse the repository at this point in the history
Add role and rolebinding for prometheus to be able to list the services in the kueue-system namespace.

Change-Id: I77cf51536ebf53ece9a4ba2d8457dbc3e71d1e8d
  • Loading branch information
alculquicondor committed Apr 27, 2022
1 parent 9968361 commit f0d01ea
Show file tree
Hide file tree
Showing 7 changed files with 118 additions and 5 deletions.
3 changes: 0 additions & 3 deletions config/default/manager_auth_proxy_patch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,3 @@ spec:
- containerPort: 8443
protocol: TCP
name: https
- name: manager
args:
- "--zap-log-level=2"
1 change: 0 additions & 1 deletion config/manager/manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ spec:
- command:
- /manager
args:
- --leader-elect
- "--zap-log-level=2"
imagePullPolicy: Always
image: controller:latest
Expand Down
1 change: 1 addition & 0 deletions config/prometheus/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
resources:
- monitor.yaml
- role.yaml
46 changes: 46 additions & 0 deletions config/prometheus/role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: system
rules:
- apiGroups:
- ""
resources:
- services
- endpoints
- pods
verbs:
- get
- list
- watch
- apiGroups:
- extensions
resources:
- ingresses
verbs:
- get
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- ingresses
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/google/go-cmp v0.5.7
github.com/onsi/ginkgo/v2 v2.1.3
github.com/onsi/gomega v1.18.1
github.com/prometheus/client_golang v1.12.1
go.uber.org/zap v1.21.0
k8s.io/api v0.23.4
k8s.io/apimachinery v0.23.4
Expand Down Expand Up @@ -44,7 +45,6 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.12.1 // indirect
github.com/prometheus/client_model v0.2.0 // indirect
github.com/prometheus/common v0.32.1 // indirect
github.com/prometheus/procfs v0.7.3 // indirect
Expand Down
63 changes: 63 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
Copyright 2022 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"time"

"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

type AdmissionResult string

const (
subsystemName = "kueue"

SuccessAdmissionResult AdmissionResult = "success"
InadmissibleAdmissionResult AdmissionResult = "inadmissible"
)

var (
admissionAttempts = prometheus.NewCounterVec(
prometheus.CounterOpts{
Subsystem: subsystemName,
Name: "admission_attempts_total",
Help: "Number of attempts to admit pods, by result. `success` means that at least one workload was admitted, `inadmissible` means that no workload was admitted.",
}, []string{"result"},
)

admissionAttemptLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: subsystemName,
Name: "admission_attempt_duration_seconds",
Help: "Latency of an admission attempt",
}, []string{"result"},
)
)

func AdmissionAttempt(result AdmissionResult, duration time.Duration) {
admissionAttempts.WithLabelValues(string(result)).Inc()
admissionAttemptLatency.WithLabelValues(string(result)).Observe(duration.Seconds())
}

func init() {
metrics.Registry.MustRegister(
admissionAttempts,
admissionAttempts,
)
}
7 changes: 7 additions & 0 deletions pkg/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"sort"
"time"

"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
Expand All @@ -35,6 +36,7 @@ import (
"k8s.io/klog/v2"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/kueue/pkg/metrics"

kueue "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
"sigs.k8s.io/kueue/pkg/cache"
Expand Down Expand Up @@ -85,6 +87,7 @@ func (s *Scheduler) schedule(ctx context.Context) {
if len(headWorkloads) == 0 {
return
}
startTime := time.Now()

// 2. Take a snapshot of the cache.
snapshot := s.cache.Snapshot()
Expand Down Expand Up @@ -127,6 +130,7 @@ func (s *Scheduler) schedule(ctx context.Context) {
}

// 6. Requeue the heads that were not scheduled.
result := metrics.InadmissibleAdmissionResult
for _, e := range entries {
log.V(3).Info("Workload evaluated for admission",
"workload", klog.KObj(e.Obj),
Expand All @@ -135,8 +139,11 @@ func (s *Scheduler) schedule(ctx context.Context) {
"reason", e.inadmissibleReason)
if e.status != assumed {
s.requeueAndUpdate(log, ctx, e)
} else {
result = metrics.SuccessAdmissionResult
}
}
metrics.AdmissionAttempt(result, time.Since(startTime))
}

type entryStatus string
Expand Down

0 comments on commit f0d01ea

Please sign in to comment.