From f0d01ea8ecc49c751fa6af128028f6b69260eeac Mon Sep 17 00:00:00 2001 From: Aldo Culquicondor Date: Tue, 26 Apr 2022 15:11:38 -0400 Subject: [PATCH] Add metrics for admission attempts count and duration Add role and rolebinding for prometheus to be able to list the services in the kueue-system namespace. Change-Id: I77cf51536ebf53ece9a4ba2d8457dbc3e71d1e8d --- config/default/manager_auth_proxy_patch.yaml | 3 - config/manager/manager.yaml | 1 - config/prometheus/kustomization.yaml | 1 + config/prometheus/role.yaml | 46 ++++++++++++++ go.mod | 2 +- pkg/metrics/metrics.go | 63 ++++++++++++++++++++ pkg/scheduler/scheduler.go | 7 +++ 7 files changed, 118 insertions(+), 5 deletions(-) create mode 100644 config/prometheus/role.yaml create mode 100644 pkg/metrics/metrics.go diff --git a/config/default/manager_auth_proxy_patch.yaml b/config/default/manager_auth_proxy_patch.yaml index ca55a81174..d4f6481707 100644 --- a/config/default/manager_auth_proxy_patch.yaml +++ b/config/default/manager_auth_proxy_patch.yaml @@ -20,6 +20,3 @@ spec: - containerPort: 8443 protocol: TCP name: https - - name: manager - args: - - "--zap-log-level=2" diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 64846788e1..ca9b156021 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -30,7 +30,6 @@ spec: - command: - /manager args: - - --leader-elect - "--zap-log-level=2" imagePullPolicy: Always image: controller:latest diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml index ed137168a1..2363f098bf 100644 --- a/config/prometheus/kustomization.yaml +++ b/config/prometheus/kustomization.yaml @@ -1,2 +1,3 @@ resources: - monitor.yaml +- role.yaml diff --git a/config/prometheus/role.yaml b/config/prometheus/role.yaml new file mode 100644 index 0000000000..bb5f3b2df5 --- /dev/null +++ b/config/prometheus/role.yaml @@ -0,0 +1,46 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: prometheus-k8s 
+ namespace: system +rules: +- apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/go.mod b/go.mod index e4088c4c9a..a409a487b1 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/google/go-cmp v0.5.7 github.com/onsi/ginkgo/v2 v2.1.3 github.com/onsi/gomega v1.18.1 + github.com/prometheus/client_golang v1.12.1 go.uber.org/zap v1.21.0 k8s.io/api v0.23.4 k8s.io/apimachinery v0.23.4 @@ -44,7 +45,6 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/prometheus/client_golang v1.12.1 // indirect github.com/prometheus/client_model v0.2.0 // indirect github.com/prometheus/common v0.32.1 // indirect github.com/prometheus/procfs v0.7.3 // indirect diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go new file mode 100644 index 0000000000..46341db46f --- /dev/null +++ b/pkg/metrics/metrics.go @@ -0,0 +1,63 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package metrics
+
+import (
+	"time"
+
+	"github.com/prometheus/client_golang/prometheus"
+	"sigs.k8s.io/controller-runtime/pkg/metrics"
+)
+
+type AdmissionResult string
+
+const (
+	subsystemName = "kueue"
+
+	SuccessAdmissionResult      AdmissionResult = "success"
+	InadmissibleAdmissionResult AdmissionResult = "inadmissible"
+)
+
+var (
+	admissionAttempts = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: subsystemName,
+			Name:      "admission_attempts_total",
+			Help:      "Number of attempts to admit workloads, by result. `success` means that at least one workload was admitted, `inadmissible` means that no workload was admitted.",
+		}, []string{"result"},
+	)
+
+	admissionAttemptLatency = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Subsystem: subsystemName,
+			Name:      "admission_attempt_duration_seconds",
+			Help:      "Latency of an admission attempt",
+		}, []string{"result"},
+	)
+)
+
+func AdmissionAttempt(result AdmissionResult, duration time.Duration) {
+	admissionAttempts.WithLabelValues(string(result)).Inc()
+	admissionAttemptLatency.WithLabelValues(string(result)).Observe(duration.Seconds())
+}
+
+func init() {
+	metrics.Registry.MustRegister(
+		admissionAttempts,
+		admissionAttemptLatency,
+	)
+}
diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go
index 665bd8392d..30f7bdc45a 100644
--- a/pkg/scheduler/scheduler.go
+++ b/pkg/scheduler/scheduler.go
@@ -20,6 +20,7 @@ import (
 	"context"
 	"fmt"
 	"sort"
+	"time"
 
 	"github.com/go-logr/logr"
 	corev1 "k8s.io/api/core/v1"
@@ -35,6 +36,7 @@ import (
 	"k8s.io/klog/v2"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/kueue/pkg/metrics"
 
 	kueue "sigs.k8s.io/kueue/apis/kueue/v1alpha1"
 	"sigs.k8s.io/kueue/pkg/cache"
@@ -85,6 +87,7 @@ func (s *Scheduler) schedule(ctx context.Context) {
 	if len(headWorkloads) == 0 {
 		return
 	}
+	startTime := time.Now()
 
 	// 2. 
Take a snapshot of the cache. snapshot := s.cache.Snapshot() @@ -127,6 +130,7 @@ func (s *Scheduler) schedule(ctx context.Context) { } // 6. Requeue the heads that were not scheduled. + result := metrics.InadmissibleAdmissionResult for _, e := range entries { log.V(3).Info("Workload evaluated for admission", "workload", klog.KObj(e.Obj), @@ -135,8 +139,11 @@ func (s *Scheduler) schedule(ctx context.Context) { "reason", e.inadmissibleReason) if e.status != assumed { s.requeueAndUpdate(log, ctx, e) + } else { + result = metrics.SuccessAdmissionResult } } + metrics.AdmissionAttempt(result, time.Since(startTime)) } type entryStatus string