Skip to content
This repository has been archived by the owner on Apr 25, 2023. It is now read-only.

Commit

Permalink
Merge pull request #1506 from zqzten/metrics
Browse files Browse the repository at this point in the history
Introduce standard controller runtime metrics
  • Loading branch information
k8s-ci-robot authored Jul 11, 2022
2 parents 4babb44 + 6c2b0b8 commit a402d2e
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 7 deletions.
31 changes: 30 additions & 1 deletion pkg/controller/util/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import (
"k8s.io/client-go/util/flowcontrol"
"k8s.io/client-go/util/workqueue"
runtimeclient "sigs.k8s.io/controller-runtime/pkg/client"

"sigs.k8s.io/kubefed/pkg/metrics"
)

// ReconcileFunc is the callback signature used to reconcile a single object,
// identified by its QualifiedName, and to report the outcome as a
// ReconciliationStatus.
type ReconcileFunc func(qualifiedName QualifiedName) ReconciliationStatus
Expand Down Expand Up @@ -128,6 +130,8 @@ func (w *asyncWorker) EnqueueWithDelay(qualifiedName QualifiedName, delay time.D
}

func (w *asyncWorker) Run(stopChan <-chan struct{}) {
w.initMetrics()

StartBackoffGC(w.backoff, stopChan)
w.deliverer.StartWithHandler(func(item *DelayingDelivererItem) {
qualifiedName, ok := item.Value.(*QualifiedName)
Expand Down Expand Up @@ -183,16 +187,41 @@ func (w *asyncWorker) reconcileOnce() bool {
return true
}

metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Add(1)
defer metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Add(-1)
defer metrics.UpdateControllerRuntimeReconcileTimeFromStart(w.name, time.Now())

status := w.reconcile(qualifiedName)
switch status {
case StatusAllOK:
break
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelSuccess).Inc()
case StatusError:
w.EnqueueForError(qualifiedName)
metrics.ControllerRuntimeReconcileErrors.WithLabelValues(w.name).Inc()
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelError).Inc()
case StatusNeedsRecheck:
w.EnqueueForRetry(qualifiedName)
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNeedsRecheck).Inc()
case StatusNotSynced:
w.EnqueueForClusterSync(qualifiedName)
metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNotSynced).Inc()
}
return true
}

// Values used for the "result" label of the per-controller reconcile counter.
// Each one corresponds to a ReconciliationStatus handled by the worker.
const (
	// labelSuccess is recorded when a reconcile returns StatusAllOK.
	labelSuccess = "success"
	// labelError is recorded when a reconcile returns StatusError.
	labelError = "error"
	// labelNeedsRecheck is recorded when a reconcile returns StatusNeedsRecheck.
	labelNeedsRecheck = "needs_recheck"
	// labelNotSynced is recorded when a reconcile returns StatusNotSynced.
	labelNotSynced = "not_synced"
)

// initMetrics touches every per-controller metric series owned by this worker
// with a neutral value (Set(0) / Add(0)), and publishes the configured maximum
// number of concurrent reconciles. It is called once at the top of Run.
func (w *asyncWorker) initMetrics() {
	name := w.name

	metrics.ControllerRuntimeActiveWorkers.WithLabelValues(name).Set(0)
	metrics.ControllerRuntimeReconcileErrors.WithLabelValues(name).Add(0)

	// One counter series per possible reconcile result.
	for _, result := range []string{labelSuccess, labelError, labelNeedsRecheck, labelNotSynced} {
		metrics.ControllerRuntimeReconcileTotal.WithLabelValues(name, result).Add(0)
	}

	maxWorkers := float64(w.maxConcurrentReconciles)
	metrics.ControllerRuntimeWorkerCount.WithLabelValues(name).Set(maxWorkers)
}
59 changes: 53 additions & 6 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"k8s.io/klog/v2"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)
Expand Down Expand Up @@ -58,7 +59,7 @@ var (
reconcileFederatedResourcesDuration = prometheus.NewHistogram(
prometheus.HistogramOpts{
Name: "reconcile_federated_resources_duration_seconds",
Help: "Time taken to reconcile federated resources in the target clusters.",
Help: "[Deprecated] Time taken to reconcile federated resources in the target clusters. Replaced by controller_runtime_reconcile_time_seconds.",
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0},
},
)
Expand Down Expand Up @@ -90,18 +91,45 @@ var (
controllerRuntimeReconcileDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "controller_runtime_reconcile_duration_seconds",
Help: "Time taken by various parts of Kubefed controllers reconciliation loops.",
Help: "[Deprecated] Time taken by various parts of Kubefed controllers reconciliation loops. Replaced by controller_runtime_reconcile_time_seconds.",
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0},
}, []string{"controller"},
)

controllerRuntimeReconcileDurationSummary = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Name: "controller_runtime_reconcile_quantile_seconds",
Help: "Quantiles of time taken by various parts of Kubefed controllers reconciliation loops.",
Help: "[Deprecated] Quantiles of time taken by various parts of Kubefed controllers reconciliation loops. Replaced by controller_runtime_reconcile_time_seconds.",
MaxAge: time.Hour,
}, []string{"controller"},
)

ControllerRuntimeReconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_reconcile_total",
Help: "Total number of reconciliations per controller",
}, []string{"controller", "result"})

ControllerRuntimeReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_reconcile_errors_total",
Help: "Total number of reconciliation errors per controller",
}, []string{"controller"})

ControllerRuntimeReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "controller_runtime_reconcile_time_seconds",
Help: "Length of time per reconciliation per controller",
Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60},
}, []string{"controller"})

ControllerRuntimeWorkerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "controller_runtime_max_concurrent_reconciles",
Help: "Maximum number of concurrent reconciles per controller",
}, []string{"controller"})

ControllerRuntimeActiveWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "controller_runtime_active_workers",
Help: "Number of currently used workers per controller",
}, []string{"controller"})
)

const (
Expand All @@ -117,6 +145,10 @@ const (
// RegisterAll registers all metrics.
func RegisterAll() {
metrics.Registry.MustRegister(
// expose process metrics like CPU, Memory, file descriptor usage etc.
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
// expose Go runtime metrics like GC stats, memory stats etc.
collectors.NewGoCollector(),
kubefedClusterTotal,
joinedClusterTotal,
reconcileFederatedResourcesDuration,
Expand All @@ -127,6 +159,11 @@ func RegisterAll() {
dispatchOperationDuration,
controllerRuntimeReconcileDuration,
controllerRuntimeReconcileDurationSummary,
ControllerRuntimeReconcileTotal,
ControllerRuntimeReconcileErrors,
ControllerRuntimeReconcileTime,
ControllerRuntimeWorkerCount,
ControllerRuntimeActiveWorkers,
)
}

Expand Down Expand Up @@ -203,10 +240,20 @@ func UpdateControllerReconcileDurationFromStart(controller string, start time.Ti

// UpdateControllerReconcileDuration feeds duration, converted to seconds, into
// both the summary and the histogram kept for the given controller.
func UpdateControllerReconcileDuration(controller string, duration time.Duration) {
	seconds := duration.Seconds()
	controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(seconds)
	controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(seconds)
}

// UpdateControllerRuntimeReconcileTimeFromStart measures the time elapsed
// since start and hands it to UpdateControllerRuntimeReconcileTime for the
// given controller.
func UpdateControllerRuntimeReconcileTimeFromStart(controller string, start time.Time) {
	UpdateControllerRuntimeReconcileTime(controller, time.Since(start))
}

// UpdateControllerRuntimeReconcileTime records how long one reconcile of the
// named controller took. A duration above LogReconcileLongDurationThreshold is
// additionally logged at verbosity level 4.
//
// NOTE(review): this listing appears to come from a rendered diff whose +/-
// markers were stripped, so the two observations on the deprecated
// controllerRuntimeReconcileDuration* collectors below may be removed lines
// rather than live code - confirm against the committed file.
func UpdateControllerRuntimeReconcileTime(controller string, duration time.Duration) {
if duration > LogReconcileLongDurationThreshold {
klog.V(4).Infof("Reconcile loop %s took %v to complete", controller, duration)
}

controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(duration.Seconds())
controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(duration.Seconds())
// Observation is made in seconds, labelled by controller name.
ControllerRuntimeReconcileTime.WithLabelValues(controller).Observe(duration.Seconds())
}

0 comments on commit a402d2e

Please sign in to comment.