Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nfd-topology-updater: add metrics support #1295

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/nfd-topology-updater/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) {
"Do not publish discovered features to the cluster-local Kubernetes API server.")
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
"Kube config file.")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ spec:
command:
- "nfd-topology-updater"
args: []
ports:
- name: metrics
containerPort: 8081
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ spec:
# Disable kubelet state tracking by giving an empty path
- "-kubelet-state-dir="
{{- end }}
- -metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}
ports:
- name: metrics
containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
volumeMounts:
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
- name: kubelet-config
Expand Down
1 change: 1 addition & 0 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@ topologyUpdater:
rbac:
create: true

metricsPort: 8081
kubeletConfigPath:
kubeletPodResourcesSockPath:
updateInterval: 60s
Expand Down
1 change: 1 addition & 0 deletions docs/deployment/helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ API's you need to install the prometheus operator in your cluster.
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose prometheus metrics |
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |
Expand Down
1 change: 1 addition & 0 deletions docs/deployment/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ The exposed metrics are
| `nfd_node_updates_total` | Counter | Number of nodes updated
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.

## Via Kustomize

Expand Down
14 changes: 14 additions & 0 deletions docs/reference/topology-updater-commandline-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,20 @@ Example:
nfd-topology-updater -oneshot -no-publish
```

### -metrics

The `-metrics` flag specifies the port on which to expose
[Prometheus](https://prometheus.io/) metrics. Setting this to 0 disables the
metrics server on nfd-topology-updater.

Default: 8081

Example:

```bash
nfd-topology-updater -metrics=12345
```

### -sleep-interval

The `-sleep-interval` specifies the interval between resource hardware
Expand Down
75 changes: 75 additions & 0 deletions pkg/nfd-topology-updater/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nfdtopologyupdater

import (
"fmt"
"net/http"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog/v2"
"sigs.k8s.io/node-feature-discovery/pkg/version"
)

// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const (
buildInfoQuery = "nfd_topology_updater_build_info"
scanErrorsQuery = "nfd_topology_updater_scan_errors_total"
)

var (
srv *http.Server

buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
Name: buildInfoQuery,
Help: "Version from which Node Feature Discovery was built.",
ConstLabels: map[string]string{
"version": version.Get(),
},
})
scanErrors = prometheus.NewCounter(prometheus.CounterOpts{
Name: scanErrorsQuery,
Help: "Number of errors in scanning resource allocation of pods.",
})
)

// registerVersion exposes the Operator build version.
func registerVersion(version string) {
buildInfo.SetToCurrentTime()
}

// runMetricsServer starts a http server to expose metrics
func runMetricsServer(port int) {
r := prometheus.NewRegistry()
r.MustRegister(buildInfo,
scanErrors)
mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))

klog.InfoS("metrics server starting", "port", port)
srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux}
klog.InfoS("metrics server stopped", "exitCode", srv.ListenAndServe())
}

// stopMetricsServer stops the metrics server
func stopMetricsServer() {
if srv != nil {
klog.InfoS("stopping metrics server", "port", srv.Addr)
srv.Close()
}
}
9 changes: 9 additions & 0 deletions pkg/nfd-topology-updater/nfd-topology-updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ const (

// Args are the command line arguments
type Args struct {
MetricsPort int
NoPublish bool
Oneshot bool
KubeConfigFile string
Expand Down Expand Up @@ -142,6 +143,13 @@ func (w *nfdTopologyUpdater) Run() error {
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
}

// Register to metrics server
if w.args.MetricsPort > 0 {
go runMetricsServer(w.args.MetricsPort)
registerVersion(version.Get())
defer stopMetricsServer()
}

var resScan resourcemonitor.ResourcesScanner

resScan, err = resourcemonitor.NewPodResourcesScanner(w.resourcemonitorArgs.Namespace, podResClient, w.apihelper, w.resourcemonitorArgs.PodSetFingerprint)
Expand Down Expand Up @@ -169,6 +177,7 @@ func (w *nfdTopologyUpdater) Run() error {
klog.V(1).InfoS("received updated pod resources", "podResources", utils.DelayedDumper(scanResponse.PodResources))
if err != nil {
klog.ErrorS(err, "scan failed")
scanErrors.Inc()
continue
}
zones = resAggr.Aggregate(scanResponse.PodResources)
Expand Down