Skip to content

Commit

Permalink
nfd-topology-updater: add metrics support
Browse files Browse the repository at this point in the history
For now, add only one metric, a counter for the errors occurring while
scanning pod resources on the node.
  • Loading branch information
marquiz committed Aug 4, 2023
1 parent 45dc46a commit 00bc9e8
Show file tree
Hide file tree
Showing 9 changed files with 110 additions and 0 deletions.
2 changes: 2 additions & 0 deletions cmd/nfd-topology-updater/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) {
"Do not publish discovered features to the cluster-local Kubernetes API server.")
flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "",
"Kube config file.")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second,
"Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]")
flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,6 @@ spec:
command:
- "nfd-topology-updater"
args: []
ports:
- name: metrics
containerPort: 8081
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ spec:
# Disable kubelet state tracking by giving an empty path
- "-kubelet-state-dir="
{{- end }}
- -metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}}
ports:
- name: metrics
containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}}
volumeMounts:
{{- if .Values.topologyUpdater.kubeletConfigPath | empty | not }}
- name: kubelet-config
Expand Down
1 change: 1 addition & 0 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,7 @@ topologyUpdater:
rbac:
create: true

metricsPort: 8081
kubeletConfigPath:
kubeletPodResourcesSockPath:
updateInterval: 60s
Expand Down
1 change: 1 addition & 0 deletions docs/deployment/helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ API's you need to install the prometheus operator in your cluster.
| `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater |
| `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix |
| `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater |
| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose Prometheus metrics |
| `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path |
| `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources |
| `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. |
Expand Down
1 change: 1 addition & 0 deletions docs/deployment/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ The exposed metrics are
| `nfd_node_updates_total` | Counter | Number of nodes updated
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors encountered while scanning the resource allocation of pods

## Via Kustomize

Expand Down
14 changes: 14 additions & 0 deletions docs/reference/topology-updater-commandline-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,20 @@ Example:
nfd-topology-updater -oneshot -no-publish
```

### -metrics

The `-metrics` flag specifies the port on which to expose
[Prometheus](https://prometheus.io/) metrics. Setting this to 0 disables the
metrics server on nfd-topology-updater.

Default: 8081

Example:

```bash
nfd-topology-updater -metrics=12345
```

### -sleep-interval

The `-sleep-interval` specifies the interval between resource hardware
Expand Down
75 changes: 75 additions & 0 deletions pkg/nfd-topology-updater/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*
Copyright 2023 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nfdtopologyupdater

import (
"fmt"
"net/http"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/klog/v2"
"sigs.k8s.io/node-feature-discovery/pkg/version"
)

// Names of the metrics exposed by nfd-topology-updater.
// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names

// buildInfoQuery is the name of the build information gauge.
const buildInfoQuery = "nfd_topology_updater_build_info"

// scanErrorsQuery is the name of the counter of pod resource scan errors.
const scanErrorsQuery = "nfd_topology_updater_scan_errors_total"

// srv is the HTTP server exposing the metrics endpoint. It stays nil until
// runMetricsServer has created it.
var srv *http.Server

// buildInfo is a gauge that carries the NFD version as a constant "version"
// label.
var buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{
	Name:        buildInfoQuery,
	Help:        "Version from which Node Feature Discovery was built.",
	ConstLabels: prometheus.Labels{"version": version.Get()},
})

// scanErrors counts the errors encountered while scanning the resource
// allocation of pods.
var scanErrors = prometheus.NewCounter(prometheus.CounterOpts{
	Name: scanErrorsQuery,
	Help: "Number of errors in scanning resource allocation of pods.",
})

// registerVersion initializes the build info gauge by setting its value to
// the current time. The version string itself is published through the
// constant "version" label attached to the gauge when it is created, so the
// argument is not used here. The parameter is kept for compatibility with
// existing callers and is left unnamed so that it does not shadow the imported
// version package.
func registerVersion(_ string) {
	buildInfo.SetToCurrentTime()
}

// runMetricsServer registers the topology-updater metrics in a dedicated
// registry and serves them over HTTP at /metrics on the given port.
//
// The call blocks until the server terminates, so callers are expected to run
// it in its own goroutine and to use stopMetricsServer to shut it down.
//
// NOTE(review): srv is assigned here without synchronization while
// stopMetricsServer reads it from another goroutine; no timeouts are
// configured on the server either. Both are worth revisiting.
func runMetricsServer(port int) {
	r := prometheus.NewRegistry()
	r.MustRegister(buildInfo,
		scanErrors)
	mux := http.NewServeMux()
	mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))

	klog.InfoS("metrics server starting", "port", port)
	srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux}

	// ListenAndServe always returns a non-nil error. ErrServerClosed is the
	// expected outcome of the server being closed; anything else (for example
	// the port being already in use) is a real failure and is reported as such.
	if err := srv.ListenAndServe(); err != http.ErrServerClosed {
		klog.ErrorS(err, "metrics server failed", "port", port)
		return
	}
	klog.InfoS("metrics server stopped", "port", port)
}

// stopMetricsServer closes the metrics server if one has been created by
// runMetricsServer; it is a no-op otherwise. A failure to close the server is
// logged rather than silently discarded.
func stopMetricsServer() {
	if srv == nil {
		return
	}

	klog.InfoS("stopping metrics server", "port", srv.Addr)
	if err := srv.Close(); err != nil {
		klog.ErrorS(err, "failed to close metrics server", "port", srv.Addr)
	}
}
9 changes: 9 additions & 0 deletions pkg/nfd-topology-updater/nfd-topology-updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ const (

// Args are the command line arguments
type Args struct {
MetricsPort int
NoPublish bool
Oneshot bool
KubeConfigFile string
Expand Down Expand Up @@ -142,6 +143,13 @@ func (w *nfdTopologyUpdater) Run() error {
return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err)
}

// Register to metrics server
if w.args.MetricsPort > 0 {
go runMetricsServer(w.args.MetricsPort)
registerVersion(version.Get())
defer stopMetricsServer()
}

var resScan resourcemonitor.ResourcesScanner

resScan, err = resourcemonitor.NewPodResourcesScanner(w.resourcemonitorArgs.Namespace, podResClient, w.apihelper, w.resourcemonitorArgs.PodSetFingerprint)
Expand Down Expand Up @@ -169,6 +177,7 @@ func (w *nfdTopologyUpdater) Run() error {
klog.V(1).InfoS("received updated pod resources", "podResources", utils.DelayedDumper(scanResponse.PodResources))
if err != nil {
klog.ErrorS(err, "scan failed")
scanErrors.Inc()
continue
}
zones = resAggr.Aggregate(scanResponse.PodResources)
Expand Down

0 comments on commit 00bc9e8

Please sign in to comment.