diff --git a/cmd/nfd-topology-updater/main.go b/cmd/nfd-topology-updater/main.go index f187f4df47..171ecd09ab 100644 --- a/cmd/nfd-topology-updater/main.go +++ b/cmd/nfd-topology-updater/main.go @@ -106,6 +106,8 @@ func initFlags(flagset *flag.FlagSet) (*topology.Args, *resourcemonitor.Args) { "Do not publish discovered features to the cluster-local Kubernetes API server.") flagset.StringVar(&args.KubeConfigFile, "kubeconfig", "", "Kube config file.") + flagset.IntVar(&args.MetricsPort, "metrics", 8081, + "Port on which to expose metrics.") flagset.DurationVar(&resourcemonitorArgs.SleepInterval, "sleep-interval", time.Duration(60)*time.Second, "Time to sleep between CR updates. zero means no CR updates on interval basis. [Default: 60s]") flagset.StringVar(&resourcemonitorArgs.Namespace, "watch-namespace", "*", diff --git a/deployment/base/topologyupdater-daemonset/topologyupdater-daemonset.yaml b/deployment/base/topologyupdater-daemonset/topologyupdater-daemonset.yaml index 8971ea8abb..28abd3a4bd 100644 --- a/deployment/base/topologyupdater-daemonset/topologyupdater-daemonset.yaml +++ b/deployment/base/topologyupdater-daemonset/topologyupdater-daemonset.yaml @@ -22,3 +22,6 @@ spec: command: - "nfd-topology-updater" args: [] + ports: + - name: metrics + containerPort: 8081 diff --git a/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml b/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml index 6082635cd1..2e5d152a8f 100644 --- a/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml +++ b/deployment/helm/node-feature-discovery/templates/topologyupdater.yaml @@ -70,6 +70,10 @@ spec: # Disable kubelet state tracking by giving an empty path - "-kubelet-state-dir=" {{- end }} + - -metrics={{ .Values.topologyUpdater.metricsPort | default "8081"}} + ports: + - name: metrics + containerPort: {{ .Values.topologyUpdater.metricsPort | default "8081"}} volumeMounts: {{- if .Values.topologyUpdater.kubeletConfigPath | empty | 
not }} - name: kubelet-config diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index b7d5a731b2..a259560665 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -417,6 +417,7 @@ topologyUpdater: rbac: create: true + metricsPort: 8081 kubeletConfigPath: kubeletPodResourcesSockPath: updateInterval: 60s diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md index 9b2450e719..bc09e85c88 100644 --- a/docs/deployment/helm.md +++ b/docs/deployment/helm.md @@ -173,6 +173,7 @@ API's you need to install the prometheus operator in your cluster. | `topologyUpdater.serviceAccount.annotations` | dict | {} | Annotations to add to the service account for topology updater | | `topologyUpdater.serviceAccount.name` | string | | The name of the service account for topology updater to use. If not set and create is true, a name is generated using the fullname template and `-topology-updater` suffix | | `topologyUpdater.rbac.create` | bool | true | Specifies whether to create [RBAC][rbac] configuration for topology updater | +| `topologyUpdater.metricsPort` | integer | 8081 | Port on which to expose prometheus metrics | | `topologyUpdater.kubeletConfigPath` | string | "" | Specifies the kubelet config host path | | `topologyUpdater.kubeletPodResourcesSockPath` | string | "" | Specifies the kubelet sock path to read pod resources | | `topologyUpdater.updateInterval` | string | 60s | Time to sleep between CR updates. Non-positive value implies no CR update. 
| diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index 38a4011fe0..acc962e51d 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -20,6 +20,7 @@ The exposed metrics are | `nfd_node_updates_total` | Counter | Number of nodes updated | `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects | `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node +| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods. ## Via Kustomize diff --git a/docs/reference/topology-updater-commandline-reference.md b/docs/reference/topology-updater-commandline-reference.md index c3e1debe01..3fed551522 100644 --- a/docs/reference/topology-updater-commandline-reference.md +++ b/docs/reference/topology-updater-commandline-reference.md @@ -72,6 +72,20 @@ Example: nfd-topology-updater -oneshot -no-publish ``` +### -metrics + +The `-metrics` flag specifies the port on which to expose +[Prometheus](https://prometheus.io/) metrics. Setting this to 0 disables the +metrics server on nfd-topology-updater. + +Default: 8081 + +Example: + +```bash +nfd-topology-updater -metrics=12345 +``` + ### -sleep-interval The `-sleep-interval` specifies the interval between resource hardware diff --git a/pkg/nfd-topology-updater/metrics.go b/pkg/nfd-topology-updater/metrics.go new file mode 100644 index 0000000000..479647bbaa --- /dev/null +++ b/pkg/nfd-topology-updater/metrics.go @@ -0,0 +1,75 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nfdtopologyupdater + +import ( + "fmt" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/klog/v2" + "sigs.k8s.io/node-feature-discovery/pkg/version" +) + +// Names of the metrics exposed by the topology updater. When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names +const ( + buildInfoQuery = "nfd_topology_updater_build_info" + scanErrorsQuery = "nfd_topology_updater_scan_errors_total" +) + +var ( + srv *http.Server // metrics HTTP server; assigned by runMetricsServer and closed by stopMetricsServer + + buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: buildInfoQuery, + Help: "Version from which Node Feature Discovery was built.", + ConstLabels: map[string]string{ + "version": version.Get(), + }, + }) + scanErrors = prometheus.NewCounter(prometheus.CounterOpts{ + Name: scanErrorsQuery, + Help: "Number of errors in scanning resource allocation of pods.", + }) +) + +// registerVersion sets the buildInfo gauge to the current time; the version argument is not used (the "version" label is filled from version.Get() when the gauge is created). 
+func registerVersion(version string) { + buildInfo.SetToCurrentTime() +} + +// runMetricsServer registers buildInfo and scanErrors on a new registry and serves them at /metrics on the given port; it blocks in ListenAndServe and logs the returned error once the server exits. +func runMetricsServer(port int) { + r := prometheus.NewRegistry() + r.MustRegister(buildInfo, + scanErrors) + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{})) + + klog.InfoS("metrics server starting", "port", port) + srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux} + klog.InfoS("metrics server stopped", "exitCode", srv.ListenAndServe()) +} + +// stopMetricsServer closes the metrics server if srv has been set and does nothing otherwise. NOTE(review): srv is assigned in the goroutine running runMetricsServer and read here without synchronization - confirm this is acceptable. +func stopMetricsServer() { + if srv != nil { + klog.InfoS("stopping metrics server", "port", srv.Addr) + srv.Close() + } +} diff --git a/pkg/nfd-topology-updater/nfd-topology-updater.go b/pkg/nfd-topology-updater/nfd-topology-updater.go index 0c620355d7..2d96d9f668 100644 --- a/pkg/nfd-topology-updater/nfd-topology-updater.go +++ b/pkg/nfd-topology-updater/nfd-topology-updater.go @@ -50,6 +50,7 @@ const ( // Args are the command line arguments type Args struct { + MetricsPort int NoPublish bool Oneshot bool KubeConfigFile string @@ -142,6 +143,13 @@ func (w *nfdTopologyUpdater) Run() error { return fmt.Errorf("faild to configure Node Feature Discovery Topology Updater: %w", err) } + // Start the metrics server in the background and stop it when Run returns; skipped when the metrics port is zero or negative + if w.args.MetricsPort > 0 { + go runMetricsServer(w.args.MetricsPort) + registerVersion(version.Get()) + defer stopMetricsServer() + } + var resScan resourcemonitor.ResourcesScanner resScan, err = resourcemonitor.NewPodResourcesScanner(w.resourcemonitorArgs.Namespace, podResClient, w.apihelper, w.resourcemonitorArgs.PodSetFingerprint) @@ -169,6 +177,7 @@ func (w *nfdTopologyUpdater) Run() error { klog.V(1).InfoS("received updated pod resources", "podResources", utils.DelayedDumper(scanResponse.PodResources)) if err != nil { klog.ErrorS(err, "scan failed") + scanErrors.Inc() continue } zones = resAggr.Aggregate(scanResponse.PodResources)