From e3aedd33e27965fc232136690f80fd7e6314d59e Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 6 Jun 2023 16:39:02 +0200 Subject: [PATCH] Enable metrics via prometheus operator Expose metrics via monitoring.coreos.com/v1 The exposed metrics are | Metric | Type | Meaning | | --------------- | ---------------- | ---------------- | | `nfd_master_build_info` | Gauge | Version from which nfd-master was built. | | `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built. | | `nfd_updated_nodes` | Counter | Number of nodes updated by the master | | `nfd_crd_processing_time` | Gauge | Time taken to process a NodeFeatureRule CRD | | `nfd_feature_discovery_duration_seconds` | HistogramVec | Time taken to discover features on a node | Signed-off-by: Carlos Eduardo Arango Gutierrez Co-authored-by: Markus Lehtonen --- cmd/nfd-master/main.go | 2 + cmd/nfd-worker/main.go | 2 + deployment/base/master/master-deployment.yaml | 2 + .../worker-daemonset/worker-daemonset.yaml | 3 + .../templates/master.yaml | 4 +- .../templates/prometheus.yaml | 23 ++++++ .../templates/worker.yaml | 4 + .../helm/node-feature-discovery/values.yaml | 5 ++ .../overlays/prometheus/kustomization.yaml | 7 ++ deployment/overlays/prometheus/monitor.yaml | 20 +++++ docs/deployment/helm.md | 7 ++ docs/deployment/kustomize.md | 14 ++++ docs/deployment/metrics.md | 43 ++++++++++ go.mod | 16 ++-- go.sum | 29 ++++--- pkg/nfd-master/metrics.go | 82 +++++++++++++++++++ pkg/nfd-master/nfd-master.go | 14 ++++ pkg/nfd-worker/metrics.go | 80 ++++++++++++++++++ pkg/nfd-worker/nfd-worker.go | 10 ++- 19 files changed, 344 insertions(+), 23 deletions(-) create mode 100644 deployment/helm/node-feature-discovery/templates/prometheus.yaml create mode 100644 deployment/overlays/prometheus/kustomization.yaml create mode 100644 deployment/overlays/prometheus/monitor.yaml create mode 100644 docs/deployment/metrics.md create mode 100644 pkg/nfd-master/metrics.go create mode 100644 
pkg/nfd-worker/metrics.go diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index 7bbd5a9063..277747bd44 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -124,6 +124,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) "Enable NFD CRD API controller for processing NodeFeature and NodeFeatureRule objects.") flagset.IntVar(&args.Port, "port", 8080, "Port on which to listen for connections.") + flagset.IntVar(&args.MetricsPort, "metrics", 8081, + "Port on which to expose metrics.") flagset.BoolVar(&args.Prune, "prune", false, "Prune all NFD related attributes from all nodes of the cluster and exit.") flagset.BoolVar(&args.VerifyNodeName, "verify-node-name", false, diff --git a/cmd/nfd-worker/main.go b/cmd/nfd-worker/main.go index 02866257d9..351260c9ea 100644 --- a/cmd/nfd-worker/main.go +++ b/cmd/nfd-worker/main.go @@ -109,6 +109,8 @@ func initFlags(flagset *flag.FlagSet) (*worker.Args, *worker.ConfigOverrideArgs) "Kubeconfig to use") flagset.BoolVar(&args.Oneshot, "oneshot", false, "Do not publish feature labels") + flagset.IntVar(&args.MetricsPort, "metrics", 8081, + "Port on which to expose metrics.") flagset.StringVar(&args.Options, "options", "", "Specify config options from command line. Config options are specified "+ "in the same format as in the config file (i.e. json or yaml). 
These options") diff --git a/deployment/base/master/master-deployment.yaml b/deployment/base/master/master-deployment.yaml index bcf0b38405..b1042b789f 100644 --- a/deployment/base/master/master-deployment.yaml +++ b/deployment/base/master/master-deployment.yaml @@ -35,5 +35,7 @@ spec: command: - "nfd-master" ports: + - name: metrics + containerPort: 8081 - name: grpc containerPort: 8080 diff --git a/deployment/base/worker-daemonset/worker-daemonset.yaml b/deployment/base/worker-daemonset/worker-daemonset.yaml index f93bf175f7..2132498ce8 100644 --- a/deployment/base/worker-daemonset/worker-daemonset.yaml +++ b/deployment/base/worker-daemonset/worker-daemonset.yaml @@ -23,3 +23,6 @@ spec: - "nfd-worker" args: - "-server=nfd-master:8080" + ports: + - name: metrics + containerPort: 8081 diff --git a/deployment/helm/node-feature-discovery/templates/master.yaml b/deployment/helm/node-feature-discovery/templates/master.yaml index 56fbab6bfb..d129a4b641 100644 --- a/deployment/helm/node-feature-discovery/templates/master.yaml +++ b/deployment/helm/node-feature-discovery/templates/master.yaml @@ -66,6 +66,8 @@ spec: ports: - containerPort: {{ .Values.master.port | default "8080" }} name: grpc + - containerPort: {{ .Values.master.metricsPort | default "8081" }} + name: metrics env: - name: NODE_NAME valueFrom: @@ -118,6 +120,7 @@ spec: - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" {{- end }} + - "-metrics={{ .Values.master.metricsPort | default "8081" }}" volumeMounts: {{- if .Values.tls.enable }} - name: nfd-master-cert @@ -139,7 +142,6 @@ spec: items: - key: nfd-master.conf path: nfd-master.conf - {{- with .Values.master.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} diff --git a/deployment/helm/node-feature-discovery/templates/prometheus.yaml b/deployment/helm/node-feature-discovery/templates/prometheus.yaml new file mode 100644 index 0000000000..77f09db84b --- /dev/null +++ b/deployment/helm/node-feature-discovery/templates/prometheus.yaml @@ -0,0 +1,23 @@ +{{- if .Values.prometheus.enable }} +# Prometheus Monitor Service (Metrics) +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ include "node-feature-discovery.fullname" . }} + labels: + {{- include "node-feature-discovery.selectorLabels" . | nindent 4 }} +spec: + podMetricsEndpoints: + - honorLabels: true + interval: 10s + path: /metrics + port: metrics + scheme: http + namespaceSelector: + matchNames: + - {{ include "node-feature-discovery.namespace" . }} + selector: + matchExpressions: + - {key: app.kubernetes.io/instance, operator: In, values: ["{{ .Release.Name }}"]} + - {key: app.kubernetes.io/name, operator: In, values: ["{{ include "node-feature-discovery.name" . 
}}"]} +{{- end }} diff --git a/deployment/helm/node-feature-discovery/templates/worker.yaml b/deployment/helm/node-feature-discovery/templates/worker.yaml index c1240bdc93..35eb0ec2a1 100644 --- a/deployment/helm/node-feature-discovery/templates/worker.yaml +++ b/deployment/helm/node-feature-discovery/templates/worker.yaml @@ -54,6 +54,10 @@ spec: - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" {{- end }} + - "-metrics={{ .Values.worker.metricsPort | default "8081"}}" + ports: + - name: metrics + containerPort: {{ .Values.worker.metricsPort | default "8081"}} volumeMounts: - name: host-boot mountPath: "/host-boot" diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index eb65b03cbc..6b59ff1b84 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -31,6 +31,7 @@ master: ### # The TCP port that nfd-master listens for incoming requests. 
Default: 8080 port: 8080 + metricsPort: 8081 instance: featureApi: resyncPeriod: @@ -343,6 +344,7 @@ worker: # ### + metricsPort: 8081 daemonsetAnnotations: {} podSecurityContext: {} # fsGroup: 2000 @@ -493,3 +495,6 @@ topologyGC: tls: enable: false certManager: false + +prometheus: + enable: false diff --git a/deployment/overlays/prometheus/kustomization.yaml b/deployment/overlays/prometheus/kustomization.yaml new file mode 100644 index 0000000000..24b885f606 --- /dev/null +++ b/deployment/overlays/prometheus/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: node-feature-discovery + +resources: +- monitor.yaml diff --git a/deployment/overlays/prometheus/monitor.yaml b/deployment/overlays/prometheus/monitor.yaml new file mode 100644 index 0000000000..c62a108eaf --- /dev/null +++ b/deployment/overlays/prometheus/monitor.yaml @@ -0,0 +1,20 @@ +# Prometheus Monitor Service (Metrics) +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: nfd-metrics + labels: + app: nfd +spec: + podMetricsEndpoints: + - honorLabels: true + interval: 10s + path: /metrics + port: metrics + scheme: http + namespaceSelector: + matchNames: + - node-feature-discovery + selector: + matchExpressions: + - {key: app, operator: In, values: ["nfd-master", "nfd-worker"]} diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md index 341211dad3..1cc7a192a9 100644 --- a/docs/deployment/helm.md +++ b/docs/deployment/helm.md @@ -102,6 +102,11 @@ We have introduced the following Chart parameters. 
| `tls.enable` | bool | false | Specifies whether to use TLS for communications between components | | `tls.certManager` | bool | false | If enabled, requires [cert-manager](https://cert-manager.io/docs/) to be installed and will automatically create the required TLS certificates | | `enableNodeFeatureApi` | bool | false | Enable the [NodeFeature](../usage/custom-resources.md#nodefeature) CRD API for communicating node features. This will automatically disable the gRPC communication. +| `prometheus.enable` | bool | false | Specifies whether to create a PodMonitor so that metrics are scraped via prometheus operator | + +Scraping of metrics via the prometheus operator APIs is disabled by +default. To enable it, set `prometheus.enable=true` and make sure the +prometheus operator is installed in your cluster. ### Master pod parameters @@ -109,6 +114,7 @@ We have introduced the following Chart parameters. |-----------------------------|---------|-----------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------| | `master.*` | dict | | NFD master deployment configuration | | `master.port` | integer | | Specifies the TCP port that nfd-master listens for incoming requests. | +| `master.metricsPort` | integer | 8081 | Port on which to expose metrics from components to prometheus operator | | `master.instance` | string | | Instance name. Used to separate annotation namespaces for multiple parallel deployments | | `master.resyncPeriod` | string | | NFD API controller resync period. | | `master.extraLabelNs` | array | [] | List of allowed extra label namespaces | @@ -139,6 +145,7 @@ We have introduced the following Chart parameters. 
| Name | Type | Default | description | | ---- | ---- | ------- | ----------- | | `worker.*` | dict | | NFD worker daemonset configuration | +| `worker.metricsPort` | integer | 8081 | Port on which to expose metrics from components to prometheus operator | | `worker.config` | dict | | NFD worker [configuration](../reference/worker-configuration-reference) | | `worker.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings | | `worker.securityContext` | dict | {} | Container [security settings](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container) | diff --git a/docs/deployment/kustomize.md b/docs/deployment/kustomize.md index 6e5abccefc..127b747c02 100644 --- a/docs/deployment/kustomize.md +++ b/docs/deployment/kustomize.md @@ -57,6 +57,8 @@ scenarios under see [Master Worker Topologyupdater](#master-worker-topologyupdater) below - [`topologyupdater`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/topologyupdater): see [Topology Updater](#topologyupdater) below +- [`prometheus`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prometheus): + see [Metrics](#metrics) below - [`prune`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prune): clean up the cluster after uninstallation, see [Removing feature labels](uninstallation.md#removing-feature-labels) @@ -137,6 +139,17 @@ kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deplo ``` +### Metrics + +To allow [prometheus operator][prometheus-operator] +to scrape metrics from node-feature-discovery, +run the following commands: + +```bash +kubectl apply -k 
https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref={{ site.release }} +kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/prometheus?ref={{ site.release }} +``` + ## Uninstallation Simplest way is to invoke `kubectl delete` on the overlay that was used for @@ -162,3 +175,4 @@ kubectl delete clusterrolebinding nfd-master [kustomize]: https://github.com/kubernetes-sigs/kustomize +[prometheus-operator]: https://github.com/prometheus-operator/prometheus-operator diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md new file mode 100644 index 0000000000..d18fd19307 --- /dev/null +++ b/docs/deployment/metrics.md @@ -0,0 +1,43 @@ +--- +title: "Metrics" +layout: default +sort: 7 +--- + +# Metrics + +NFD metrics can be scraped using the [prometheus operator](https://github.com/prometheus-operator/prometheus-operator) +APIs. If you want to collect metrics using the prometheus operator +APIs you need to install the prometheus operator in your cluster. +By default NFD Master and Worker expose metrics on port 8081. + +The exposed metrics are: + +| Metric | Type | Meaning | +| ---------------------------------- | ------- | ---------------- | +| `nfd_master_build_info` | Gauge | Version from which nfd-master was built. | +| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built. | +| `nfd_updated_nodes` | Counter | Number of nodes updated by the master | +| `nfd_crd_processing_time` | Gauge | Time taken to process a NodeFeatureRule CRD | +| `nfd_feature_discovery_duration_seconds` | HistogramVec | Time taken to discover features on a node | + +## Via Kustomize + +To deploy NFD with metrics enabled using kustomize, you can use the +[Metrics Overlay](kustomize.md#metrics). + +## Via Helm + +By default metrics are enabled when deploying NFD via Helm. 
To enable Prometheus +to scrape metrics from NFD, you need to pass the following values to Helm: + +```bash +--set prometheus.enable=true +``` + +For more info on Helm deployment, see [Helm](helm.md). + +We recommend setting +`--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false` +when deploying prometheus-operator via Helm to enable the prometheus-operator +to scrape metrics from any PodMonitor. diff --git a/go.mod b/go.mod index ef5221688a..fb5ea32e10 100644 --- a/go.mod +++ b/go.mod @@ -15,6 +15,7 @@ require ( github.com/onsi/ginkgo/v2 v2.9.1 github.com/onsi/gomega v1.27.4 github.com/opencontainers/runc v1.1.6 + github.com/prometheus/client_golang v1.15.1 github.com/smartystreets/assertions v1.2.0 github.com/smartystreets/goconvey v1.6.4 github.com/stretchr/testify v1.8.1 @@ -62,7 +63,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cenkalti/backoff/v4 v4.1.3 // indirect - github.com/cespare/xxhash/v2 v2.1.2 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect github.com/cilium/ebpf v0.10.0 // indirect github.com/container-storage-interface/spec v1.7.0 // indirect @@ -111,7 +112,7 @@ require ( github.com/libopenstorage/openstorage v1.0.0 // indirect github.com/lithammer/dedent v1.1.0 // indirect github.com/mailru/easyjson v0.7.7 // indirect - github.com/matttproud/golang_protobuf_extensions v1.0.2 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/mitchellh/mapstructure v1.4.1 // indirect @@ -129,10 +130,9 @@ require ( github.com/opencontainers/selinux v1.11.0 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_golang v1.14.0 // indirect 
github.com/prometheus/client_model v0.3.0 // indirect - github.com/prometheus/common v0.37.0 // indirect - github.com/prometheus/procfs v0.8.0 // indirect + github.com/prometheus/common v0.42.0 // indirect + github.com/prometheus/procfs v0.9.0 // indirect github.com/rubiojr/go-vhd v0.0.0-20200706105327-02e210299021 // indirect github.com/seccomp/libseccomp-golang v0.10.0 // indirect github.com/sirupsen/logrus v1.9.0 // indirect @@ -161,14 +161,14 @@ require ( go.opentelemetry.io/proto/otlp v0.19.0 // indirect go.uber.org/atomic v1.7.0 // indirect go.uber.org/multierr v1.6.0 // indirect - go.uber.org/zap v1.19.0 // indirect + go.uber.org/zap v1.24.0 // indirect golang.org/x/crypto v0.1.0 // indirect - golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect + golang.org/x/oauth2 v0.5.0 // indirect golang.org/x/sync v0.1.0 // indirect golang.org/x/sys v0.7.0 // indirect golang.org/x/term v0.7.0 // indirect golang.org/x/text v0.9.0 // indirect - golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect + golang.org/x/time v0.3.0 // indirect golang.org/x/tools v0.7.0 // indirect google.golang.org/api v0.60.0 // indirect google.golang.org/appengine v1.6.7 // indirect diff --git a/go.sum b/go.sum index 50b6f34f89..41106d81ea 100644 --- a/go.sum +++ b/go.sum @@ -108,7 +108,6 @@ github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4 github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/aws/aws-sdk-go v1.35.24/go.mod h1:tlPOdRjfxPBpNIwqDj61rmsnA85v9jc0Ps9+muhnW+k= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= -github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= 
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -123,8 +122,9 @@ github.com/cenkalti/backoff/v4 v4.1.3/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInq github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash v1.1.0/go.mod h1:XrSqR1VqqWfGrhpAt58auRo0WTKS1nRRg3ghfAqPWnc= github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= -github.com/cespare/xxhash/v2 v2.1.2 h1:YRXhKfTDauu4ajMg1TPgFO5jnlC2HCbmLXMcTG5cbYE= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chai2010/gettext-go v1.0.2 h1:1Lwwip6Q2QGsAdl/ZKPCwTe9fe0CjlUbqj5bFNSjIRk= github.com/checkpoint-restore/go-criu/v5 v5.3.0 h1:wpFFOoomK3389ue2lAb0Boag6XPht5QYpipxmSNL4d8= github.com/checkpoint-restore/go-criu/v5 v5.3.0/go.mod h1:E/eQpaFtUKGOOSEBZgmKAcn+zUUwWxqcaKZlF54wK8E= @@ -451,8 +451,9 @@ github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czP github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= -github.com/matttproud/golang_protobuf_extensions v1.0.2 h1:hAHbPm5IJGijwng3PWk09JkG9WeqChjprR5s9bBZ+OM= github.com/matttproud/golang_protobuf_extensions v1.0.2/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= +github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= +github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/mistifyio/go-zfs 
v2.1.2-0.20190413222219-f784269be439+incompatible h1:aKW/4cBs+yK6gpqU3K/oIwk9Q/XICqd3zOX/UFuvqmk= github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible/go.mod h1:8AuVvqP/mXw1px98n46wfvcGfQ4ci2FwoAjKYxuo3Z4= github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= @@ -553,8 +554,9 @@ github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5Fsn github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= -github.com/prometheus/client_golang v1.14.0 h1:nJdhIvne2eSX/XRAFV9PcvFFRbrjbcTUj0VP62TMhnw= github.com/prometheus/client_golang v1.14.0/go.mod h1:8vpkKitgIVNcqrRBWh1C4TIUQgYNtG/XQE4E/Zae36Y= +github.com/prometheus/client_golang v1.15.1 h1:8tXpTmJbyH5lydzFPoxSIJ0J46jdh3tylbvM1xCv0LI= +github.com/prometheus/client_golang v1.15.1/go.mod h1:e9yaBhRPU2pPNsZwE+JdQl0KEt1N9XgF6zxWmaC0xOk= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= @@ -567,16 +569,18 @@ github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y8 github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= -github.com/prometheus/common v0.37.0 h1:ccBbHCgIiT9uSoFY0vX8H3zsNR5eLt17/RQLUvn8pXE= github.com/prometheus/common v0.37.0/go.mod 
h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= +github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= +github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.0-20190507164030-5867b95ac084/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= -github.com/prometheus/procfs v0.8.0 h1:ODq8ZFEaYeCaZOJlZZdJA2AbQR98dSHSM1KW/You5mo= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= +github.com/prometheus/procfs v0.9.0 h1:wzCHvIvM5SxWqYvwgVL7yJY8Lz3PKn49KQtpgMYJfhI= +github.com/prometheus/procfs v0.9.0/go.mod h1:+pB4zwohETzFnmlpe6yd2lSc+0/46IYZRB/chUwxUZY= github.com/prometheus/tsdb v0.7.1/go.mod h1:qhTCs0VvXwvX/y3TZrWD7rabWM+ijKTux40TwIPHuXU= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= @@ -714,14 +718,13 @@ go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 h1:+FNtrFTmVw0YZGpBGX56XDee33 go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= -go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A= go.uber.org/multierr 
v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= -go.uber.org/zap v1.19.0 h1:mZQZefskPPCMIBCSEH0v2/iUqqLrYtaeqwD6FUGUnFE= -go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= +go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= +go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -852,8 +855,9 @@ golang.org/x/oauth2 v0.0.0-20210805134026-6f1e6394065a/go.mod h1:KelEdhl1UZF7XfJ golang.org/x/oauth2 v0.0.0-20210819190943-2bc19b11175f/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20211005180243-6b3c2da341f1/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20211104180415-d3ed0bb246c8/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= -golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b h1:clP8eMhB30EHdc0bd2Twtq6kgU7yl5ub2cQLSdrv1Dg= golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= +golang.org/x/oauth2 v0.5.0 h1:HuArIo48skDwlrvM3sEdHXElYslAMsf3KwRkkW4MC4s= +golang.org/x/oauth2 v0.5.0/go.mod h1:9/XBHVqLaWO3/BRHs5jbpYCnOZVjj5V0ndyaAM7KB4I= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync 
v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -984,8 +988,9 @@ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44= golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -1003,7 +1008,6 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools 
v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= @@ -1235,7 +1239,6 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go new file mode 100644 index 0000000000..cabe99fa5d --- /dev/null +++ b/pkg/nfd-master/metrics.go @@ -0,0 +1,82 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package nfdmaster + +import ( + "fmt" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/klog/v2" + "sigs.k8s.io/node-feature-discovery/pkg/version" +) + +// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names +const ( + buildInfoQuery = "nfd_master_build_info" + updatedNodesQuery = "nfd_updated_nodes" + crdProcessingTimeQuery = "nfd_crd_processing_time" +) + +var ( + srv *http.Server + + buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: buildInfoQuery, + Help: "Version from which Node Feature Discovery was built.", + ConstLabels: map[string]string{ + "version": version.Get(), + }, + }) + updatedNodes = prometheus.NewCounter(prometheus.CounterOpts{ + Name: updatedNodesQuery, + Help: "Number of nodes updated by the master.", + }) + crdProcessingTime = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: crdProcessingTimeQuery, + Help: "Time spent processing the NodeFeatureRule CRD.", + }) +) + +// registerVersion marks the nfd-master build info metric as present by setting it to 1; the version string itself is carried by the gauge's "version" const label, so the argument is not used. 
+func registerVersion(version string) { + buildInfo.Set(1) +} + +// runMetricsServer serves the nfd-master metrics under /metrics on the given port; it returns only after ListenAndServe has returned. +func runMetricsServer(port int) { + r := prometheus.NewRegistry() + r.MustRegister(buildInfo, + updatedNodes, + crdProcessingTime) + + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{})) + + klog.InfoS("metrics server starting", "port", port) + srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux} + klog.InfoS("metrics server stopped", "exitCode", srv.ListenAndServe()) +} + +// stopMetricsServer closes the metrics server if one has been created by runMetricsServer. +func stopMetricsServer() { + if srv != nil { + klog.InfoS("stopping metrics server", "port", srv.Addr) + srv.Close() + } +} diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index 078bbe1804..541781f5d2 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -114,6 +114,7 @@ type Args struct { VerifyNodeName bool Options string EnableLeaderElection bool + MetricsPort int Overrides ConfigOverrideArgs } @@ -241,6 +242,14 @@ func (m *nfdMaster) Run() error { return fmt.Errorf("failed to update master node: %v", err) } } + + // Register to metrics server + if m.args.MetricsPort > 0 { + go runMetricsServer(m.args.MetricsPort) + registerVersion(version.Get()) + defer stopMetricsServer() + } + // Run gRPC server grpcErr := make(chan error, 1) go m.runGrpcServer(grpcErr) @@ -841,6 +850,7 @@ func (m *nfdMaster) refreshNodeFeatures(cli *kubernetes.Clientset, nodeName stri return err } + updatedNodes.Inc() return nil } @@ -964,6 +974,7 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha } // Process all rule CRs + processStart := time.Now() for _, spec := range ruleSpecs { switch { case klog.V(3).Enabled(): @@ -990,6 +1001,9 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha features.InsertAttributeFeatures(nfdv1alpha1.RuleBackrefDomain, 
nfdv1alpha1.RuleBackrefFeature, ruleOut.Vars) } } + processingTime := time.Since(processStart) + crdProcessingTime.Set(float64(processingTime)) + klog.V(2).InfoS("processed NodeFeatureRule objects", "nodeName", nodeName, "objectCount", len(ruleSpecs), "duration", processingTime) return labels, extendedResources, taints } diff --git a/pkg/nfd-worker/metrics.go b/pkg/nfd-worker/metrics.go new file mode 100644 index 0000000000..ee2e5b3638 --- /dev/null +++ b/pkg/nfd-worker/metrics.go @@ -0,0 +1,80 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package nfdworker + +import ( + "fmt" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/klog/v2" + "sigs.k8s.io/node-feature-discovery/pkg/version" +) + +// Names of the metrics exported by nfd-worker. +// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names +const ( + buildInfoQuery = "nfd_worker_build_info" + featureDiscoveryDurationQuery = "nfd_feature_discovery_duration_seconds" +) + +var ( + srv *http.Server + + featureDiscoveryDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: featureDiscoveryDurationQuery, + Help: "Time taken to discover features", + }, + []string{"NodeName"}, + ) + buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: buildInfoQuery, + Help: "Version from which Node Feature Discovery was built.", + ConstLabels: map[string]string{ + "version": version.Get(), + }, + }) +) + +// registerVersion sets the build info gauge to the current time; the version argument is currently unused. 
+func registerVersion(version string) { + buildInfo.SetToCurrentTime() +} + +// runMetricsServer serves the nfd-worker metrics at /metrics on the given port and blocks until the server exits. +func runMetricsServer(port int) { + r := prometheus.NewRegistry() + r.MustRegister(featureDiscoveryDuration) + r.MustRegister(buildInfo) + + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{})) + + klog.InfoS("metrics server starting", "port", port) + srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux} + klog.InfoS("metrics server stopped", "exitCode", srv.ListenAndServe()) +} + +// stopMetricsServer closes the server created by runMetricsServer; it does nothing while srv is still nil. +func stopMetricsServer() { + if srv != nil { + klog.InfoS("stopping metrics server", "port", srv.Addr) + srv.Close() + } +} diff --git a/pkg/nfd-worker/nfd-worker.go b/pkg/nfd-worker/nfd-worker.go index 5fe07370ac..6215b71c7b 100644 --- a/pkg/nfd-worker/nfd-worker.go +++ b/pkg/nfd-worker/nfd-worker.go @@ -101,6 +101,7 @@ type Args struct { Options string Server string ServerNameOverride string + MetricsPort int Overrides ConfigOverrideArgs } @@ -197,10 +198,10 @@ func (w *nfdWorker) runFeatureDiscovery() error { discoveryDuration := time.Since(discoveryStart) klog.V(2).InfoS("feature discovery of all sources completed", "duration", discoveryDuration) + featureDiscoveryDuration.WithLabelValues(utils.NodeName()).Observe(discoveryDuration.Seconds()) if w.config.Core.SleepInterval.Duration > 0 && discoveryDuration > w.config.Core.SleepInterval.Duration/2 { klog.InfoS("feature discovery sources took over half of sleep interval ", "duration", discoveryDuration, "sleepInterval", w.config.Core.SleepInterval.Duration) } - // Get the set of feature labels. 
labels := createFeatureLabels(w.labelSources, w.config.Core.LabelWhiteList.Regexp) @@ -239,6 +240,13 @@ func (w *nfdWorker) Run() error { labelTrigger.Reset(w.config.Core.SleepInterval.Duration) defer labelTrigger.Stop() + // Register to metrics server + if w.args.MetricsPort > 0 { + go runMetricsServer(w.args.MetricsPort) + registerVersion(version.Get()) + defer stopMetricsServer() + } + err = w.runFeatureDiscovery() if err != nil { return err