From e2fd000efe24e7830fd741d5b7920ded74478c42 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Tue, 6 Jun 2023 16:39:02 +0200 Subject: [PATCH] Enable metrics via prometheus operator Expose metrics via prometheus.monitoring.coreos.com/v1 The exposed metrics are: | Metric | Type | Meaning | | --------------- | ---------------- | ---------------- | | `nfd_master_build_info` | Gauge | Version from which nfd-master was built. | | `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built. | | `nfd_updated_nodes` | Gauge | Number of nodes updated by the master | | `nfd_crd_processing_time` | Gauge | Time taken to process a NodeFeatureRule CRD | | `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node | Signed-off-by: Carlos Eduardo Arango Gutierrez --- cmd/nfd-master/main.go | 2 + cmd/nfd-worker/main.go | 2 + deployment/base/master/master-deployment.yaml | 5 ++ .../worker-daemonset/worker-daemonset.yaml | 4 + .../components/prometheus/kustomization.yaml | 8 ++ deployment/components/prometheus/monitor.yaml | 20 +++++ deployment/components/prometheus/role.yaml | 49 +++++++++++ .../templates/master.yaml | 9 +- .../templates/prometheus.yaml | 73 ++++++++++++++++ .../templates/worker.yaml | 8 ++ .../helm/node-feature-discovery/values.yaml | 4 + .../overlays/prometheus/kustomization.yaml | 10 +++ deployment/overlays/prometheus/namespace.yaml | 4 + docs/deployment/helm.md | 2 + docs/deployment/kustomize.md | 25 ++++++ go.mod | 6 +- go.sum | 11 +-- pkg/nfd-master/metrics.go | 83 +++++++++++++++++++ pkg/nfd-master/nfd-master.go | 21 +++++ pkg/nfd-worker/metrics.go | 81 ++++++++++++++++++ pkg/nfd-worker/nfd-worker.go | 8 +- 21 files changed, 423 insertions(+), 12 deletions(-) create mode 100644 deployment/components/prometheus/kustomization.yaml create mode 100644 deployment/components/prometheus/monitor.yaml create mode 100644 deployment/components/prometheus/role.yaml create mode 100644 
deployment/helm/node-feature-discovery/templates/prometheus.yaml create mode 100644 deployment/overlays/prometheus/kustomization.yaml create mode 100644 deployment/overlays/prometheus/namespace.yaml create mode 100644 pkg/nfd-master/metrics.go create mode 100644 pkg/nfd-worker/metrics.go diff --git a/cmd/nfd-master/main.go b/cmd/nfd-master/main.go index 949a31a585..238bace773 100644 --- a/cmd/nfd-master/main.go +++ b/cmd/nfd-master/main.go @@ -124,6 +124,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs) "Enable NFD CRD API controller for processing NodeFeature and NodeFeatureRule objects.") flagset.IntVar(&args.Port, "port", 8080, "Port on which to listen for connections.") + flagset.IntVar(&args.MetricsPort, "metrics", 8081, + "Port on which to expose metrics.") flagset.BoolVar(&args.Prune, "prune", false, "Prune all NFD related attributes from all nodes of the cluaster and exit.") flagset.BoolVar(&args.VerifyNodeName, "verify-node-name", false, diff --git a/cmd/nfd-worker/main.go b/cmd/nfd-worker/main.go index 02866257d9..351260c9ea 100644 --- a/cmd/nfd-worker/main.go +++ b/cmd/nfd-worker/main.go @@ -109,6 +109,8 @@ func initFlags(flagset *flag.FlagSet) (*worker.Args, *worker.ConfigOverrideArgs) "Kubeconfig to use") flagset.BoolVar(&args.Oneshot, "oneshot", false, "Do not publish feature labels") + flagset.IntVar(&args.MetricsPort, "metrics", 8081, + "Port on which to expose metrics.") flagset.StringVar(&args.Options, "options", "", "Specify config options from command line. Config options are specified "+ "in the same format as in the config file (i.e. json or yaml). 
These options") diff --git a/deployment/base/master/master-deployment.yaml b/deployment/base/master/master-deployment.yaml index e52c91a56d..72e3a4d5d1 100644 --- a/deployment/base/master/master-deployment.yaml +++ b/deployment/base/master/master-deployment.yaml @@ -34,3 +34,8 @@ spec: failureThreshold: 10 command: - "nfd-master" + args: + - "-enable-nodefeature-api" + ports: + - name: metrics + containerPort: 8081 diff --git a/deployment/base/worker-daemonset/worker-daemonset.yaml b/deployment/base/worker-daemonset/worker-daemonset.yaml index f93bf175f7..9d00ba159f 100644 --- a/deployment/base/worker-daemonset/worker-daemonset.yaml +++ b/deployment/base/worker-daemonset/worker-daemonset.yaml @@ -23,3 +23,7 @@ spec: - "nfd-worker" args: - "-server=nfd-master:8080" + - "-enable-nodefeature-api" + ports: + - name: metrics + containerPort: 8081 diff --git a/deployment/components/prometheus/kustomization.yaml b/deployment/components/prometheus/kustomization.yaml new file mode 100644 index 0000000000..43608beb59 --- /dev/null +++ b/deployment/components/prometheus/kustomization.yaml @@ -0,0 +1,8 @@ +apiVersion: kustomize.config.k8s.io/v1alpha1 +kind: Component + +namespace: node-feature-discovery + +resources: +- monitor.yaml +- role.yaml diff --git a/deployment/components/prometheus/monitor.yaml b/deployment/components/prometheus/monitor.yaml new file mode 100644 index 0000000000..17c0638d67 --- /dev/null +++ b/deployment/components/prometheus/monitor.yaml @@ -0,0 +1,20 @@ +# Prometheus Monitor Service (Metrics) +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: nfd-metrics + labels: + app: nfd +spec: + podMetricsEndpoints: + - honorLabels: true + interval: 10s + path: /metrics + targetPort: 8081 + scheme: http + namespaceSelector: + matchNames: + - node-feature-discovery + selector: + matchExpressions: + - {key: app, operator: Exists} diff --git a/deployment/components/prometheus/role.yaml b/deployment/components/prometheus/role.yaml new file 
mode 100644 index 0000000000..963ee6da26 --- /dev/null +++ b/deployment/components/prometheus/role.yaml @@ -0,0 +1,49 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: prometheus-k8s + namespace: node-feature-discovery +rules: +- apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: node-feature-discovery +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +- kind: ServiceAccount + name: prometheus-operator + namespace: monitoring diff --git a/deployment/helm/node-feature-discovery/templates/master.yaml b/deployment/helm/node-feature-discovery/templates/master.yaml index e40b31ab08..bbc9846205 100644 --- a/deployment/helm/node-feature-discovery/templates/master.yaml +++ b/deployment/helm/node-feature-discovery/templates/master.yaml @@ -118,6 +118,14 @@ spec: - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" {{- end }} + {{- if .Values.metrics.enable }} + - "-metrics={{ .Values.metrics.port }}" + {{- end }} + {{- if .Values.metrics.enable }} + ports: + - name: metrics + containerPort: {{ .Values.metrics.port }} + {{- end }} volumeMounts: {{- if .Values.tls.enable }} - name: nfd-master-cert @@ -139,7 +147,6 @@ spec: items: - key: nfd-master.conf path: nfd-master.conf - {{- with .Values.master.nodeSelector }} nodeSelector: {{- toYaml . 
| nindent 8 }} diff --git a/deployment/helm/node-feature-discovery/templates/prometheus.yaml b/deployment/helm/node-feature-discovery/templates/prometheus.yaml new file mode 100644 index 0000000000..179a3da632 --- /dev/null +++ b/deployment/helm/node-feature-discovery/templates/prometheus.yaml @@ -0,0 +1,73 @@ +{{- if .Values.metrics.enable }} +# Prometheus Monitor Service (Metrics) +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: nfd-metrics + labels: + app: nfd +spec: + podMetricsEndpoints: + - honorLabels: true + interval: 10s + path: /metrics + targetPort: {{ .Values.metrics.port }} + scheme: http + namespaceSelector: + matchNames: + - node-feature-discovery + selector: + matchExpressions: + - {key: app, operator: Exists} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: prometheus-k8s + namespace: node-feature-discovery +rules: +- apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiGroups: + - extensions + resources: + - ingresses + verbs: + - get + - list + - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: node-feature-discovery +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +- kind: ServiceAccount + name: prometheus-operator + namespace: monitoring + +{{- end }} diff --git a/deployment/helm/node-feature-discovery/templates/worker.yaml b/deployment/helm/node-feature-discovery/templates/worker.yaml index c1240bdc93..f5d1bf153b 100644 --- a/deployment/helm/node-feature-discovery/templates/worker.yaml +++ b/deployment/helm/node-feature-discovery/templates/worker.yaml @@ -54,6 +54,14 @@ spec: - "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key" - 
"-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt" {{- end }} + {{- if .Values.metrics.enable }} + - "-metrics={{ .Values.metrics.port }}" + {{- end }} + {{- if .Values.metrics.enable }} + ports: + - name: metrics + containerPort: {{ .Values.metrics.port }} + {{- end }} volumeMounts: - name: host-boot mountPath: "/host-boot" diff --git a/deployment/helm/node-feature-discovery/values.yaml b/deployment/helm/node-feature-discovery/values.yaml index d2f5c965e7..6f1787cf43 100644 --- a/deployment/helm/node-feature-discovery/values.yaml +++ b/deployment/helm/node-feature-discovery/values.yaml @@ -493,3 +493,7 @@ topologyGC: tls: enable: false certManager: false + +metrics: + enable: false + port: 8081 diff --git a/deployment/overlays/prometheus/kustomization.yaml b/deployment/overlays/prometheus/kustomization.yaml new file mode 100644 index 0000000000..272b34ff76 --- /dev/null +++ b/deployment/overlays/prometheus/kustomization.yaml @@ -0,0 +1,10 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: node-feature-discovery + +resources: +- namespace.yaml + +components: +- ../../components/prometheus diff --git a/deployment/overlays/prometheus/namespace.yaml b/deployment/overlays/prometheus/namespace.yaml new file mode 100644 index 0000000000..8e54de2e34 --- /dev/null +++ b/deployment/overlays/prometheus/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: node-feature-discovery diff --git a/docs/deployment/helm.md b/docs/deployment/helm.md index 341211dad3..8b64a2460b 100644 --- a/docs/deployment/helm.md +++ b/docs/deployment/helm.md @@ -102,6 +102,8 @@ We have introduced the following Chart parameters. 
| `tls.enable` | bool | false | Specifies whether to use TLS for communications between components | | `tls.certManager` | bool | false | If enabled, requires [cert-manager](https://cert-manager.io/docs/) to be installed and will automatically create the required TLS certificates | | `enableNodeFeatureApi` | bool | false | Enable the [NodeFeature](../usage/custom-resources.md#nodefeature) CRD API for communicating node features. This will automatically disable the gRPC communication. +| `metrics.enable` | bool | false | Specifies whether to expose metrics using prometheus | +| `metrics.port` | integer | 8081 | Port on which to expose metrics from components | ### Master pod parameters diff --git a/docs/deployment/kustomize.md b/docs/deployment/kustomize.md index 6e5abccefc..c499bb5017 100644 --- a/docs/deployment/kustomize.md +++ b/docs/deployment/kustomize.md @@ -57,6 +57,8 @@ scenarios under see [Master Worker Topologyupdater](#master-worker-topologyupdater) below - [`topologyupdater`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/topologyupdater): see [Topology Updater](#topologyupdater) below +- [`prometheus`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prometheus): + see [Metrics](#metrics) below - [`prune`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prune): clean up the cluster after uninstallation, see [Removing feature labels](uninstallation.md#removing-feature-labels) @@ -137,6 +139,28 @@ kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deplo ``` +# Metrics + +To add metrics scraping with prometheus-operator, +that is, to allow [prometheus-operator][prometheus-operator] +to scrape metrics from node-feature-discovery, +run the following commands: + +```bash +kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref={{ site.release }} 
+kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/prometheus?ref={{ site.release }} +``` + +The exposed metrics are: + +| Metric | Type | Meaning | +| --------------- | ---------------- | ---------------- | +| `nfd_master_build_info` | Gauge | Version from which nfd-master was built. | +| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built. | +| `nfd_updated_nodes` | Gauge | Number of nodes updated by the master | +| `nfd_crd_processing_time` | Gauge | Time taken to process a NodeFeatureRule CRD | +| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node | + ## Uninstallation Simplest way is to invoke `kubectl delete` on the overlay that was used for @@ -162,3 +186,4 @@ kubectl delete clusterrolebinding nfd-master [kustomize]: https://github.com/kubernetes-sigs/kustomize +[prometheus-operator]: https://github.com/prometheus-operator/prometheus-operator diff --git a/go.mod b/go.mod index ef5221688a..c998627fe9 100644 --- a/go.mod +++ b/go.mod @@ -15,6 +15,7 @@ require ( github.com/onsi/ginkgo/v2 v2.9.1 github.com/onsi/gomega v1.27.4 github.com/opencontainers/runc v1.1.6 + github.com/prometheus/client_golang v1.14.0 github.com/smartystreets/assertions v1.2.0 github.com/smartystreets/goconvey v1.6.4 github.com/stretchr/testify v1.8.1 @@ -129,7 +130,6 @@ require ( github.com/opencontainers/selinux v1.11.0 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/prometheus/client_golang v1.14.0 // indirect github.com/prometheus/client_model v0.3.0 // indirect github.com/prometheus/common v0.37.0 // indirect github.com/prometheus/procfs v0.8.0 // indirect @@ -161,14 +161,14 @@ require ( go.opentelemetry.io/proto/otlp v0.19.0 // indirect go.uber.org/atomic v1.7.0 // indirect go.uber.org/multierr v1.6.0 // indirect - go.uber.org/zap v1.19.0 // indirect + go.uber.org/zap v1.24.0 // indirect golang.org/x/crypto v0.1.0 // 
indirect golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect golang.org/x/sync v0.1.0 // indirect golang.org/x/sys v0.7.0 // indirect golang.org/x/term v0.7.0 // indirect golang.org/x/text v0.9.0 // indirect - golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect + golang.org/x/time v0.3.0 // indirect golang.org/x/tools v0.7.0 // indirect google.golang.org/api v0.60.0 // indirect google.golang.org/appengine v1.6.7 // indirect diff --git a/go.sum b/go.sum index 50b6f34f89..e46cfa4948 100644 --- a/go.sum +++ b/go.sum @@ -108,7 +108,6 @@ github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4 github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY= github.com/aws/aws-sdk-go v1.35.24/go.mod h1:tlPOdRjfxPBpNIwqDj61rmsnA85v9jc0Ps9+muhnW+k= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= -github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= @@ -714,14 +713,13 @@ go.starlark.net v0.0.0-20200306205701-8dd3e2ee1dd5 h1:+FNtrFTmVw0YZGpBGX56XDee33 go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= -go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A= go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4= go.uber.org/multierr 
v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= -go.uber.org/zap v1.19.0 h1:mZQZefskPPCMIBCSEH0v2/iUqqLrYtaeqwD6FUGUnFE= -go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= +go.uber.org/zap v1.24.0 h1:FiJd5l1UOLj0wCgbSE0rwwXHzEdAZS6hiiSnxJN/D60= +go.uber.org/zap v1.24.0/go.mod h1:2kMP+WWQ8aoFoedH3T2sq6iJ2yDWpHbP0f6MQbS9Gkg= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= @@ -984,8 +982,9 @@ golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 h1:vVKdlvoWBphwdxWKrFZEuM0kGgGLxUOYcY4U/2Vjg44= golang.org/x/time v0.0.0-20220210224613-90d013bbcef8/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= +golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -1003,7 +1002,6 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw 
golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= @@ -1235,7 +1233,6 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.0-20200615113413-eeeca48fe776/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go new file mode 100644 index 0000000000..0dfbb97ed4 --- /dev/null +++ b/pkg/nfd-master/metrics.go @@ -0,0 +1,83 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nfdmaster + +import ( + "fmt" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/klog/v2" + "sigs.k8s.io/node-feature-discovery/pkg/version" +) + +// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names +const ( + buildInfoQuery = "nfd_master_build_info" + updatedNodesQuery = "nfd_updated_nodes" + crdProcessingTimeQuery = "nfd_crd_processing_time" +) + +var ( + srv *http.Server + + buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: buildInfoQuery, + Help: "Version from which Node Feature Discovery was built.", + ConstLabels: map[string]string{ + "version": version.Get(), + }, + }) + updatedNodes = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: updatedNodesQuery, + Help: "Number of nodes updated by the master.", + }) + crdProcessingTime = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: crdProcessingTimeQuery, + Help: "Time spent processing the NodeFeatureRule CRD.", + }) +) + +// registerVersion exposes the Operator build version. 
+func registerVersion(version string) { + buildInfo.Set(1) +} + +// runMetricsServer starts a http server to expose metrics +func runMetricsServer(port int) { + + r := prometheus.NewRegistry() + r.MustRegister(buildInfo, + updatedNodes, + crdProcessingTime) + + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{})) + + klog.Infof("Metrics server serving on port: %d", port) + srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux} + klog.Info(srv.ListenAndServe()) +} + +// stopMetricsServer stops the metrics server +func stopMetricsServer() { + if srv != nil { + klog.Info("Stopping metrics server") + srv.Close() + } +} diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index 078bbe1804..6308354f70 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -114,6 +114,7 @@ type Args struct { VerifyNodeName bool Options string EnableLeaderElection bool + MetricsPort int Overrides ConfigOverrideArgs } @@ -241,6 +242,12 @@ func (m *nfdMaster) Run() error { return fmt.Errorf("failed to update master node: %v", err) } } + + // Register to metrics server + go runMetricsServer(m.args.MetricsPort) + registerVersion(version.Get()) + defer stopMetricsServer() + // Run gRPC server grpcErr := make(chan error, 1) go m.runGrpcServer(grpcErr) @@ -381,6 +388,11 @@ func (m *nfdMaster) nfdAPIUpdateHandler() { } else { for nodeName := range updateNodes { m.nodeUpdaterPool.queue.Add(nodeName) + if err := m.nfdAPIUpdateOneNode(nodeName); err != nil { + klog.Error(err) + errNodes[nodeName] = struct{}{} + } + updatedNodes.Add(1) } } @@ -685,8 +697,13 @@ func (m *nfdMaster) nfdAPIUpdateAllNodes() error { return err } + // initialize the node updater counter for _, node := range nodes.Items { m.nodeUpdaterPool.queue.Add(node.Name) + if err := m.nfdAPIUpdateOneNode(node.Name); err != nil { + return err + } + updatedNodes.Add(1) } return nil @@ -964,6 +981,7 @@ func (m *nfdMaster) 
processNodeFeatureRule(nodeName string, features *nfdv1alpha } // Process all rule CRs + processStart := time.Now() for _, spec := range ruleSpecs { switch { case klog.V(3).Enabled(): @@ -990,6 +1008,9 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha features.InsertAttributeFeatures(nfdv1alpha1.RuleBackrefDomain, nfdv1alpha1.RuleBackrefFeature, ruleOut.Vars) } } + processingTime := time.Since(processStart) + crdProcessingTime.Set(float64(processingTime)) + klog.V(2).Infof("processed %d NodeFeatureRule resources in %v", len(ruleSpecs), processingTime) return labels, extendedResources, taints } diff --git a/pkg/nfd-worker/metrics.go b/pkg/nfd-worker/metrics.go new file mode 100644 index 0000000000..f1c7a72d7f --- /dev/null +++ b/pkg/nfd-worker/metrics.go @@ -0,0 +1,81 @@ +/* +Copyright 2022 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package nfdworker + +import ( + "fmt" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + "k8s.io/klog/v2" + "sigs.k8s.io/node-feature-discovery/pkg/version" +) + +// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names +// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names +const ( + buildInfoQuery = "nfd_worker_build_info" + featureDiscoveryDurationQuery = "nfd_feature_discovery_duration_seconds" +) + +var ( + srv *http.Server + + featureDiscoveryDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Name: featureDiscoveryDurationQuery, + Help: "Time taken to discover features", + }, + []string{"NodeName"}, + ) + buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ + Name: buildInfoQuery, + Help: "Version from which Node Feature Discovery was built.", + ConstLabels: map[string]string{ + "version": version.Get(), + }, + }) +) + +// registerVersion exposes the Operator build version. 
+func registerVersion(version string) { + buildInfo.Set(1) +} + +// runMetricsServer starts a http server to expose metrics +func runMetricsServer(port int) { + + r := prometheus.NewRegistry() + r.MustRegister(featureDiscoveryDuration) + r.MustRegister(buildInfo) + + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{})) + + klog.Infof("Metrics server serving on port: %d", port) + srv = &http.Server{Addr: fmt.Sprintf(":%d", port), Handler: mux} + klog.Info(srv.ListenAndServe()) +} + +// stopMetricsServer stops the metrics server +func stopMetricsServer() { + if srv != nil { + klog.Info("Stopping metrics server") + srv.Close() + } +} diff --git a/pkg/nfd-worker/nfd-worker.go b/pkg/nfd-worker/nfd-worker.go index 5fe07370ac..1e52399ec9 100644 --- a/pkg/nfd-worker/nfd-worker.go +++ b/pkg/nfd-worker/nfd-worker.go @@ -101,6 +101,7 @@ type Args struct { Options string Server string ServerNameOverride string + MetricsPort int Overrides ConfigOverrideArgs } @@ -197,10 +198,10 @@ func (w *nfdWorker) runFeatureDiscovery() error { discoveryDuration := time.Since(discoveryStart) klog.V(2).InfoS("feature discovery of all sources completed", "duration", discoveryDuration) + featureDiscoveryDuration.WithLabelValues(utils.NodeName()).Observe(discoveryDuration.Seconds()) if w.config.Core.SleepInterval.Duration > 0 && discoveryDuration > w.config.Core.SleepInterval.Duration/2 { klog.InfoS("feature discovery sources took over half of sleep interval ", "duration", discoveryDuration, "sleepInterval", w.config.Core.SleepInterval.Duration) } - // Get the set of feature labels. 
labels := createFeatureLabels(w.labelSources, w.config.Core.LabelWhiteList.Regexp) @@ -239,6 +240,11 @@ func (w *nfdWorker) Run() error { labelTrigger.Reset(w.config.Core.SleepInterval.Duration) defer labelTrigger.Stop() + // Register to metrics server + go runMetricsServer(w.args.MetricsPort) + registerVersion(version.Get()) + defer stopMetricsServer() + err = w.runFeatureDiscovery() if err != nil { return err