Skip to content

Commit

Permalink
Enable metrics via prometheus operator
Browse files Browse the repository at this point in the history
Expose metrics via prometheus.monitoring.coreos.com/v1

The exposed metrics are

| Metric        | Type | Meaning |
| --------------- | ---------------- | ---------------- |
|  `nfd_master_build_info`           | Gauge | Version from which nfd-master was built. |
|  `nfd_worker_build_info`           | Gauge | Version from which nfd-worker was built. |
|  `nfd_updated_nodes`           |  Counter | Time taken to label a node |
|  `nfd_crd_processing_time`          |  Gauge | Time taken to process a NodeFeatureRule CRD |
| `nfd_feature_discovery_duration_seconds` |  HistogramVec | Time taken to discover features on a node |

Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
  • Loading branch information
ArangoGutierrez committed Jul 20, 2023
1 parent f02d172 commit 90b1de6
Show file tree
Hide file tree
Showing 19 changed files with 345 additions and 25 deletions.
2 changes: 2 additions & 0 deletions cmd/nfd-master/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
"Enable NFD CRD API controller for processing NodeFeature and NodeFeatureRule objects.")
flagset.IntVar(&args.Port, "port", 8080,
"Port on which to listen for connections.")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.BoolVar(&args.Prune, "prune", false,
"Prune all NFD related attributes from all nodes of the cluster and exit.")
flagset.BoolVar(&args.VerifyNodeName, "verify-node-name", false,
Expand Down
2 changes: 2 additions & 0 deletions cmd/nfd-worker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ func initFlags(flagset *flag.FlagSet) (*worker.Args, *worker.ConfigOverrideArgs)
"Kubeconfig to use")
flagset.BoolVar(&args.Oneshot, "oneshot", false,
"Do not publish feature labels")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.StringVar(&args.Options, "options", "",
"Specify config options from command line. Config options are specified "+
"in the same format as in the config file (i.e. json or yaml). These options")
Expand Down
2 changes: 2 additions & 0 deletions deployment/base/master/master-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,7 @@ spec:
command:
- "nfd-master"
ports:
- name: metrics
containerPort: 8081
- name: grpc
containerPort: 8080
3 changes: 3 additions & 0 deletions deployment/base/worker-daemonset/worker-daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ spec:
- "nfd-worker"
args:
- "-server=nfd-master:8080"
ports:
- name: metrics
containerPort: 8081
4 changes: 3 additions & 1 deletion deployment/helm/node-feature-discovery/templates/master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ spec:
ports:
- containerPort: {{ .Values.master.port | default "8080" }}
name: grpc
- containerPort: {{ .Values.master.metricsPort | default "8081" }}
name: metrics
env:
- name: NODE_NAME
valueFrom:
Expand Down Expand Up @@ -118,6 +120,7 @@ spec:
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
- "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
{{- end }}
- "-metrics={{ .Values.master.metricsPort | default "8081" }}"
volumeMounts:
{{- if .Values.tls.enable }}
- name: nfd-master-cert
Expand All @@ -139,7 +142,6 @@ spec:
items:
- key: nfd-master.conf
path: nfd-master.conf

{{- with .Values.master.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
Expand Down
23 changes: 23 additions & 0 deletions deployment/helm/node-feature-discovery/templates/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{{- if .Values.prometheus.enable }}
# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: {{ include "node-feature-discovery.fullname" . }}
labels:
{{- include "node-feature-discovery.selectorLabels" . | nindent 4 }}
spec:
podMetricsEndpoints:
- honorLabels: true
interval: 10s
path: /metrics
port: metrics
scheme: http
namespaceSelector:
matchNames:
- {{ include "node-feature-discovery.namespace" . }}
selector:
matchExpressions:
- {key: app.kubernetes.io/instance, operator: In, values: ["{{ .Release.Name }}"]}
- {key: app.kubernetes.io/name, operator: In, values: ["{{ include "node-feature-discovery.name" . }}"]}
{{- end }}
4 changes: 4 additions & 0 deletions deployment/helm/node-feature-discovery/templates/worker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,10 @@ spec:
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
- "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
{{- end }}
- "-metrics={{ .Values.worker.metricsPort }}"
ports:
- name: metrics
containerPort: {{ .Values.worker.metricsPort }}
volumeMounts:
- name: host-boot
mountPath: "/host-boot"
Expand Down
8 changes: 6 additions & 2 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ master:
### <NFD-MASTER-CONF-END-DO-NOT-REMOVE>
# The TCP port that nfd-master listens for incoming requests. Default: 8080
port: 8080
metricsPort: 8081
instance:
featureApi:
resyncPeriod:
Expand Down Expand Up @@ -341,8 +342,8 @@ worker:
# my.kernel.feature: {op: IsTrue}
# my.dummy.var: {op: Gt, value: ["0"]}
#
### <NFD-WORKER-CONF-END-DO-NOT-REMOVE>

### <NFD-WORKER-CONF-END-DO-NOT-REMOVE>
metricsPort: 8081
daemonsetAnnotations: {}
podSecurityContext: {}
# fsGroup: 2000
Expand Down Expand Up @@ -493,3 +494,6 @@ topologyGC:
tls:
enable: false
certManager: false

prometheus:
enable: false
7 changes: 7 additions & 0 deletions deployment/overlays/prometheus/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: node-feature-discovery

resources:
- monitor.yaml
20 changes: 20 additions & 0 deletions deployment/overlays/prometheus/monitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: nfd-metrics
labels:
app: nfd
spec:
podMetricsEndpoints:
- honorLabels: true
interval: 10s
path: /metrics
port: metrics
scheme: http
namespaceSelector:
matchNames:
- node-feature-discovery
selector:
matchExpressions:
- {key: app, operator: In, values: ["nfd-master", "nfd-worker"]}
7 changes: 7 additions & 0 deletions docs/deployment/helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,19 @@ We have introduced the following Chart parameters.
| `tls.enable` | bool | false | Specifies whether to use TLS for communications between components |
| `tls.certManager` | bool | false | If enabled, requires [cert-manager](https://cert-manager.io/docs/) to be installed and will automatically create the required TLS certificates |
| `enableNodeFeatureApi` | bool | false | Enable the [NodeFeature](../usage/custom-resources.md#nodefeature) CRD API for communicating node features. This will automatically disable the gRPC communication.
| `prometheus.enable` | bool | false | Specifies whether to expose metrics using prometheus.Operator |

Metrics are configured to be exposed using prometheus.Operator API's by
default. If you want to expose metrics using the prometheus.Operator
API's you need to install the prometheus.Operator in your cluster.

### Master pod parameters

| Name | Type | Default | description |
|-----------------------------|---------|-----------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------|
| `master.*` | dict | | NFD master deployment configuration |
| `master.port` | integer | | Specifies the TCP port that nfd-master listens for incoming requests. |
| `master.metricsPort` | integer | 8081 | Port on which to expose metrics from components to prometheus.Operator |
| `master.instance` | string | | Instance name. Used to separate annotation namespaces for multiple parallel deployments |
| `master.resyncPeriod` | string | | NFD API controller resync period. |
| `master.extraLabelNs` | array | [] | List of allowed extra label namespaces |
Expand Down Expand Up @@ -139,6 +145,7 @@ We have introduced the following Chart parameters.
| Name | Type | Default | description |
| ---- | ---- | ------- | ----------- |
| `worker.*` | dict | | NFD worker daemonset configuration |
| `worker.metricsPort*` | integer | 8081 | Port on which to expose metrics from components to prometheus.Operator |
| `worker.config` | dict | | NFD worker [configuration](../reference/worker-configuration-reference) |
| `worker.podSecurityContext` | dict | {} | [PodSecurityContext](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-pod) holds pod-level security attributes and common container settings |
| `worker.securityContext` | dict | {} | Container [security settings](https://kubernetes.io/docs/tasks/configure-pod-container/security-context/#set-the-security-context-for-a-container) |
Expand Down
14 changes: 14 additions & 0 deletions docs/deployment/kustomize.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ scenarios under
see [Master Worker Topologyupdater](#master-worker-topologyupdater) below
- [`topologyupdater`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/topologyupdater):
see [Topology Updater](#topologyupdater) below
- [`Metrics`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prometheus):
see [Metrics](#metrics) below
- [`prune`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prune):
clean up the cluster after uninstallation, see
[Removing feature labels](uninstallation.md#removing-feature-labels)
Expand Down Expand Up @@ -137,6 +139,17 @@ kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deplo

```

### Metrics

To allow [prometheus-operator][prometheus-operator]
to scrape metrics from node-feature-discovery,
run the following command:

```bash
kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref={{ site.release }}
kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/prometheus?ref={{ site.release }}
```

## Uninstallation

Simplest way is to invoke `kubectl delete` on the overlay that was used for
Expand All @@ -162,3 +175,4 @@ kubectl delete clusterrolebinding nfd-master

<!-- Links -->
[kustomize]: https://github.com/kubernetes-sigs/kustomize
[prometheus-operator]: https://github.com/prometheus-operator/prometheus-operator
43 changes: 43 additions & 0 deletions docs/deployment/metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
---
title: "Metrics"
layout: default
sort: 7
---

# Metrics

Metrics are configured to be exposed using prometheus.Operator API's by
default. If you want to expose metrics using the prometheus.Operator
API's you need to install the prometheus.Operator in your cluster.
By default NFD Master and Worker expose metrics on port 8081.

The exposed metrics are

| Metric | Type | Meaning |
| ---------------------------------- | ------- | ---------------- |
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built. |
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built. |
| `nfd_updated_nodes` | Counter | Time taken to label a node |
| `nfd_crd_processing_time` | Gauge | Time taken to process a NodeFeatureRule CRD |
| `nfd_feature_discovery_duration_seconds` | HistogramVec | Time taken to discover features on a node |

## Via Kustomize

To deploy NFD with metrics enabled using kustomize, you can use the
[Metrics Overlay](kustomize.md#metrics).

## Via Helm

By default metrics are enabled when deploying NFD via Helm. To enable Prometheus
to scrape metrics from NFD, you need to pass the following values to Helm:

```bash
--set prometheus.enable=true
```

For more info on Helm deployment, see [Helm](helm.md).

We recommend setting
`--set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false`
when deploying prometheus-operator via Helm to enable the prometheus-operator
to scrape metrics from any PodMonitor.
16 changes: 8 additions & 8 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ require (
github.com/onsi/ginkgo/v2 v2.9.1
github.com/onsi/gomega v1.27.4
github.com/opencontainers/runc v1.1.6
github.com/prometheus/client_golang v1.15.1
github.com/smartystreets/assertions v1.2.0
github.com/smartystreets/goconvey v1.6.4
github.com/stretchr/testify v1.8.1
Expand Down Expand Up @@ -62,7 +63,7 @@ require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cenkalti/backoff/v4 v4.1.3 // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect
github.com/cilium/ebpf v0.10.0 // indirect
github.com/container-storage-interface/spec v1.7.0 // indirect
Expand Down Expand Up @@ -111,7 +112,7 @@ require (
github.com/libopenstorage/openstorage v1.0.0 // indirect
github.com/lithammer/dedent v1.1.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.2 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/mitchellh/mapstructure v1.4.1 // indirect
Expand All @@ -129,10 +130,9 @@ require (
github.com/opencontainers/selinux v1.11.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/prometheus/common v0.42.0 // indirect
github.com/prometheus/procfs v0.9.0 // indirect
github.com/rubiojr/go-vhd v0.0.0-20200706105327-02e210299021 // indirect
github.com/seccomp/libseccomp-golang v0.10.0 // indirect
github.com/sirupsen/logrus v1.9.0 // indirect
Expand Down Expand Up @@ -161,14 +161,14 @@ require (
go.opentelemetry.io/proto/otlp v0.19.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
go.uber.org/zap v1.19.0 // indirect
go.uber.org/zap v1.24.0 // indirect
golang.org/x/crypto v0.1.0 // indirect
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect
golang.org/x/oauth2 v0.5.0 // indirect
golang.org/x/sync v0.1.0 // indirect
golang.org/x/sys v0.7.0 // indirect
golang.org/x/term v0.7.0 // indirect
golang.org/x/text v0.9.0 // indirect
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
golang.org/x/time v0.3.0 // indirect
golang.org/x/tools v0.7.0 // indirect
google.golang.org/api v0.60.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
Expand Down
Loading

0 comments on commit 90b1de6

Please sign in to comment.