Skip to content

Commit

Permalink
Enable metrics via prometheus operator
Browse files Browse the repository at this point in the history
Expose metrics via prometheus.monitoring.coreos.com/v1

The exposed metrics are

| Metric        | Type | Meaning |
| --------------- | ---------------- | ---------------- |
|  `nfd_master_build_info`           | Gauge | Version from which nfd-master was built. |
|  `nfd_worker_build_info`           | Gauge | Version from which nfd-worker was built. |
|  `nfd_updated_nodes`           |  Counter | Time taken to label a node |
|  `nfd_crd_processing_time`          |  Gauge | Time taken to process a NodeFeatureRule CRD |
| `nfd_feature_discovery_duration_seconds` |  HistogramVec | Time taken to discover features on a node |

Signed-off-by: Carlos Eduardo Arango Gutierrez <[email protected]>
  • Loading branch information
ArangoGutierrez committed Jun 9, 2023
1 parent b9fa66b commit a53fcb2
Show file tree
Hide file tree
Showing 21 changed files with 405 additions and 24 deletions.
2 changes: 2 additions & 0 deletions cmd/nfd-master/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ func initFlags(flagset *flag.FlagSet) (*master.Args, *master.ConfigOverrideArgs)
"Enable NFD CRD API controller for processing NodeFeature and NodeFeatureRule objects.")
flagset.IntVar(&args.Port, "port", 8080,
"Port on which to listen for connections.")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.BoolVar(&args.Prune, "prune", false,
"Prune all NFD related attributes from all nodes of the cluaster and exit.")
flagset.BoolVar(&args.VerifyNodeName, "verify-node-name", false,
Expand Down
2 changes: 2 additions & 0 deletions cmd/nfd-worker/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ func initFlags(flagset *flag.FlagSet) (*worker.Args, *worker.ConfigOverrideArgs)
"Kubeconfig to use")
flagset.BoolVar(&args.Oneshot, "oneshot", false,
"Do not publish feature labels")
flagset.IntVar(&args.MetricsPort, "metrics", 8081,
"Port on which to expose metrics.")
flagset.StringVar(&args.Options, "options", "",
"Specify config options from command line. Config options are specified "+
"in the same format as in the config file (i.e. json or yaml). These options")
Expand Down
2 changes: 2 additions & 0 deletions deployment/base/master/master-deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,7 @@ spec:
command:
- "nfd-master"
ports:
- name: metrics
containerPort: 8081
- name: grpc
containerPort: 8080
3 changes: 3 additions & 0 deletions deployment/base/worker-daemonset/worker-daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,6 @@ spec:
- "nfd-worker"
args:
- "-server=nfd-master:8080"
ports:
- name: metrics
containerPort: 8081
13 changes: 12 additions & 1 deletion deployment/helm/node-feature-discovery/templates/master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,18 @@ spec:
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
- "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
{{- end }}
{{- if .Values.metrics.prometheus.enable }}
- "-metrics={{ .Values.metrics.port }}"
{{- end }}
{{- if .Values.metrics.enable }}
ports:
- name: metrics
containerPort: {{ .Values.metrics.port }}
{{- else }}
ports:
- name: metrics
containerPort: 0
{{- end }}
volumeMounts:
{{- if .Values.tls.enable }}
- name: nfd-master-cert
Expand All @@ -139,7 +151,6 @@ spec:
items:
- key: nfd-master.conf
path: nfd-master.conf

{{- with .Values.master.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
Expand Down
55 changes: 55 additions & 0 deletions deployment/helm/node-feature-discovery/templates/prometheus.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
{{- if .Values.metrics.prometheus.enable }}
# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: nfd-metrics
labels:
app: nfd
spec:
podMetricsEndpoints:
- honorLabels: true
interval: 10s
path: /metrics
port: metrics
scheme: http
namespaceSelector:
matchNames:
- node-feature-discovery
selector:
matchExpressions:
- {key: app, operator: Exists}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: node-feature-discovery
rules:
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: node-feature-discovery
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
- kind: ServiceAccount
name: prometheus-operator
namespace: monitoring

{{- end }}
8 changes: 8 additions & 0 deletions deployment/helm/node-feature-discovery/templates/worker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ spec:
- "-key-file=/etc/kubernetes/node-feature-discovery/certs/tls.key"
- "-cert-file=/etc/kubernetes/node-feature-discovery/certs/tls.crt"
{{- end }}
{{- if .Values.metrics.enable }}
- "-metrics={{ .Values.metrics.port }}"
{{- end }}
{{- if .Values.metrics.enable }}
ports:
- name: metrics
containerPort: {{ .Values.metrics.port }}
{{- end }}
volumeMounts:
- name: host-boot
mountPath: "/host-boot"
Expand Down
5 changes: 5 additions & 0 deletions deployment/helm/node-feature-discovery/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -493,3 +493,8 @@ topologyGC:
tls:
enable: false
certManager: false

metrics:
prometheus:
enable: false
port: 8081
1 change: 1 addition & 0 deletions deployment/overlays/default/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ components:
- ../../components/worker-config
- ../../components/common
- ../../components/master-config
- ../../components/prometheus
10 changes: 10 additions & 0 deletions deployment/overlays/prometheus/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: node-feature-discovery

resources:
- namespace.yaml

components:
- ../../components/prometheus
20 changes: 20 additions & 0 deletions deployment/overlays/prometheus/monitor.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Prometheus Monitor Service (Metrics)
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
name: nfd-metrics
labels:
app: nfd
spec:
podMetricsEndpoints:
- honorLabels: true
interval: 10s
path: /metrics
port: metrics
scheme: http
namespaceSelector:
matchNames:
- node-feature-discovery
selector:
matchExpressions:
- {key: app, operator: In, values: ["nfd-master", "nfd-worker"]}
4 changes: 4 additions & 0 deletions deployment/overlays/prometheus/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: node-feature-discovery
31 changes: 31 additions & 0 deletions deployment/overlays/prometheus/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: prometheus-k8s
namespace: node-feature-discovery
rules:
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- list
- watch
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: node-feature-discovery
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
- kind: ServiceAccount
name: prometheus-operator
namespace: monitoring
6 changes: 6 additions & 0 deletions docs/deployment/helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,12 @@ We have introduced the following Chart parameters.
| `tls.enable` | bool | false | Specifies whether to use TLS for communications between components |
| `tls.certManager` | bool | false | If enabled, requires [cert-manager](https://cert-manager.io/docs/) to be installed and will automatically create the required TLS certificates |
| `enableNodeFeatureApi` | bool | false | Enable the [NodeFeature](../usage/custom-resources.md#nodefeature) CRD API for communicating node features. This will automatically disable the gRPC communication.
| `metrics.prometheus.enable` | bool | false | Specifies whether to expose mtrics using prometheus.Operator |
| `metrics.prometheus.port` | integer | 8081 | Port on which to expose metrics from components to prometheus.Operator |

Metrics are configured to be exposed using prometheus.Operator API's by
default. If you want to expose metrics using the prometheus.Operator
API's you need to install the prometheus.Operator in your cluster.

### Master pod parameters

Expand Down
25 changes: 25 additions & 0 deletions docs/deployment/kustomize.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ scenarios under
see [Master Worker Topologyupdater](#master-worker-topologyupdater) below
- [`topologyupdater`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/topologyupdater):
see [Topology Updater](#topologyupdater) below
- [`Metrics`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prometheus):
see [Metrics](#metrics) below
- [`prune`](https://github.com/kubernetes-sigs/node-feature-discovery/blob/{{site.release}}/deployment/overlays/prune):
clean up the cluster after uninstallation, see
[Removing feature labels](uninstallation.md#removing-feature-labels)
Expand Down Expand Up @@ -137,6 +139,28 @@ kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deplo

```

# Metrics

To Add metrics scraping for prometheus-operator,
To allow [prometheus-operator][prometheus-operator]
to scrape metrics from node-feature-discovery,
run the following command:

```bash
kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref={{ site.release }}
kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/prometheus?ref={{ site.release }}
```

The exposed metrics are

| Metric | Type | Meaning |
| --------------- | ---------------- | ---------------- |
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built. |
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built. |
| `nfd_updated_nodes` | Counter | Time taken to label a node |
| `nfd_crd_processing_time` | Gauge | Time taken to process a NodeFeatureRule CRD |
| `nfd_feature_discovery_duration_seconds` | HistogramVec | Time taken to discover features on a node |

## Uninstallation

Simplest way is to invoke `kubectl delete` on the overlay that was used for
Expand All @@ -162,3 +186,4 @@ kubectl delete clusterrolebinding nfd-master

<!-- Links -->
[kustomize]: https://github.com/kubernetes-sigs/kustomize
[prometheus-operator]: https://github.com/prometheus-operator/prometheus-operator
16 changes: 8 additions & 8 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ require (
github.com/onsi/ginkgo/v2 v2.9.1
github.com/onsi/gomega v1.27.4
github.com/opencontainers/runc v1.1.6
github.com/prometheus/client_golang v1.15.1
github.com/smartystreets/assertions v1.2.0
github.com/smartystreets/goconvey v1.6.4
github.com/stretchr/testify v1.8.1
Expand Down Expand Up @@ -62,7 +63,7 @@ require (
github.com/beorn7/perks v1.0.1 // indirect
github.com/blang/semver/v4 v4.0.0 // indirect
github.com/cenkalti/backoff/v4 v4.1.3 // indirect
github.com/cespare/xxhash/v2 v2.1.2 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/checkpoint-restore/go-criu/v5 v5.3.0 // indirect
github.com/cilium/ebpf v0.10.0 // indirect
github.com/container-storage-interface/spec v1.7.0 // indirect
Expand Down Expand Up @@ -111,7 +112,7 @@ require (
github.com/libopenstorage/openstorage v1.0.0 // indirect
github.com/lithammer/dedent v1.1.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.2 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect
github.com/mistifyio/go-zfs v2.1.2-0.20190413222219-f784269be439+incompatible // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/mitchellh/mapstructure v1.4.1 // indirect
Expand All @@ -129,10 +130,9 @@ require (
github.com/opencontainers/selinux v1.11.0 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.14.0 // indirect
github.com/prometheus/client_model v0.3.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
github.com/prometheus/common v0.42.0 // indirect
github.com/prometheus/procfs v0.9.0 // indirect
github.com/rubiojr/go-vhd v0.0.0-20200706105327-02e210299021 // indirect
github.com/seccomp/libseccomp-golang v0.10.0 // indirect
github.com/sirupsen/logrus v1.9.0 // indirect
Expand Down Expand Up @@ -161,14 +161,14 @@ require (
go.opentelemetry.io/proto/otlp v0.19.0 // indirect
go.uber.org/atomic v1.7.0 // indirect
go.uber.org/multierr v1.6.0 // indirect
go.uber.org/zap v1.19.0 // indirect
go.uber.org/zap v1.24.0 // indirect
golang.org/x/crypto v0.1.0 // indirect
golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect
golang.org/x/oauth2 v0.5.0 // indirect
golang.org/x/sync v0.1.0 // indirect
golang.org/x/sys v0.7.0 // indirect
golang.org/x/term v0.7.0 // indirect
golang.org/x/text v0.9.0 // indirect
golang.org/x/time v0.0.0-20220210224613-90d013bbcef8 // indirect
golang.org/x/time v0.3.0 // indirect
golang.org/x/tools v0.7.0 // indirect
google.golang.org/api v0.60.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
Expand Down
Loading

0 comments on commit a53fcb2

Please sign in to comment.