From b90f2c318e3dd72e05ed74383b7510231beb3eb3 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 1 Aug 2023 14:54:24 +0300 Subject: [PATCH 1/4] metrics: add nfd_node_update_failures_total counter Add a new counter for tracking node update failures from nfd-master. This tracks both normal feature updates and the --prune sub-command. This is a simple counter without any additional labels - nfd-master logs can be used for further diagnostics. --- docs/deployment/metrics.md | 1 + pkg/nfd-master/metrics.go | 12 +++++++++--- pkg/nfd-master/nfd-master.go | 2 ++ pkg/nfd-master/node-updater-pool.go | 1 + 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index 38a4011fe0..a6db3bd3b2 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -18,6 +18,7 @@ The exposed metrics are | `nfd_master_build_info` | Gauge | Version from which nfd-master was built | `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built | `nfd_node_updates_total` | Counter | Number of nodes updated +| `nfd_node_update_failures_total` | Counter | Number of nodes update failures | `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects | `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go index 9b409720d7..20bdbc7b8f 100644 --- a/pkg/nfd-master/metrics.go +++ b/pkg/nfd-master/metrics.go @@ -28,9 +28,10 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( - buildInfoQuery = "nfd_master_build_info" - nodeUpdatesQuery = "nfd_node_updates_total" - nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" + buildInfoQuery = "nfd_master_build_info" + nodeUpdatesQuery = "nfd_node_updates_total" + nodeUpdateFailuresQuery = "nfd_node_update_failures_total" + nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" ) var ( @@ -47,6 +48,10 @@ var ( Name: nodeUpdatesQuery, Help: "Number of nodes updated by the master.", }) + nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeUpdateFailuresQuery, + Help: "Number of node update failures.", + }) nfrProcessingTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: nfrProcessingTimeQuery, @@ -70,6 +75,7 @@ func runMetricsServer(port int) { r := prometheus.NewRegistry() r.MustRegister(buildInfo, nodeUpdates, + nodeUpdateFailures, nfrProcessingTime) mux := http.NewServeMux() diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index c194cd1768..731d0c1f66 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -452,6 +452,7 @@ func (m *nfdMaster) prune() error { // Prune labels and extended resources err := m.updateNodeObject(cli, node.Name, Labels{}, Annotations{}, ExtendedResources{}, []corev1.Taint{}) if err != nil { + nodeUpdateFailures.Inc() return fmt.Errorf("failed to prune node %q: %v", node.Name, err) } @@ -675,6 +676,7 @@ func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.Se // Create labels et al if err := m.refreshNodeFeatures(cli, r.NodeName, annotations, r.GetLabels(), r.GetFeatures()); err != nil { + nodeUpdateFailures.Inc() return &pb.SetLabelsReply{}, err } } diff --git a/pkg/nfd-master/node-updater-pool.go b/pkg/nfd-master/node-updater-pool.go index 8a19a91db8..59429c34b2 100644 --- a/pkg/nfd-master/node-updater-pool.go +++ b/pkg/nfd-master/node-updater-pool.go @@ -53,6 +53,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI return true } else { klog.ErrorS(err, "failed to update node", "nodeName", nodeName) + nodeUpdateFailures.Inc() } } queue.Forget(nodeName) From a8a29e6df22c4391609fb4d49c0f5d75f9a13d36 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 1 Aug 2023 15:50:03 +0300 Subject: [PATCH 2/4] metrics: add nfd_nodefeaturerule_processing_errors_total counter Add a counter for errors encountered when processing NodeFeatureRules. Another simple counter without any additional prometheus labels - nfd-master logs can provide further details. --- docs/deployment/metrics.md | 1 + pkg/nfd-master/metrics.go | 16 +++++++++++----- pkg/nfd-master/nfd-master.go | 1 + 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index a6db3bd3b2..06c6c31924 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -20,6 +20,7 @@ The exposed metrics are | `nfd_node_updates_total` | Counter | Number of nodes updated | `nfd_node_update_failures_total` | Counter | Number of nodes update failures | `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects +| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number or errors encountered while processing NodeFeatureRule objects | `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node ## Via Kustomize diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go index 20bdbc7b8f..63bb25b3e9 100644 --- a/pkg/nfd-master/metrics.go +++ b/pkg/nfd-master/metrics.go @@ -28,10 +28,11 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( - buildInfoQuery = "nfd_master_build_info" - nodeUpdatesQuery = "nfd_node_updates_total" - nodeUpdateFailuresQuery = "nfd_node_update_failures_total" - nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" + buildInfoQuery = "nfd_master_build_info" + nodeUpdatesQuery = "nfd_node_updates_total" + nodeUpdateFailuresQuery = "nfd_node_update_failures_total" + nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" + nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total" ) var ( @@ -63,6 +64,10 @@ var ( "node", }, ) + nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nfrProcessingErrorsQuery, + Help: "Number of errors encountered while processing NodeFeatureRule objects.", + }) ) // registerVersion exposes the Operator build version. @@ -76,7 +81,8 @@ func runMetricsServer(port int) { r.MustRegister(buildInfo, nodeUpdates, nodeUpdateFailures, - nfrProcessingTime) + nfrProcessingTime, + nfrProcessingErrors) mux := http.NewServeMux() mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{})) diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index 731d0c1f66..5357e228c1 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -991,6 +991,7 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha ruleOut, err := rule.Execute(features) if err != nil { klog.ErrorS(err, "failed to process rule", "ruleName", rule.Name, "nodefeaturerule", klog.KObj(spec), "nodeName", nodeName) + nfrProcessingErrors.Inc() continue } taints = append(taints, ruleOut.Taints...) From 4b24cc1afa0f04ecdae963d3a358ccfd7c19a148 Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Tue, 1 Aug 2023 16:11:58 +0300 Subject: [PATCH 3/4] metrics: counters for rejected labels, extended resources and taints Add counters for labels, extended resources and taints rejected/filtered out by nfd-master. --- docs/deployment/metrics.md | 3 +++ pkg/nfd-master/metrics.go | 21 ++++++++++++++++++++- pkg/nfd-master/nfd-master.go | 4 ++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index 06c6c31924..550671f6e7 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -19,6 +19,9 @@ The exposed metrics are | `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built | `nfd_node_updates_total` | Counter | Number of nodes updated | `nfd_node_update_failures_total` | Counter | Number of nodes update failures +| `nfd_node_labels_rejected_total` | Counter | Number of nodes labels rejected by nfd-master +| `nfd_node_extendedresources_rejected_total` | Counter | Number of nodes extended resources rejected by nfd-master +| `nfd_node_taints_rejected_total` | Counter | Number of nodes taints rejected by nfd-master | `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects | `nfd_nodefeaturerule_processing_errors_total` | Counter | Number or errors encountered while processing NodeFeatureRule objects | `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go index 63bb25b3e9..1aec52f287 100644 --- a/pkg/nfd-master/metrics.go +++ b/pkg/nfd-master/metrics.go @@ -31,6 +31,9 @@ const ( buildInfoQuery = "nfd_master_build_info" nodeUpdatesQuery = "nfd_node_updates_total" nodeUpdateFailuresQuery = "nfd_node_update_failures_total" + nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total" + nodeERsRejectedQuery = "nfd_node_extendedresources_rejected_total" + nodeTaintsRejectedQuery = "nfd_node_taints_rejected_total" nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds" nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total" ) @@ -53,6 +56,18 @@ var ( Name: nodeUpdateFailuresQuery, Help: "Number of node update failures.", }) + nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeLabelsRejectedQuery, + Help: "Number of node labels that were rejected by nfd-master.", + }) + nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeERsRejectedQuery, + Help: "Number of node extended resources that were rejected by nfd-master.", + }) + nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeTaintsRejectedQuery, + Help: "Number of node taints that were rejected by nfd-master.", + }) nfrProcessingTime = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: nfrProcessingTimeQuery, @@ -78,9 +93,13 @@ func registerVersion(version string) { // runMetricsServer starts a http server to expose metrics func runMetricsServer(port int) { r := prometheus.NewRegistry() - r.MustRegister(buildInfo, + r.MustRegister( + buildInfo, nodeUpdates, nodeUpdateFailures, + nodeLabelsRejected, + nodeERsRejected, + nodeTaintsRejected, nfrProcessingTime, nfrProcessingErrors) diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index 5357e228c1..cf92976ffc 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -510,6 +510,7 @@ func (m *nfdMaster) filterFeatureLabels(labels Labels, features *nfdv1alpha1.Fea if value, err := m.filterFeatureLabel(name, value, features); err != nil { klog.ErrorS(err, "ignoring label", "labelKey", name, "labelValue", value) + nodeLabelsRejected.Inc() } else { outLabels[name] = value } @@ -523,6 +524,7 @@ func (m *nfdMaster) filterFeatureLabels(labels Labels, features *nfdv1alpha1.Fea if value, ok := outLabels[extendedResourceName]; ok { if _, err := strconv.Atoi(value); err != nil { klog.ErrorS(err, "bad label value encountered for extended resource", "labelKey", extendedResourceName, "labelValue", value) + nodeERsRejected.Inc() continue // non-numeric label can't be used } @@ -603,6 +605,7 @@ func filterTaints(taints []corev1.Taint) []corev1.Taint { for _, taint := range taints { if err := filterTaint(&taint); err != nil { klog.ErrorS(err, "ignoring taint", "taint", taint) + nodeTaintsRejected.Inc() } else { outTaints = append(outTaints, taint) } @@ -786,6 +789,7 @@ func filterExtendedResources(features *nfdv1alpha1.Features, extendedResources E capacity, err := filterExtendedResource(name, value, features) if err != nil { klog.ErrorS(err, "failed to create extended resources", "extendedResourceName", name, "extendedResourceValue", value) + nodeERsRejected.Inc() } else { outExtendedResources[name] = capacity } From 5ad2294c14e5a986b3c7f0d41ab487bbdb0d410f Mon Sep 17 00:00:00 2001 From: Markus Lehtonen Date: Wed, 2 Aug 2023 16:47:11 +0300 Subject: [PATCH 4/4] metrics: add nfd_node_update_requests_total counter Add a counter for total number of node update/sync requests. In practice, this counts the number of gRPC requests received if the gRPC API is in use. If the NodeFeature API is enabled, this counts the requests initiated by the NFD API controller, i.e. updates triggered by changes in NodeFeature or NodeFeatureRule objects plus updates initiated by the controller resync period. --- docs/deployment/metrics.md | 1 + pkg/nfd-master/metrics.go | 6 ++++++ pkg/nfd-master/nfd-master.go | 1 + pkg/nfd-master/node-updater-pool.go | 1 + 4 files changed, 9 insertions(+) diff --git a/docs/deployment/metrics.md b/docs/deployment/metrics.md index 550671f6e7..7e52687309 100644 --- a/docs/deployment/metrics.md +++ b/docs/deployment/metrics.md @@ -17,6 +17,7 @@ The exposed metrics are | ------------------------------------------------- | --------- | --------------------------------------- | `nfd_master_build_info` | Gauge | Version from which nfd-master was built | `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built +| `nfd_node_update_requests_total` | Counter | Number of node update requests processed by the master | `nfd_node_updates_total` | Counter | Number of nodes updated | `nfd_node_update_failures_total` | Counter | Number of nodes update failures | `nfd_node_labels_rejected_total` | Counter | Number of nodes labels rejected by nfd-master diff --git a/pkg/nfd-master/metrics.go b/pkg/nfd-master/metrics.go index 1aec52f287..c64842a7ae 100644 --- a/pkg/nfd-master/metrics.go +++ b/pkg/nfd-master/metrics.go @@ -29,6 +29,7 @@ import ( // When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names const ( buildInfoQuery = "nfd_master_build_info" + nodeUpdateRequestsQuery = "nfd_node_update_requests_total" nodeUpdatesQuery = "nfd_node_updates_total" nodeUpdateFailuresQuery = "nfd_node_update_failures_total" nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total" @@ -48,6 +49,10 @@ var ( "version": version.Get(), }, }) + nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{ + Name: nodeUpdateRequestsQuery, + Help: "Number of node update requests processed by the master.", + }) nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{ Name: nodeUpdatesQuery, Help: "Number of nodes updated by the master.", @@ -95,6 +100,7 @@ func runMetricsServer(port int) { r := prometheus.NewRegistry() r.MustRegister( buildInfo, + nodeUpdateRequests, nodeUpdates, nodeUpdateFailures, nodeLabelsRejected, diff --git a/pkg/nfd-master/nfd-master.go b/pkg/nfd-master/nfd-master.go index cf92976ffc..9f283398cc 100644 --- a/pkg/nfd-master/nfd-master.go +++ b/pkg/nfd-master/nfd-master.go @@ -654,6 +654,7 @@ func isNamespaceDenied(labelNs string, wildcardDeniedNs map[string]struct{}, nor // SetLabels implements LabelerServer func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.SetLabelsReply, error) { + nodeUpdateRequests.Inc() err := authorizeClient(c, m.args.VerifyNodeName, r.NodeName) if err != nil { klog.ErrorS(err, "gRPC client authorization failed", "nodeName", r.NodeName) diff --git a/pkg/nfd-master/node-updater-pool.go b/pkg/nfd-master/node-updater-pool.go index 59429c34b2..55ce2a712f 100644 --- a/pkg/nfd-master/node-updater-pool.go +++ b/pkg/nfd-master/node-updater-pool.go @@ -46,6 +46,7 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI defer queue.Done(nodeName) + nodeUpdateRequests.Inc() if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil { if queue.NumRequeues(nodeName) < 5 { klog.InfoS("retrying node update", "nodeName", nodeName)