Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

metrics: additional metrics for nfd-master #1290

Merged
merged 4 commits into from
Aug 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/deployment/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,14 @@ The exposed metrics are
| ------------------------------------------------- | --------- | ---------------------------------------
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
| `nfd_node_update_requests_total` | Counter | Number of node update requests processed by the master
| `nfd_node_updates_total` | Counter | Number of nodes updated
| `nfd_node_update_failures_total` | Counter | Number of node update failures
| `nfd_node_labels_rejected_total` | Counter | Number of node labels rejected by nfd-master
| `nfd_node_extendedresources_rejected_total` | Counter | Number of node extended resources rejected by nfd-master
| `nfd_node_taints_rejected_total` | Counter | Number of node taints rejected by nfd-master
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node

## Via Kustomize
Expand Down
47 changes: 42 additions & 5 deletions pkg/nfd-master/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,15 @@ import (

// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const (
buildInfoQuery = "nfd_master_build_info"
nodeUpdatesQuery = "nfd_node_updates_total"
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
buildInfoQuery = "nfd_master_build_info"
nodeUpdateRequestsQuery = "nfd_node_update_requests_total"
nodeUpdatesQuery = "nfd_node_updates_total"
nodeUpdateFailuresQuery = "nfd_node_update_failures_total"
nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total"
nodeERsRejectedQuery = "nfd_node_extendedresources_rejected_total"
nodeTaintsRejectedQuery = "nfd_node_taints_rejected_total"
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total"
)

var (
Expand All @@ -43,10 +49,30 @@ var (
"version": version.Get(),
},
})
nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdateRequestsQuery,
Help: "Number of node update requests processed by the master.",
})
nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdatesQuery,
Help: "Number of nodes updated by the master.",
})
nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdateFailuresQuery,
Help: "Number of node update failures.",
})
nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeLabelsRejectedQuery,
Help: "Number of node labels that were rejected by nfd-master.",
})
nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeERsRejectedQuery,
Help: "Number of node extended resources that were rejected by nfd-master.",
})
nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeTaintsRejectedQuery,
Help: "Number of node taints that were rejected by nfd-master.",
})
nfrProcessingTime = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: nfrProcessingTimeQuery,
Expand All @@ -58,6 +84,10 @@ var (
"node",
},
)
nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{
Name: nfrProcessingErrorsQuery,
Help: "Number of errors encountered while processing NodeFeatureRule objects.",
})
)

// registerVersion exposes the Operator build version.
Expand All @@ -68,9 +98,16 @@ func registerVersion(version string) {
// runMetricsServer starts an HTTP server to expose metrics
func runMetricsServer(port int) {
r := prometheus.NewRegistry()
r.MustRegister(buildInfo,
r.MustRegister(
buildInfo,
nodeUpdateRequests,
nodeUpdates,
nfrProcessingTime)
nodeUpdateFailures,
nodeLabelsRejected,
nodeERsRejected,
nodeTaintsRejected,
nfrProcessingTime,
nfrProcessingErrors)

mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
Expand Down
8 changes: 8 additions & 0 deletions pkg/nfd-master/nfd-master.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,7 @@ func (m *nfdMaster) prune() error {
// Prune labels and extended resources
err := m.updateNodeObject(cli, node.Name, Labels{}, Annotations{}, ExtendedResources{}, []corev1.Taint{})
if err != nil {
nodeUpdateFailures.Inc()
return fmt.Errorf("failed to prune node %q: %v", node.Name, err)
}

Expand Down Expand Up @@ -509,6 +510,7 @@ func (m *nfdMaster) filterFeatureLabels(labels Labels, features *nfdv1alpha1.Fea

if value, err := m.filterFeatureLabel(name, value, features); err != nil {
klog.ErrorS(err, "ignoring label", "labelKey", name, "labelValue", value)
nodeLabelsRejected.Inc()
} else {
outLabels[name] = value
}
Expand All @@ -522,6 +524,7 @@ func (m *nfdMaster) filterFeatureLabels(labels Labels, features *nfdv1alpha1.Fea
if value, ok := outLabels[extendedResourceName]; ok {
if _, err := strconv.Atoi(value); err != nil {
klog.ErrorS(err, "bad label value encountered for extended resource", "labelKey", extendedResourceName, "labelValue", value)
nodeERsRejected.Inc()
continue // non-numeric label can't be used
}

Expand Down Expand Up @@ -602,6 +605,7 @@ func filterTaints(taints []corev1.Taint) []corev1.Taint {
for _, taint := range taints {
if err := filterTaint(&taint); err != nil {
klog.ErrorS(err, "ignoring taint", "taint", taint)
nodeTaintsRejected.Inc()
} else {
outTaints = append(outTaints, taint)
}
Expand Down Expand Up @@ -650,6 +654,7 @@ func isNamespaceDenied(labelNs string, wildcardDeniedNs map[string]struct{}, nor

// SetLabels implements LabelerServer
func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.SetLabelsReply, error) {
nodeUpdateRequests.Inc()
err := authorizeClient(c, m.args.VerifyNodeName, r.NodeName)
if err != nil {
klog.ErrorS(err, "gRPC client authorization failed", "nodeName", r.NodeName)
Expand All @@ -675,6 +680,7 @@ func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.Se

// Create labels et al
if err := m.refreshNodeFeatures(cli, r.NodeName, annotations, r.GetLabels(), r.GetFeatures()); err != nil {
nodeUpdateFailures.Inc()
return &pb.SetLabelsReply{}, err
}
}
Expand Down Expand Up @@ -784,6 +790,7 @@ func filterExtendedResources(features *nfdv1alpha1.Features, extendedResources E
capacity, err := filterExtendedResource(name, value, features)
if err != nil {
klog.ErrorS(err, "failed to create extended resources", "extendedResourceName", name, "extendedResourceValue", value)
nodeERsRejected.Inc()
} else {
outExtendedResources[name] = capacity
}
Expand Down Expand Up @@ -989,6 +996,7 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha
ruleOut, err := rule.Execute(features)
if err != nil {
klog.ErrorS(err, "failed to process rule", "ruleName", rule.Name, "nodefeaturerule", klog.KObj(spec), "nodeName", nodeName)
nfrProcessingErrors.Inc()
continue
}
taints = append(taints, ruleOut.Taints...)
Expand Down
2 changes: 2 additions & 0 deletions pkg/nfd-master/node-updater-pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,15 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI

defer queue.Done(nodeName)

nodeUpdateRequests.Inc()
if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil {
if queue.NumRequeues(nodeName) < 5 {
klog.InfoS("retrying node update", "nodeName", nodeName)
queue.AddRateLimited(nodeName)
return true
} else {
klog.ErrorS(err, "failed to update node", "nodeName", nodeName)
nodeUpdateFailures.Inc()
}
}
queue.Forget(nodeName)
Expand Down