Skip to content

Commit

Permalink
Merge pull request #1290 from marquiz/devel/metrics-new
Browse files Browse the repository at this point in the history
metrics: additional metrics for nfd-master
  • Loading branch information
k8s-ci-robot authored Aug 28, 2023
2 parents e0c4770 + 5ad2294 commit 6d95e59
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 5 deletions.
6 changes: 6 additions & 0 deletions docs/deployment/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,14 @@ The exposed metrics are
| ------------------------------------------------- | --------- | ---------------------------------------
| `nfd_master_build_info` | Gauge | Version from which nfd-master was built
| `nfd_worker_build_info` | Gauge | Version from which nfd-worker was built
| `nfd_node_update_requests_total` | Counter | Number of node update requests processed by the master
| `nfd_node_updates_total` | Counter | Number of nodes updated
| `nfd_node_update_failures_total` | Counter | Number of node update failures
| `nfd_node_labels_rejected_total` | Counter | Number of node labels rejected by nfd-master
| `nfd_node_extendedresources_rejected_total` | Counter | Number of node extended resources rejected by nfd-master
| `nfd_node_taints_rejected_total` | Counter | Number of node taints rejected by nfd-master
| `nfd_nodefeaturerule_processing_duration_seconds` | Histogram | Time taken to process NodeFeatureRule objects
| `nfd_nodefeaturerule_processing_errors_total` | Counter | Number of errors encountered while processing NodeFeatureRule objects
| `nfd_feature_discovery_duration_seconds` | Histogram | Time taken to discover features on a node
| `nfd_topology_updater_scan_errors_total` | Counter | Number of errors in scanning resource allocation of pods.

Expand Down
47 changes: 42 additions & 5 deletions pkg/nfd-master/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,15 @@ import (

// When adding metric names, see https://prometheus.io/docs/practices/naming/#metric-names
const (
buildInfoQuery = "nfd_master_build_info"
nodeUpdatesQuery = "nfd_node_updates_total"
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
buildInfoQuery = "nfd_master_build_info"
nodeUpdateRequestsQuery = "nfd_node_update_requests_total"
nodeUpdatesQuery = "nfd_node_updates_total"
nodeUpdateFailuresQuery = "nfd_node_update_failures_total"
nodeLabelsRejectedQuery = "nfd_node_labels_rejected_total"
nodeERsRejectedQuery = "nfd_node_extendedresources_rejected_total"
nodeTaintsRejectedQuery = "nfd_node_taints_rejected_total"
nfrProcessingTimeQuery = "nfd_nodefeaturerule_processing_duration_seconds"
nfrProcessingErrorsQuery = "nfd_nodefeaturerule_processing_errors_total"
)

var (
Expand All @@ -43,10 +49,30 @@ var (
"version": version.Get(),
},
})
nodeUpdateRequests = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdateRequestsQuery,
Help: "Number of node update requests processed by the master.",
})
nodeUpdates = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdatesQuery,
Help: "Number of nodes updated by the master.",
})
nodeUpdateFailures = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeUpdateFailuresQuery,
Help: "Number of node update failures.",
})
nodeLabelsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeLabelsRejectedQuery,
Help: "Number of node labels that were rejected by nfd-master.",
})
nodeERsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeERsRejectedQuery,
Help: "Number of node extended resources that were rejected by nfd-master.",
})
nodeTaintsRejected = prometheus.NewCounter(prometheus.CounterOpts{
Name: nodeTaintsRejectedQuery,
Help: "Number of node taints that were rejected by nfd-master.",
})
nfrProcessingTime = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: nfrProcessingTimeQuery,
Expand All @@ -58,6 +84,10 @@ var (
"node",
},
)
nfrProcessingErrors = prometheus.NewCounter(prometheus.CounterOpts{
Name: nfrProcessingErrorsQuery,
Help: "Number of errors encountered while processing NodeFeatureRule objects.",
})
)

// registerVersion exposes the Operator build version.
Expand All @@ -68,9 +98,16 @@ func registerVersion(version string) {
// runMetricsServer starts a http server to expose metrics
func runMetricsServer(port int) {
r := prometheus.NewRegistry()
r.MustRegister(buildInfo,
r.MustRegister(
buildInfo,
nodeUpdateRequests,
nodeUpdates,
nfrProcessingTime)
nodeUpdateFailures,
nodeLabelsRejected,
nodeERsRejected,
nodeTaintsRejected,
nfrProcessingTime,
nfrProcessingErrors)

mux := http.NewServeMux()
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
Expand Down
8 changes: 8 additions & 0 deletions pkg/nfd-master/nfd-master.go
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,7 @@ func (m *nfdMaster) prune() error {
// Prune labels and extended resources
err := m.updateNodeObject(cli, node.Name, Labels{}, Annotations{}, ExtendedResources{}, []corev1.Taint{})
if err != nil {
nodeUpdateFailures.Inc()
return fmt.Errorf("failed to prune node %q: %v", node.Name, err)
}

Expand Down Expand Up @@ -509,6 +510,7 @@ func (m *nfdMaster) filterFeatureLabels(labels Labels, features *nfdv1alpha1.Fea

if value, err := m.filterFeatureLabel(name, value, features); err != nil {
klog.ErrorS(err, "ignoring label", "labelKey", name, "labelValue", value)
nodeLabelsRejected.Inc()
} else {
outLabels[name] = value
}
Expand All @@ -522,6 +524,7 @@ func (m *nfdMaster) filterFeatureLabels(labels Labels, features *nfdv1alpha1.Fea
if value, ok := outLabels[extendedResourceName]; ok {
if _, err := strconv.Atoi(value); err != nil {
klog.ErrorS(err, "bad label value encountered for extended resource", "labelKey", extendedResourceName, "labelValue", value)
nodeERsRejected.Inc()
continue // non-numeric label can't be used
}

Expand Down Expand Up @@ -602,6 +605,7 @@ func filterTaints(taints []corev1.Taint) []corev1.Taint {
for _, taint := range taints {
if err := filterTaint(&taint); err != nil {
klog.ErrorS(err, "ignoring taint", "taint", taint)
nodeTaintsRejected.Inc()
} else {
outTaints = append(outTaints, taint)
}
Expand Down Expand Up @@ -650,6 +654,7 @@ func isNamespaceDenied(labelNs string, wildcardDeniedNs map[string]struct{}, nor

// SetLabels implements LabelerServer
func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.SetLabelsReply, error) {
nodeUpdateRequests.Inc()
err := authorizeClient(c, m.args.VerifyNodeName, r.NodeName)
if err != nil {
klog.ErrorS(err, "gRPC client authorization failed", "nodeName", r.NodeName)
Expand All @@ -675,6 +680,7 @@ func (m *nfdMaster) SetLabels(c context.Context, r *pb.SetLabelsRequest) (*pb.Se

// Create labels et al
if err := m.refreshNodeFeatures(cli, r.NodeName, annotations, r.GetLabels(), r.GetFeatures()); err != nil {
nodeUpdateFailures.Inc()
return &pb.SetLabelsReply{}, err
}
}
Expand Down Expand Up @@ -784,6 +790,7 @@ func filterExtendedResources(features *nfdv1alpha1.Features, extendedResources E
capacity, err := filterExtendedResource(name, value, features)
if err != nil {
klog.ErrorS(err, "failed to create extended resources", "extendedResourceName", name, "extendedResourceValue", value)
nodeERsRejected.Inc()
} else {
outExtendedResources[name] = capacity
}
Expand Down Expand Up @@ -989,6 +996,7 @@ func (m *nfdMaster) processNodeFeatureRule(nodeName string, features *nfdv1alpha
ruleOut, err := rule.Execute(features)
if err != nil {
klog.ErrorS(err, "failed to process rule", "ruleName", rule.Name, "nodefeaturerule", klog.KObj(spec), "nodeName", nodeName)
nfrProcessingErrors.Inc()
continue
}
taints = append(taints, ruleOut.Taints...)
Expand Down
2 changes: 2 additions & 0 deletions pkg/nfd-master/node-updater-pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,15 @@ func (u *nodeUpdaterPool) processNodeUpdateRequest(queue workqueue.RateLimitingI

defer queue.Done(nodeName)

nodeUpdateRequests.Inc()
if err := u.nfdMaster.nfdAPIUpdateOneNode(nodeName.(string)); err != nil {
if queue.NumRequeues(nodeName) < 5 {
klog.InfoS("retrying node update", "nodeName", nodeName)
queue.AddRateLimited(nodeName)
return true
} else {
klog.ErrorS(err, "failed to update node", "nodeName", nodeName)
nodeUpdateFailures.Inc()
}
}
queue.Forget(nodeName)
Expand Down

0 comments on commit 6d95e59

Please sign in to comment.