Skip to content

Commit

Permalink
core/tracker: ensure success/fail/expected counters (#2080)
Browse files Browse the repository at this point in the history
Adds success, failure, and expected counters for both per-peer participation and overall duties. This lets our dashboards show duty and participation success/failure percentages with simple queries.

category: misc
ticket: #2034
  • Loading branch information
corverroos authored and LukeHackett12 committed Apr 11, 2023
1 parent deaea1c commit e77e7da
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 10 deletions.
33 changes: 31 additions & 2 deletions core/tracker/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,56 @@ var (
Help: "Set to 1 if peer participated successfully for the given duty or else 0",
}, []string{"duty", "peer"})

participationSuccess = promauto.NewCounterVec(prometheus.CounterOpts{
// participationSuccessLegacy counts successful participations per duty type and peer under the
// older "participation_total" metric name. It is zero-initialised and incremented alongside
// participationSuccess so existing dashboards keep working.
// TODO(corver): Remove in v0.17 once all dashboards have been updated.
participationSuccessLegacy = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "core",
Subsystem: "tracker",
Name: "participation_total",
Help: "Total number of successful participations by peer and duty type",
}, []string{"duty", "peer"})

// participationSuccess counts successful participations, labelled by duty type and peer name.
// The participation reporter increments it for each peer whose share participated in a duty.
participationSuccess = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "core",
Subsystem: "tracker",
Name: "participation_success_total",
Help: "Total number of successful participations by peer and duty type",
}, []string{"duty", "peer"})

// participationMissed counts missed participations, labelled by duty type and peer name.
// The participation reporter increments it for peers that neither participated nor
// produced an unexpected event for the duty.
participationMissed = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "core",
Subsystem: "tracker",
Name: "participation_missed_total",
Help: "Total number of missed participations by peer and duty type",
}, []string{"duty", "peer"})

failedCounter = promauto.NewCounterVec(prometheus.CounterOpts{
// participationExpect counts expected participations, labelled by duty type and peer name.
// It is incremented together with either participationSuccess or participationMissed, so it
// can serve as the denominator for participation percentage queries. It is not incremented
// when only an unexpected event is recorded for the peer.
participationExpect = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "core",
Subsystem: "tracker",
Name: "participation_expected_total",
Help: "Total number of expected participations (fail + success) by peer and duty type",
}, []string{"duty", "peer"})

// dutyFailed counts failed duties, labelled by duty type.
// It is zero-initialised for all duty types and incremented on the failure path of the
// failed-duty reporter.
dutyFailed = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "core",
Subsystem: "tracker",
Name: "failed_duties_total",
Help: "Total number of failed duties by type",
}, []string{"duty"})

// dutySuccess counts successful duties, labelled by duty type.
// The failed-duty reporter increments it when a duty did not fail, except when the reported
// step is the fetcher step, in which case the duty is not counted at all.
dutySuccess = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "core",
Subsystem: "tracker",
Name: "success_duties_total",
Help: "Total number of successful duties by type",
}, []string{"duty"})

// dutyExpect counts expected duties, labelled by duty type.
// It is incremented together with either dutyFailed or dutySuccess, so it can serve as the
// denominator for duty success/failure percentage queries.
dutyExpect = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "core",
Subsystem: "tracker",
Name: "expect_duties_total",
Help: "Total number of expected duties (failed + success) by type",
}, []string{"duty"})

unexpectedEventsCounter = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: "core",
Subsystem: "tracker",
Expand Down
35 changes: 27 additions & 8 deletions core/tracker/tracker.go
Original file line number Diff line number Diff line change
Expand Up @@ -568,11 +568,23 @@ func extractParSigs(ctx context.Context, events []event) parsigsByMsg {
func newFailedDutyReporter() func(ctx context.Context, duty core.Duty, failed bool, step step, reason string, err error) {
var loggedNoSelections bool

return func(ctx context.Context, duty core.Duty, failed bool, step step, reason string, err error) {
counter := failedCounter.WithLabelValues(duty.Type.String())
counter.Add(0) // Zero the metric so first failure shows in grafana.
// Initialise counters to 0 to avoid non-existent metrics issues when querying prometheus.
for _, dutyType := range core.AllDutyTypes() {
dutyFailed.WithLabelValues(dutyType.String()).Add(0)
dutySuccess.WithLabelValues(dutyType.String()).Add(0)
dutyExpect.WithLabelValues(dutyType.String()).Add(0)
}

return func(ctx context.Context, duty core.Duty, failed bool, step step, reason string, err error) {
if !failed {
if step == fetcher {
// TODO(corver): improve detection of duties that are not expected to be performed (aggregation).
return
}

dutySuccess.WithLabelValues(duty.Type.String()).Inc()
dutyExpect.WithLabelValues(duty.Type.String()).Inc()

return
}

Expand All @@ -599,7 +611,8 @@ func newFailedDutyReporter() func(ctx context.Context, duty core.Duty, failed bo
z.Str("reason", reason),
)

counter.Inc()
dutyFailed.WithLabelValues(duty.Type.String()).Inc()
dutyExpect.WithLabelValues(duty.Type.String()).Inc()
}
}

Expand Down Expand Up @@ -669,11 +682,14 @@ func newParticipationReporter(peers []p2p.Peer) func(context.Context, core.Duty,
// prevAbsent is the set of peers who didn't participate in the last duty per type.
prevAbsent := make(map[core.DutyType][]string)

// Initialise participation metrics to 0 to avoid non-existent metrics issue on startup.
for _, duty := range core.AllDutyTypes() {
// Initialise counters to 0 to avoid non-existent metrics issues when querying prometheus.
for _, dutyType := range core.AllDutyTypes() {
duty := dutyType.String()
for _, peer := range peers {
participationSuccess.WithLabelValues(duty.String(), peer.Name).Add(0)
participationMissed.WithLabelValues(duty.String(), peer.Name).Add(0)
participationSuccess.WithLabelValues(duty, peer.Name).Add(0)
participationSuccessLegacy.WithLabelValues(duty, peer.Name).Add(0)
participationMissed.WithLabelValues(duty, peer.Name).Add(0)
participationExpect.WithLabelValues(duty, peer.Name).Add(0)
}
}

Expand All @@ -688,13 +704,16 @@ func newParticipationReporter(peers []p2p.Peer) func(context.Context, core.Duty,
if participatedShares[peer.ShareIdx()] {
participationGauge.WithLabelValues(duty.Type.String(), peer.Name).Set(1)
participationSuccess.WithLabelValues(duty.Type.String(), peer.Name).Inc()
participationSuccessLegacy.WithLabelValues(duty.Type.String(), peer.Name).Inc()
participationExpect.WithLabelValues(duty.Type.String(), peer.Name).Inc()
} else if unexpectedShares[peer.ShareIdx()] {
log.Warn(ctx, "Unexpected event found", nil, z.Str("peer", peer.Name), z.Str("duty", duty.String()))
unexpectedEventsCounter.WithLabelValues(peer.Name).Inc()
} else {
absentPeers = append(absentPeers, peer.Name)
participationGauge.WithLabelValues(duty.Type.String(), peer.Name).Set(0)
participationMissed.WithLabelValues(duty.Type.String(), peer.Name).Inc()
participationExpect.WithLabelValues(duty.Type.String(), peer.Name).Inc()
}
}

Expand Down

0 comments on commit e77e7da

Please sign in to comment.