Skip to content

Commit

Permalink
app: instrument not ready reasons (#1302)
Browse files Browse the repository at this point in the history
Instruments not ready reasons for improved UX. 
Also fix a "concurrent map write" panic in peerinfo.

category: feature 
ticket: #999
  • Loading branch information
corverroos authored Oct 17, 2022
1 parent c2a8f27 commit 952e371
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 31 deletions.
18 changes: 17 additions & 1 deletion app/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,18 @@ import (
"github.com/obolnetwork/charon/eth2util"
)

const (
// readyzReady indicates that readyz returns 200s and the node is operational.
readyzReady = 1
// readyzBeaconNodeDown indicates the readyz is returning 500s since the Beacon Node API is down.
readyzBeaconNodeDown = 2
// readyzBeaconNodeSyncing indicates the readyz is returning 500s since the Beacon Node is syncing.
readyzBeaconNodeSyncing = 3
// readyzInsufficientPeers indicates the readyz is returning 500s since this node isn't connected
// to quorum peers via the P2P network.
readyzInsufficientPeers = 4
)

var (
versionGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: "app",
Expand Down Expand Up @@ -52,7 +64,11 @@ var (
Namespace: "app",
Subsystem: "monitoring",
Name: "readyz",
Help: "Set to 1 if monitoring api `/readyz` endpoint returned 200 or else 0",
Help: "Set to 1 if the node is operational and monitoring api `/readyz` endpoint is returning 200s. " +
"Else `/readyz` is returning 500s and this metric is either set to " +
"2 if the beacon node is down, or" +
"3 if the beacon node is syncing, or" +
"4 if quorum peers are not connected.",
})

thresholdGauge = promauto.NewGauge(prometheus.GaugeOpts{
Expand Down
22 changes: 11 additions & 11 deletions app/monitoringapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@ import (
)

var (
errReadyUninitialised = errors.New("ready check uninitialised")
errReadyTooFewPeers = errors.New("quorum peers not connected")
errReadySyncing = errors.New("beacon node not synced")
errReadyBeaconNodeFailed = errors.New("failed to get beacon sync state")
errReadyUninitialised = errors.New("ready check uninitialised")
errReadyInsufficientPeers = errors.New("quorum peers not connected")
errReadyBeaconNodeSyncing = errors.New("beacon node not synced")
errReadyBeaconNodeDown = errors.New("beacon node down")
)

// wireMonitoringAPI constructs the monitoring API and registers it with the life cycle manager.
Expand Down Expand Up @@ -118,16 +118,16 @@ func startReadyChecker(ctx context.Context, tcpNode host.Host, eth2Cl eth2client

syncing, err := beaconNodeSyncing(ctx, eth2Cl)
if err != nil {
err = errReadyBeaconNodeFailed
readyzGauge.Set(0)
err = errReadyBeaconNodeDown
readyzGauge.Set(readyzBeaconNodeDown)
} else if syncing {
err = errReadySyncing
readyzGauge.Set(0)
err = errReadyBeaconNodeSyncing
readyzGauge.Set(readyzBeaconNodeSyncing)
} else if notConnectedRounds >= minNotConnected {
err = errReadyTooFewPeers
readyzGauge.Set(0)
err = errReadyInsufficientPeers
readyzGauge.Set(readyzInsufficientPeers)
} else {
readyzGauge.Set(1)
readyzGauge.Set(readyzReady)
}

mu.Lock()
Expand Down
4 changes: 2 additions & 2 deletions app/monitoringapi_internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,14 @@ func TestStartChecker(t *testing.T) {
isSyncing: true,
numPeers: 5,
absentPeers: 0,
err: errReadySyncing,
err: errReadyBeaconNodeSyncing,
},
{
name: "too few peers",
isSyncing: false,
numPeers: 5,
absentPeers: 3,
err: errReadyTooFewPeers,
err: errReadyInsufficientPeers,
},
{
name: "success",
Expand Down
5 changes: 5 additions & 0 deletions app/peerinfo/peerinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"bytes"
"context"
"fmt"
"sync"
"testing"
"time"

Expand Down Expand Up @@ -210,6 +211,7 @@ func supported(protocols []string) bool {
// newMetricsSubmitter returns a prometheus metric submitter.
func newMetricsSubmitter() metricSubmitter {
var (
mu sync.Mutex
prevVersions = make(map[string]string)
prevGitHashes = make(map[string]string)
)
Expand Down Expand Up @@ -238,6 +240,9 @@ func newMetricsSubmitter() metricSubmitter {
peerGitHash.WithLabelValues(peerName, gitHash).Set(1)

// Clear previous metrics if changed
mu.Lock()
defer mu.Unlock()

if prev, ok := prevVersions[peerName]; ok && version != prev {
peerVersion.WithLabelValues(peerName, prev).Set(0)
}
Expand Down
8 changes: 6 additions & 2 deletions app/simnet_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -288,8 +288,12 @@ func testSimnet(t *testing.T, args simnetArgs, expect simnetExpect) {
ParSigExFunc: parSigExFunc,
LcastTransportFunc: lcastTransportFunc,
BroadcastCallback: func(ctx context.Context, duty core.Duty, key core.PubKey, data core.SignedData) error {
results <- simResult{Duty: duty, Pubkey: key, Data: data}
return nil
select {
case <-ctx.Done():
return ctx.Err()
case results <- simResult{Duty: duty, Pubkey: key, Data: data}:
return nil
}
},
SimnetBMockOpts: append([]beaconmock.Option{
beaconmock.WithSlotsPerEpoch(1),
Expand Down
71 changes: 56 additions & 15 deletions testutil/compose/static/grafana/dash_simnet.json
Original file line number Diff line number Diff line change
Expand Up @@ -327,14 +327,29 @@
{
"options": {
"0": {
"color": "orange",
"color": "text",
"index": 1,
"text": "NOK"
"text": "Unknown"
},
"1": {
"color": "green",
"index": 0,
"text": "OK"
},
"2": {
"color": "red",
"index": 3,
"text": "BeaconNode Down"
},
"3": {
"color": "orange",
"index": 4,
"text": "BeaconNode Syncing"
},
"4": {
"color": "orange",
"index": 5,
"text": "Insufficient Peers"
}
},
"type": "value"
Expand All @@ -343,6 +358,7 @@
"options": {
"match": "null",
"result": {
"color": "text",
"index": 2,
"text": "Dead"
}
Expand Down Expand Up @@ -397,7 +413,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "app_monitoring_readyz{job=\"$node\"} ",
"expr": "app_monitoring_readyz{job=\"$node\"}",
"hide": false,
"instant": true,
"interval": "",
Expand Down Expand Up @@ -772,7 +788,21 @@
"properties": [
{
"id": "unit",
"value": "s"
"value": "ms"
},
{
"id": "mappings",
"value": [
{
"options": {
"0": {
"index": 0,
"text": "🆗"
}
},
"type": "value"
}
]
}
]
},
Expand Down Expand Up @@ -997,7 +1027,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "max(app_peerinfo_clock_offset_seconds{job=\"$node\"}) by (peer)",
"expr": "max(round(app_peerinfo_clock_offset_seconds{job=\"$node\"},0.1)*1000) by (peer) ",
"format": "table",
"hide": false,
"instant": true,
Expand Down Expand Up @@ -1041,7 +1071,7 @@
},
"editorMode": "code",
"exemplar": false,
"expr": "sum(app_peerinfo_git_commit{job=\"$node\"}) by (peer,git_hash) ",
"expr": "max(app_peerinfo_git_commit{job=\"$node\"}) by (peer,git_hash) > 0",
"format": "table",
"hide": false,
"instant": true,
Expand All @@ -1065,6 +1095,7 @@
"Time 1": true,
"Time 10": true,
"Time 11": true,
"Time 12": true,
"Time 2": true,
"Time 3": true,
"Time 4": true,
Expand Down Expand Up @@ -1113,6 +1144,7 @@
"Value #F": "You",
"Value #G": "PrepareAgg",
"Value #H": "Aggregate",
"Value #I": "Exit",
"Value #J": "ClockDiff",
"Value #K": "Direct",
"Value #L": "",
Expand Down Expand Up @@ -1375,7 +1407,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1507,7 +1540,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1599,7 +1633,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1691,7 +1726,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1783,7 +1819,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1876,7 +1913,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -1983,7 +2021,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -2078,7 +2117,8 @@
"mode": "absolute",
"steps": [
{
"color": "light-blue"
"color": "light-blue",
"value": null
},
{
"color": "red",
Expand Down Expand Up @@ -2282,7 +2322,8 @@
"mode": "absolute",
"steps": [
{
"color": "green"
"color": "green",
"value": null
},
{
"color": "red",
Expand Down

0 comments on commit 952e371

Please sign in to comment.