From 952e37148bdbb8d4cfd93fc990d8047439e44326 Mon Sep 17 00:00:00 2001 From: corver Date: Mon, 17 Oct 2022 15:48:21 +0200 Subject: [PATCH] app: instrument not ready reasons (#1302) Instruments not ready reasons for improved UX. Also fix a "concurrent map write" panic in peerinfo. category: feature ticket: #999 --- app/metrics.go | 18 ++++- app/monitoringapi.go | 22 +++--- app/monitoringapi_internal_test.go | 4 +- app/peerinfo/peerinfo.go | 5 ++ app/simnet_test.go | 8 ++- .../compose/static/grafana/dash_simnet.json | 71 +++++++++++++++---- 6 files changed, 97 insertions(+), 31 deletions(-) diff --git a/app/metrics.go b/app/metrics.go index c8ed25904..d29aa7082 100644 --- a/app/metrics.go +++ b/app/metrics.go @@ -23,6 +23,18 @@ import ( "github.com/obolnetwork/charon/eth2util" ) +const ( + // readyzReady indicates that readyz returns 200s and the node is operational. + readyzReady = 1 + // readyzBeaconNodeDown indicates the readyz is returning 500s since the Beacon Node API is down. + readyzBeaconNodeDown = 2 + // readyzBeaconNodeSyncing indicates the readyz is returning 500s since the Beacon Node is syncing. + readyzBeaconNodeSyncing = 3 + // readyzInsufficientPeers indicates the readyz is returning 500s since this node isn't connected + // to quorum peers via the P2P network. + readyzInsufficientPeers = 4 +) + var ( versionGauge = promauto.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "app", @@ -52,7 +64,11 @@ var ( Namespace: "app", Subsystem: "monitoring", Name: "readyz", - Help: "Set to 1 if monitoring api `/readyz` endpoint returned 200 or else 0", + Help: "Set to 1 if the node is operational and monitoring api `/readyz` endpoint is returning 200s. " + + "Else `/readyz` is returning 500s and this metric is either set to " + + "2 if the beacon node is down, or" + + "3 if the beacon node is syncing, or" + + "4 if quorum peers are not connected.", }) thresholdGauge = promauto.NewGauge(prometheus.GaugeOpts{ diff --git a/app/monitoringapi.go b/app/monitoringapi.go index 152d0c28e..9789399f2 100644 --- a/app/monitoringapi.go +++ b/app/monitoringapi.go @@ -37,10 +37,10 @@ import ( ) var ( - errReadyUninitialised = errors.New("ready check uninitialised") - errReadyTooFewPeers = errors.New("quorum peers not connected") - errReadySyncing = errors.New("beacon node not synced") - errReadyBeaconNodeFailed = errors.New("failed to get beacon sync state") + errReadyUninitialised = errors.New("ready check uninitialised") + errReadyInsufficientPeers = errors.New("quorum peers not connected") + errReadyBeaconNodeSyncing = errors.New("beacon node not synced") + errReadyBeaconNodeDown = errors.New("beacon node down") ) // wireMonitoringAPI constructs the monitoring API and registers it with the life cycle manager. @@ -118,16 +118,16 @@ func startReadyChecker(ctx context.Context, tcpNode host.Host, eth2Cl eth2client syncing, err := beaconNodeSyncing(ctx, eth2Cl) if err != nil { - err = errReadyBeaconNodeFailed - readyzGauge.Set(0) + err = errReadyBeaconNodeDown + readyzGauge.Set(readyzBeaconNodeDown) } else if syncing { - err = errReadySyncing - readyzGauge.Set(0) + err = errReadyBeaconNodeSyncing + readyzGauge.Set(readyzBeaconNodeSyncing) } else if notConnectedRounds >= minNotConnected { - err = errReadyTooFewPeers - readyzGauge.Set(0) + err = errReadyInsufficientPeers + readyzGauge.Set(readyzInsufficientPeers) } else { - readyzGauge.Set(1) + readyzGauge.Set(readyzReady) } mu.Lock() diff --git a/app/monitoringapi_internal_test.go b/app/monitoringapi_internal_test.go index fb8f921cc..510c9c8a0 100644 --- a/app/monitoringapi_internal_test.go +++ b/app/monitoringapi_internal_test.go @@ -50,14 +50,14 @@ func TestStartChecker(t *testing.T) { isSyncing: true, numPeers: 5, absentPeers: 0, - err: errReadySyncing, + err: errReadyBeaconNodeSyncing, }, { name: "too few peers", isSyncing: false, numPeers: 5, absentPeers: 3, - err: errReadyTooFewPeers, + err: errReadyInsufficientPeers, }, { name: "success", diff --git a/app/peerinfo/peerinfo.go b/app/peerinfo/peerinfo.go index 3ea0261fc..764f99b51 100644 --- a/app/peerinfo/peerinfo.go +++ b/app/peerinfo/peerinfo.go @@ -19,6 +19,7 @@ import ( "bytes" "context" "fmt" + "sync" "testing" "time" @@ -210,6 +211,7 @@ func supported(protocols []string) bool { // newMetricsSubmitter returns a prometheus metric submitter. func newMetricsSubmitter() metricSubmitter { var ( + mu sync.Mutex prevVersions = make(map[string]string) prevGitHashes = make(map[string]string) ) @@ -238,6 +240,9 @@ func newMetricsSubmitter() metricSubmitter { peerGitHash.WithLabelValues(peerName, gitHash).Set(1) // Clear previous metrics if changed + mu.Lock() + defer mu.Unlock() + if prev, ok := prevVersions[peerName]; ok && version != prev { peerVersion.WithLabelValues(peerName, prev).Set(0) } diff --git a/app/simnet_test.go b/app/simnet_test.go index 7300e53d2..fff9ee65d 100644 --- a/app/simnet_test.go +++ b/app/simnet_test.go @@ -288,8 +288,12 @@ func testSimnet(t *testing.T, args simnetArgs, expect simnetExpect) { ParSigExFunc: parSigExFunc, LcastTransportFunc: lcastTransportFunc, BroadcastCallback: func(ctx context.Context, duty core.Duty, key core.PubKey, data core.SignedData) error { - results <- simResult{Duty: duty, Pubkey: key, Data: data} - return nil + select { + case <-ctx.Done(): + return ctx.Err() + case results <- simResult{Duty: duty, Pubkey: key, Data: data}: + return nil + } }, SimnetBMockOpts: append([]beaconmock.Option{ beaconmock.WithSlotsPerEpoch(1), diff --git a/testutil/compose/static/grafana/dash_simnet.json b/testutil/compose/static/grafana/dash_simnet.json index cd612788f..d3d49f29b 100644 --- a/testutil/compose/static/grafana/dash_simnet.json +++ b/testutil/compose/static/grafana/dash_simnet.json @@ -327,14 +327,29 @@ { "options": { "0": { - "color": "orange", + "color": "text", "index": 1, - "text": "NOK" + "text": "Unknown" }, "1": { "color": "green", "index": 0, "text": "OK" + }, + "2": { + "color": "red", + "index": 3, + "text": "BeaconNode Down" + }, + "3": { + "color": "orange", + "index": 4, + "text": "BeaconNode Syncing" + }, + "4": { + "color": "orange", + "index": 5, + "text": "Insufficient Peers" } }, "type": "value" @@ -343,6 +358,7 @@ "options": { "match": "null", "result": { + "color": "text", "index": 2, "text": "Dead" } @@ -397,7 +413,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "app_monitoring_readyz{job=\"$node\"} ", + "expr": "app_monitoring_readyz{job=\"$node\"}", "hide": false, "instant": true, "interval": "", @@ -772,7 +788,21 @@ "properties": [ { "id": "unit", - "value": "s" + "value": "ms" + }, + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "index": 0, + "text": "🆗" + } + }, + "type": "value" + } + ] } ] }, @@ -997,7 +1027,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "max(app_peerinfo_clock_offset_seconds{job=\"$node\"}) by (peer)", + "expr": "max(round(app_peerinfo_clock_offset_seconds{job=\"$node\"},0.1)*1000) by (peer) ", "format": "table", "hide": false, "instant": true, @@ -1041,7 +1071,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(app_peerinfo_git_commit{job=\"$node\"}) by (peer,git_hash) ", + "expr": "max(app_peerinfo_git_commit{job=\"$node\"}) by (peer,git_hash) > 0", "format": "table", "hide": false, "instant": true, @@ -1065,6 +1095,7 @@ "Time 1": true, "Time 10": true, "Time 11": true, + "Time 12": true, "Time 2": true, "Time 3": true, "Time 4": true, @@ -1113,6 +1144,7 @@ "Value #F": "You", "Value #G": "PrepareAgg", "Value #H": "Aggregate", + "Value #I": "Exit", "Value #J": "ClockDiff", "Value #K": "Direct", "Value #L": "", @@ -1375,7 +1407,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1507,7 +1540,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1599,7 +1633,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1691,7 +1726,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1783,7 +1819,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1876,7 +1913,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1983,7 +2021,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -2078,7 +2117,8 @@ "mode": "absolute", "steps": [ { - "color": "light-blue" + "color": "light-blue", + "value": null }, { "color": "red", @@ -2282,7 +2322,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red",