From 0618e954842841b0121eba71d595148284acb402 Mon Sep 17 00:00:00 2001 From: sumeerbhola Date: Fri, 27 Oct 2023 16:12:57 -0400 Subject: [PATCH] kvserver: add metric and log when raft.Storage returns an error The raft.storage.error metric is incremented on an error, and the error is logged every 30s (across all replicas). This was motivated by a test cluster that slowed to a crawl because of deliberate data loss, but was hard to diagnose. The metric could be used for alerting, since we don't expect to see transient errors. Informs #113053 Epic: none Release note: None --- pkg/kv/kvserver/metrics.go | 8 ++++++++ pkg/kv/kvserver/replica_raftstorage.go | 27 +++++++++++++++++++++++++- 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index 17505f14c02f..14dd8e485f40 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -1418,6 +1418,12 @@ cache will already have moved on to newer entries. Measurement: "Bytes", Unit: metric.Unit_BYTES, } + metaRaftStorageError = metric.Metadata{ + Name: "raft.storage.error", + Help: "Number of calls to the raft.Storage API that returned an error", + Measurement: "Error Count", + Unit: metric.Unit_COUNT, + } // Raft message metrics. metaRaftRcvdProp = metric.Metadata{ @@ -2503,6 +2509,7 @@ type StoreMetrics struct { RaftSchedulerLatency metric.IHistogram RaftTimeoutCampaign *metric.Counter RaftStorageReadBytes *metric.Counter + RaftStorageError *metric.Counter WALBytesWritten *metric.Gauge WALBytesIn *metric.Gauge @@ -3205,6 +3212,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics { }), RaftTimeoutCampaign: metric.NewCounter(metaRaftTimeoutCampaign), RaftStorageReadBytes: metric.NewCounter(metaRaftStorageReadBytes), + RaftStorageError: metric.NewCounter(metaRaftStorageError), // Raft message metrics. RaftRcvdMessages: [maxRaftMsgType + 1]*metric.Counter{ diff --git a/pkg/kv/kvserver/replica_raftstorage.go b/pkg/kv/kvserver/replica_raftstorage.go index 824513e81758..32ba3f3194b7 100644 --- a/pkg/kv/kvserver/replica_raftstorage.go +++ b/pkg/kv/kvserver/replica_raftstorage.go @@ -83,6 +83,9 @@ func (r *replicaRaftStorage) InitialState() (raftpb.HardState, raftpb.ConfState, hs, err := r.mu.stateLoader.LoadHardState(ctx, r.store.TODOEngine()) // For uninitialized ranges, membership is unknown at this point. if raft.IsEmptyHardState(hs) || err != nil { + if err != nil { + r.reportRaftStorageError(err) + } return raftpb.HardState{}, raftpb.ConfState{}, err } cs := r.mu.state.Desc.Replicas().ConfState() @@ -97,7 +100,11 @@ func (r *replicaRaftStorage) InitialState() (raftpb.HardState, raftpb.ConfState, // // Entries can return log entries that are not yet stable in durable storage. func (r *replicaRaftStorage) Entries(lo, hi uint64, maxBytes uint64) ([]raftpb.Entry, error) { - return r.TypedEntries(kvpb.RaftIndex(lo), kvpb.RaftIndex(hi), maxBytes) + entries, err := r.TypedEntries(kvpb.RaftIndex(lo), kvpb.RaftIndex(hi), maxBytes) + if err != nil { + r.reportRaftStorageError(err) + } + return entries, err } func (r *replicaRaftStorage) TypedEntries( @@ -128,6 +135,9 @@ const invalidLastTerm = 0 // Term implements the raft.Storage interface. func (r *replicaRaftStorage) Term(i uint64) (uint64, error) { term, err := r.TypedTerm(kvpb.RaftIndex(i)) + if err != nil { + r.reportRaftStorageError(err) + } return uint64(term), err } @@ -166,6 +176,9 @@ func (r *Replica) raftLastIndexRLocked() kvpb.RaftIndex { // LastIndex requires that r.mu is held for reading. func (r *replicaRaftStorage) LastIndex() (uint64, error) { index, err := r.TypedLastIndex() + if err != nil { + r.reportRaftStorageError(err) + } return uint64(index), err } @@ -190,6 +203,9 @@ func (r *Replica) raftFirstIndexRLocked() kvpb.RaftIndex { // FirstIndex requires that r.mu is held for reading. func (r *replicaRaftStorage) FirstIndex() (uint64, error) { index, err := r.TypedFirstIndex() + if err != nil { + r.reportRaftStorageError(err) + } return uint64(index), err } @@ -237,6 +253,15 @@ func (r *replicaRaftStorage) Snapshot() (raftpb.Snapshot, error) { }, nil } +var raftStorageErrorLogger = log.Every(30 * time.Second) + +func (r *replicaRaftStorage) reportRaftStorageError(err error) { + if raftStorageErrorLogger.ShouldLog() { + log.Errorf(r.raftCtx, "error in raft.Storage %v", err) + } + r.store.metrics.RaftStorageError.Inc(1) +} + // raftSnapshotLocked requires that r.mu is held for writing. func (r *Replica) raftSnapshotLocked() (raftpb.Snapshot, error) { return (*replicaRaftStorage)(r).Snapshot()