From 063ebceeab78186f8728e31e1e065bcfdb83ac15 Mon Sep 17 00:00:00 2001
From: sumeerbhola
Date: Fri, 27 Oct 2023 16:12:57 -0400
Subject: [PATCH] kvserver: add metric and log when raft.Storage returns an error

The raft.storage.error metric is incremented on an error, and the error
is logged every 30s (across all replicas).

This was motivated by a test cluster that slowed to a crawl because of
deliberate data loss, but was hard to diagnose. The metric could be used
for alerting, since we don't expect to see transient errors.

Informs #113053

Epic: none

Release note: None
---
 docs/generated/metrics/metrics.html    |  1 +
 pkg/kv/kvserver/metrics.go             |  8 ++++++++
 pkg/kv/kvserver/replica_raftstorage.go | 27 +++++++++++++++++++++++++-
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html
index 0a8dcfeb9856..c6318a531e88 100644
--- a/docs/generated/metrics/metrics.html
+++ b/docs/generated/metrics/metrics.html
@@ -453,6 +453,7 @@
STORAGEraft.sent.bytesNumber of bytes in Raft messages sent by this store. Note that
this does not include raft snapshot sent.BytesCOUNTERBYTESAVGNON_NEGATIVE_DERIVATIVE
STORAGEraft.sent.cross_region.bytesNumber of bytes sent by this store for cross region Raft messages
(when region tiers are configured). Note that this does not include raft
snapshot sent.BytesCOUNTERBYTESAVGNON_NEGATIVE_DERIVATIVE
STORAGEraft.sent.cross_zone.bytesNumber of bytes sent by this store for cross zone, same region Raft
messages (when region and zone tiers are configured). If region tiers are
not configured, this count may include data sent between different regions.
To ensure accurate monitoring of transmitted data, it is important to set up
a consistent locality configuration across nodes. Note that this does not
include raft snapshot sent.BytesCOUNTERBYTESAVGNON_NEGATIVE_DERIVATIVE
+STORAGEraft.storage.errorNumber of Raft storage errorsError CountCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE
STORAGEraft.storage.read_bytesCounter of raftpb.Entry.Size() read from pebble for raft log entries.

These are the bytes returned from the (raft.Storage).Entries method that were not
returned via the raft entry cache. This metric plus the raft.entrycache.read_bytes
metric represent the total bytes returned from the Entries method.

Since pebble might serve these entries from the block cache, only a fraction of this
throughput might manifest in disk metrics.

Entries tracked in this metric incur an unmarshalling-related CPU and memory
overhead that would not be incurred would the entries be served from the raft
entry cache.

The bytes returned here do not correspond 1:1 to bytes read from pebble. This
metric measures the in-memory size of the raftpb.Entry, whereas we read its
encoded representation from pebble. As there is no compression involved, these
will generally be comparable.

A common reason for elevated measurements on this metric is that a store is
falling behind on raft log application. The raft entry cache generally tracks
entries that were recently appended, so if log application falls behind the
cache will already have moved on to newer entries.
BytesCOUNTERBYTESAVGNON_NEGATIVE_DERIVATIVE
STORAGEraft.ticksNumber of Raft ticks queuedTicksCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE
STORAGEraft.timeoutcampaignNumber of Raft replicas campaigning after missed heartbeats from leaderElections called after timeoutCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE
diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go
index 17505f14c02f..e1dd41cf83a5 100644
--- a/pkg/kv/kvserver/metrics.go
+++ b/pkg/kv/kvserver/metrics.go
@@ -1418,6 +1418,12 @@ cache will already have moved on to newer entries.
 		Measurement: "Bytes",
 		Unit:        metric.Unit_BYTES,
 	}
+	metaRaftStorageError = metric.Metadata{
+		Name:        "raft.storage.error",
+		Help:        "Number of Raft storage errors",
+		Measurement: "Error Count",
+		Unit:        metric.Unit_COUNT,
+	}
 
 	// Raft message metrics.
 	metaRaftRcvdProp = metric.Metadata{
@@ -2503,6 +2509,7 @@ type StoreMetrics struct {
 	RaftSchedulerLatency metric.IHistogram
 	RaftTimeoutCampaign  *metric.Counter
 	RaftStorageReadBytes *metric.Counter
+	RaftStorageError     *metric.Counter
 
 	WALBytesWritten *metric.Gauge
 	WALBytesIn      *metric.Gauge
@@ -3205,6 +3212,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		}),
 		RaftTimeoutCampaign:  metric.NewCounter(metaRaftTimeoutCampaign),
 		RaftStorageReadBytes: metric.NewCounter(metaRaftStorageReadBytes),
+		RaftStorageError:     metric.NewCounter(metaRaftStorageError),
 
 		// Raft message metrics.
 		RaftRcvdMessages: [maxRaftMsgType + 1]*metric.Counter{
diff --git a/pkg/kv/kvserver/replica_raftstorage.go b/pkg/kv/kvserver/replica_raftstorage.go
index 824513e81758..32ba3f3194b7 100644
--- a/pkg/kv/kvserver/replica_raftstorage.go
+++ b/pkg/kv/kvserver/replica_raftstorage.go
@@ -83,6 +83,9 @@ func (r *replicaRaftStorage) InitialState() (raftpb.HardState, raftpb.ConfState,
 	hs, err := r.mu.stateLoader.LoadHardState(ctx, r.store.TODOEngine())
 	// For uninitialized ranges, membership is unknown at this point.
 	if raft.IsEmptyHardState(hs) || err != nil {
+		if err != nil {
+			r.reportRaftStorageError(err)
+		}
 		return raftpb.HardState{}, raftpb.ConfState{}, err
 	}
 	cs := r.mu.state.Desc.Replicas().ConfState()
@@ -97,7 +100,11 @@ func (r *replicaRaftStorage) InitialState() (raftpb.HardState, raftpb.ConfState,
 //
 // Entries can return log entries that are not yet stable in durable storage.
 func (r *replicaRaftStorage) Entries(lo, hi uint64, maxBytes uint64) ([]raftpb.Entry, error) {
-	return r.TypedEntries(kvpb.RaftIndex(lo), kvpb.RaftIndex(hi), maxBytes)
+	entries, err := r.TypedEntries(kvpb.RaftIndex(lo), kvpb.RaftIndex(hi), maxBytes)
+	if err != nil {
+		r.reportRaftStorageError(err)
+	}
+	return entries, err
 }
 
 func (r *replicaRaftStorage) TypedEntries(
@@ -128,6 +135,9 @@ const invalidLastTerm = 0
 // Term implements the raft.Storage interface.
 func (r *replicaRaftStorage) Term(i uint64) (uint64, error) {
 	term, err := r.TypedTerm(kvpb.RaftIndex(i))
+	if err != nil {
+		r.reportRaftStorageError(err)
+	}
 	return uint64(term), err
 }
 
@@ -166,6 +176,9 @@ func (r *Replica) raftLastIndexRLocked() kvpb.RaftIndex {
 // LastIndex requires that r.mu is held for reading.
 func (r *replicaRaftStorage) LastIndex() (uint64, error) {
 	index, err := r.TypedLastIndex()
+	if err != nil {
+		r.reportRaftStorageError(err)
+	}
 	return uint64(index), err
 }
 
@@ -190,6 +203,9 @@ func (r *Replica) raftFirstIndexRLocked() kvpb.RaftIndex {
 // FirstIndex requires that r.mu is held for reading.
 func (r *replicaRaftStorage) FirstIndex() (uint64, error) {
 	index, err := r.TypedFirstIndex()
+	if err != nil {
+		r.reportRaftStorageError(err)
+	}
 	return uint64(index), err
 }
 
@@ -237,6 +253,15 @@ func (r *replicaRaftStorage) Snapshot() (raftpb.Snapshot, error) {
 	}, nil
 }
 
+var raftStorageErrorLogger = log.Every(30 * time.Second)
+
+func (r *replicaRaftStorage) reportRaftStorageError(err error) {
+	if raftStorageErrorLogger.ShouldLog() {
+		log.Errorf(r.raftCtx, "error in raft.Storage %v", err)
+	}
+	r.store.metrics.RaftStorageError.Inc(1)
+}
+
 // raftSnapshotLocked requires that r.mu is held for writing.
 func (r *Replica) raftSnapshotLocked() (raftpb.Snapshot, error) {
 	return (*replicaRaftStorage)(r).Snapshot()
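
For illustration, a self-contained sketch of the reporting pattern this patch introduces: count every raft.Storage error, but log the error text at most once per interval so a persistently failing store cannot flood the logs. This is not CockroachDB code; the type "every" and the names reportError and raftStorageErrors are stand-ins for the log.Every helper, reportRaftStorageError, and the raft.storage.error counter.

package main

import (
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// every permits an action at most once per interval, a simplified analogue of
// the log.Every helper used in the patch.
type every struct {
	mu       sync.Mutex
	interval time.Duration
	last     time.Time
}

func (e *every) shouldLog() bool {
	e.mu.Lock()
	defer e.mu.Unlock()
	if now := time.Now(); now.Sub(e.last) >= e.interval {
		e.last = now
		return true
	}
	return false
}

var (
	// Illustrative stand-ins; the real code uses a package-level log.Every
	// and the store's RaftStorageError counter.
	errorEvery        = every{interval: 30 * time.Second}
	raftStorageErrors atomic.Int64
)

// reportError mirrors the shape of reportRaftStorageError: always increment
// the counter, log at most once per 30s.
func reportError(err error) {
	if errorEvery.shouldLog() {
		fmt.Printf("error in raft.Storage %v\n", err)
	}
	raftStorageErrors.Add(1)
}

func main() {
	for i := 0; i < 5; i++ {
		reportError(errors.New("hypothetical pebble read error"))
	}
	// All five errors are counted, but only the first one was logged.
	fmt.Println("raft.storage.error =", raftStorageErrors.Load())
}

Because the counter is incremented unconditionally, it stays accurate for alerting even while the log line is suppressed; only the log volume is throttled.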