From 063ebceeab78186f8728e31e1e065bcfdb83ac15 Mon Sep 17 00:00:00 2001
From: sumeerbhola
Date: Fri, 27 Oct 2023 16:12:57 -0400
Subject: [PATCH] kvserver: add metric and log when raft.Storage returns an error

The raft.storage.error metric is incremented on an error, and the error
is logged every 30s (across all replicas).

This was motivated by a test cluster that slowed to a crawl because of
deliberate data loss, but was hard to diagnose. The metric could be used
for alerting, since we don't expect to see transient errors.

Informs #113053

Epic: none

Release note: None
---
 docs/generated/metrics/metrics.html    |  1 +
 pkg/kv/kvserver/metrics.go             |  8 ++++++++
 pkg/kv/kvserver/replica_raftstorage.go | 27 +++++++++++++++++++++++++-
 3 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html
index 0a8dcfeb9856..c6318a531e88 100644
--- a/docs/generated/metrics/metrics.html
+++ b/docs/generated/metrics/metrics.html
@@ -453,6 +453,7 @@
STORAGEraft.sent.bytesNumber of bytes in Raft messages sent by this store. Note that
this does not include raft snapshot sent.BytesCOUNTERBYTESAVGNON_NEGATIVE_DERIVATIVE
STORAGEraft.sent.cross_region.bytesNumber of bytes sent by this store for cross region Raft messages
(when region tiers are configured). Note that this does not include raft
snapshot sent.BytesCOUNTERBYTESAVGNON_NEGATIVE_DERIVATIVE
STORAGEraft.sent.cross_zone.bytesNumber of bytes sent by this store for cross zone, same region Raft
messages (when region and zone tiers are configured). If region tiers are
not configured, this count may include data sent between different regions.
To ensure accurate monitoring of transmitted data, it is important to set up
a consistent locality configuration across nodes. Note that this does not
include raft snapshot sent.BytesCOUNTERBYTESAVGNON_NEGATIVE_DERIVATIVE
+STORAGEraft.storage.errorNumber of Raft storage errorsError CountCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE
STORAGEraft.storage.read_bytesCounter of raftpb.Entry.Size() read from pebble for raft log entries.

These are the bytes returned from the (raft.Storage).Entries method that were not
returned via the raft entry cache. This metric plus the raft.entrycache.read_bytes
metric represent the total bytes returned from the Entries method.

Since pebble might serve these entries from the block cache, only a fraction of this
throughput might manifest in disk metrics.

Entries tracked in this metric incur an unmarshalling-related CPU and memory
overhead that would not be incurred would the entries be served from the raft
entry cache.

The bytes returned here do not correspond 1:1 to bytes read from pebble. This
metric measures the in-memory size of the raftpb.Entry, whereas we read its
encoded representation from pebble. As there is no compression involved, these
will generally be comparable.

A common reason for elevated measurements on this metric is that a store is
falling behind on raft log application. The raft entry cache generally tracks
entries that were recently appended, so if log application falls behind the
cache will already have moved on to newer entries.
BytesCOUNTERBYTESAVGNON_NEGATIVE_DERIVATIVE
STORAGEraft.ticksNumber of Raft ticks queuedTicksCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE
STORAGEraft.timeoutcampaignNumber of Raft replicas campaigning after missed heartbeats from leaderElections called after timeoutCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE
diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go
index 17505f14c02f..e1dd41cf83a5 100644
--- a/pkg/kv/kvserver/metrics.go
+++ b/pkg/kv/kvserver/metrics.go
@@ -1418,6 +1418,12 @@ cache will already have moved on to newer entries.
 		Measurement: "Bytes",
 		Unit:        metric.Unit_BYTES,
 	}
+	metaRaftStorageError = metric.Metadata{
+		Name:        "raft.storage.error",
+		Help:        "Number of Raft storage errors",
+		Measurement: "Error Count",
+		Unit:        metric.Unit_COUNT,
+	}
 
 	// Raft message metrics.
 	metaRaftRcvdProp = metric.Metadata{
@@ -2503,6 +2509,7 @@ type StoreMetrics struct {
 	RaftSchedulerLatency metric.IHistogram
 	RaftTimeoutCampaign  *metric.Counter
 	RaftStorageReadBytes *metric.Counter
+	RaftStorageError     *metric.Counter
 
 	WALBytesWritten *metric.Gauge
 	WALBytesIn      *metric.Gauge
@@ -3205,6 +3212,7 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
 		}),
 		RaftTimeoutCampaign:  metric.NewCounter(metaRaftTimeoutCampaign),
 		RaftStorageReadBytes: metric.NewCounter(metaRaftStorageReadBytes),
+		RaftStorageError:     metric.NewCounter(metaRaftStorageError),
 
 		// Raft message metrics.
 		RaftRcvdMessages: [maxRaftMsgType + 1]*metric.Counter{
diff --git a/pkg/kv/kvserver/replica_raftstorage.go b/pkg/kv/kvserver/replica_raftstorage.go
index 824513e81758..32ba3f3194b7 100644
--- a/pkg/kv/kvserver/replica_raftstorage.go
+++ b/pkg/kv/kvserver/replica_raftstorage.go
@@ -83,6 +83,9 @@ func (r *replicaRaftStorage) InitialState() (raftpb.HardState, raftpb.ConfState,
 	hs, err := r.mu.stateLoader.LoadHardState(ctx, r.store.TODOEngine())
 	// For uninitialized ranges, membership is unknown at this point.
 	if raft.IsEmptyHardState(hs) || err != nil {
+		if err != nil {
+			r.reportRaftStorageError(err)
+		}
 		return raftpb.HardState{}, raftpb.ConfState{}, err
 	}
 	cs := r.mu.state.Desc.Replicas().ConfState()
@@ -97,7 +100,11 @@ func (r *replicaRaftStorage) InitialState() (raftpb.HardState, raftpb.ConfState,
 //
 // Entries can return log entries that are not yet stable in durable storage.
 func (r *replicaRaftStorage) Entries(lo, hi uint64, maxBytes uint64) ([]raftpb.Entry, error) {
-	return r.TypedEntries(kvpb.RaftIndex(lo), kvpb.RaftIndex(hi), maxBytes)
+	entries, err := r.TypedEntries(kvpb.RaftIndex(lo), kvpb.RaftIndex(hi), maxBytes)
+	if err != nil {
+		r.reportRaftStorageError(err)
+	}
+	return entries, err
 }
 
 func (r *replicaRaftStorage) TypedEntries(
@@ -128,6 +135,9 @@ const invalidLastTerm = 0
 // Term implements the raft.Storage interface.
 func (r *replicaRaftStorage) Term(i uint64) (uint64, error) {
 	term, err := r.TypedTerm(kvpb.RaftIndex(i))
+	if err != nil {
+		r.reportRaftStorageError(err)
+	}
 	return uint64(term), err
 }
 
@@ -166,6 +176,9 @@ func (r *Replica) raftLastIndexRLocked() kvpb.RaftIndex {
 // LastIndex requires that r.mu is held for reading.
 func (r *replicaRaftStorage) LastIndex() (uint64, error) {
 	index, err := r.TypedLastIndex()
+	if err != nil {
+		r.reportRaftStorageError(err)
+	}
 	return uint64(index), err
 }
 
@@ -190,6 +203,9 @@ func (r *Replica) raftFirstIndexRLocked() kvpb.RaftIndex {
 // FirstIndex requires that r.mu is held for reading.
 func (r *replicaRaftStorage) FirstIndex() (uint64, error) {
 	index, err := r.TypedFirstIndex()
+	if err != nil {
+		r.reportRaftStorageError(err)
+	}
 	return uint64(index), err
 }
 
@@ -237,6 +253,15 @@ func (r *replicaRaftStorage) Snapshot() (raftpb.Snapshot, error) {
 	}, nil
 }
 
+var raftStorageErrorLogger = log.Every(30 * time.Second)
+
+func (r *replicaRaftStorage) reportRaftStorageError(err error) {
+	if raftStorageErrorLogger.ShouldLog() {
+		log.Errorf(r.raftCtx, "error in raft.Storage %v", err)
+	}
+	r.store.metrics.RaftStorageError.Inc(1)
+}
+
 // raftSnapshotLocked requires that r.mu is held for writing.
 func (r *Replica) raftSnapshotLocked() (raftpb.Snapshot, error) {
 	return (*replicaRaftStorage)(r).Snapshot()
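
For illustration, a self-contained sketch of the reporting pattern this patch introduces: count every raft.Storage error, but log the error text at most once per interval so a persistently failing store cannot flood the logs. This is not CockroachDB code; the type "every" and the names reportError and raftStorageErrors are stand-ins for the log.Every helper, reportRaftStorageError, and the raft.storage.error counter.

package main

import (
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// every permits an action at most once per interval, a simplified analogue of
// the log.Every helper used in the patch.
type every struct {
	mu       sync.Mutex
	interval time.Duration
	last     time.Time
}

func (e *every) shouldLog() bool {
	e.mu.Lock()
	defer e.mu.Unlock()
	if now := time.Now(); now.Sub(e.last) >= e.interval {
		e.last = now
		return true
	}
	return false
}

var (
	// Illustrative stand-ins; the real code uses a package-level log.Every
	// and the store's RaftStorageError counter.
	errorEvery        = every{interval: 30 * time.Second}
	raftStorageErrors atomic.Int64
)

// reportError mirrors the shape of reportRaftStorageError: always increment
// the counter, log at most once per 30s.
func reportError(err error) {
	if errorEvery.shouldLog() {
		fmt.Printf("error in raft.Storage %v\n", err)
	}
	raftStorageErrors.Add(1)
}

func main() {
	for i := 0; i < 5; i++ {
		reportError(errors.New("hypothetical pebble read error"))
	}
	// All five errors are counted, but only the first one was logged.
	fmt.Println("raft.storage.error =", raftStorageErrors.Load())
}

Because the counter is incremented unconditionally, it stays accurate for alerting even while the log line is suppressed; only the log volume is throttled.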