From c1cd955e8b3877574befdd49084734cf4e955a3c Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Tue, 23 Aug 2022 11:35:41 +0200
Subject: [PATCH] kvserver: log when raft send/recv queue fills up

Inspired by https://github.com/cockroachlabs/support/issues/1770.

If either the raft send or receive queue fills up, widespread outages
can occur as replication progress stalls. We have metrics that can
indicate this, but straightforward logging is also appropriate to direct
attention to the fact, which this commit achieves.

Touches https://github.com/cockroachdb/cockroach/issues/79755

Release justification: important logging improvement
Release note: None
---
 pkg/kv/kvserver/raft_transport.go | 3 +++
 pkg/kv/kvserver/store_raft.go     | 8 ++++++++
 2 files changed, 11 insertions(+)

diff --git a/pkg/kv/kvserver/raft_transport.go b/pkg/kv/kvserver/raft_transport.go
index 566a834715e0..b3bd100ea3bd 100644
--- a/pkg/kv/kvserver/raft_transport.go
+++ b/pkg/kv/kvserver/raft_transport.go
@@ -505,6 +505,9 @@ func (t *RaftTransport) SendAsync(
 	case ch <- req:
 		return true
 	default:
+		if logRaftSendQueueFullEvery.ShouldLog() {
+			log.Warningf(t.AnnotateCtx(context.Background()), "raft send queue to n%d is full", toNodeID)
+		}
 		releaseRaftMessageRequest(req)
 		return false
 	}
diff --git a/pkg/kv/kvserver/store_raft.go b/pkg/kv/kvserver/store_raft.go
index 92164cd63f8e..eefd61f34f95 100644
--- a/pkg/kv/kvserver/store_raft.go
+++ b/pkg/kv/kvserver/store_raft.go
@@ -32,6 +32,11 @@ import (
 	"go.etcd.io/etcd/raft/v3/raftpb"
 )
 
+var (
+	logRaftRecvQueueFullEvery = log.Every(1 * time.Second)
+	logRaftSendQueueFullEvery = log.Every(1 * time.Second)
+)
+
 type raftRequestInfo struct {
 	req  *kvserverpb.RaftMessageRequest
 	size int64 // size of req in bytes
@@ -305,6 +310,9 @@ func (s *Store) HandleRaftUncoalescedRequest(
 		// that dropping the request is safe. Raft will retry.
 		s.metrics.RaftRcvdDropped.Inc(1)
 		s.metrics.RaftRcvdDroppedBytes.Inc(size)
+		if logRaftRecvQueueFullEvery.ShouldLog() {
+			log.Warningf(ctx, "raft receive queue for r%d is full", req.RangeID)
+		}
 		return false
 	}
 	return enqueue