From b1b17b235ad3a03778c54f834d34402cf9fdd6c9 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Tue, 23 Aug 2022 11:35:41 +0200 Subject: [PATCH] kvserver: log when raft send/recv queue fills up Inspired by https://github.com/cockroachlabs/support/issues/1770. If either the raft send or receive queue fills up, wide-spread outages can occur as replication progress stalls. We have metrics that can indicate this, but straightforward logging is also appropriate to direct attention to the fact, which this commit achieves. Touches https://github.com/cockroachdb/cockroach/issues/79755 Release justification: important logging improvement Release note: None --- pkg/kv/kvserver/raft_transport.go | 3 +++ pkg/kv/kvserver/store_raft.go | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/pkg/kv/kvserver/raft_transport.go b/pkg/kv/kvserver/raft_transport.go index fb95baed3edb..1221cfd981ac 100644 --- a/pkg/kv/kvserver/raft_transport.go +++ b/pkg/kv/kvserver/raft_transport.go @@ -592,6 +592,9 @@ func (t *RaftTransport) SendAsync(req *RaftMessageRequest, class rpc.ConnectionC } return true default: + if logRaftSendQueueFullEvery.ShouldLog() { + log.Warningf(t.AnnotateCtx(context.Background()), "raft send queue to n%d is full", toNodeID) + } req.release() return false } diff --git a/pkg/kv/kvserver/store_raft.go b/pkg/kv/kvserver/store_raft.go index 494c66019a18..87d20d2907df 100644 --- a/pkg/kv/kvserver/store_raft.go +++ b/pkg/kv/kvserver/store_raft.go @@ -27,6 +27,11 @@ import ( "go.etcd.io/etcd/raft/v3/raftpb" ) +var ( + logRaftRecvQueueFullEvery = log.Every(1 * time.Second) + logRaftSendQueueFullEvery = log.Every(1 * time.Second) +) + type raftRequestInfo struct { req *RaftMessageRequest respStream RaftMessageResponseStream @@ -177,6 +182,9 @@ func (s *Store) HandleRaftUncoalescedRequest( // TODO(peter): Return an error indicating the request was dropped. Note // that dropping the request is safe. Raft will retry. s.metrics.RaftRcvdMsgDropped.Inc(1) + if logRaftRecvQueueFullEvery.ShouldLog() { + log.Warningf(ctx, "raft receive queue for r%d is full", req.RangeID) + } return false } q.infos = append(q.infos, raftRequestInfo{