Skip to content

Commit

Permalink
kvserver: record paused replica message drops to a metric
Browse files Browse the repository at this point in the history
Release note: None
  • Loading branch information
pav-kv committed Aug 16, 2022
1 parent 8f1b11e commit 0bd35fb
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 7 deletions.
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/client_replica_raft_overload_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ func TestReplicaRaftOverload(t *testing.T) {
if n := s1.Metrics().RaftPausedFollowerCount.Value(); n == 0 {
return errors.New("no paused followers")
}
if n := s1.Metrics().RaftPausedFollowerDroppedMsgs.Count(); n == 0 {
return errors.New("no dropped messages to paused followers")
}
return nil
})

Expand Down
17 changes: 13 additions & 4 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -1004,11 +1004,18 @@ Such Replicas will be ignored for the purposes of proposal quota, and will not
receive replication traffic. They are essentially treated as offline for the
purpose of replication. This serves as a crude form of admission control.
The count is emitted by the leaseholder of each range.
.`,
The count is emitted by the leaseholder of each range.`,
Measurement: "Followers",
Unit: metric.Unit_COUNT,
}
// metaRaftPausedFollowerDroppedMsgs describes the counter of messages that
// were dropped instead of being sent to paused replicas. It is exported under
// the name "admission.raft.paused_replicas_dropped_msgs" and is measured in
// messages (a plain count).
metaRaftPausedFollowerDroppedMsgs = metric.Metadata{
Name: "admission.raft.paused_replicas_dropped_msgs",
Help: `Number of messages dropped instead of being sent to paused replicas.
The messages are dropped to help these replicas to recover from I/O overload.`,
Measurement: "Messages",
Unit: metric.Unit_COUNT,
}

// Replica queue metrics.
metaMVCCGCQueueSuccesses = metric.Metadata{
Expand Down Expand Up @@ -1754,7 +1761,8 @@ type StoreMetrics struct {
RaftLogFollowerBehindCount *metric.Gauge
RaftLogTruncated *metric.Counter

RaftPausedFollowerCount *metric.Gauge
RaftPausedFollowerCount *metric.Gauge
RaftPausedFollowerDroppedMsgs *metric.Counter

RaftCoalescedHeartbeatsPending *metric.Gauge

Expand Down Expand Up @@ -2266,7 +2274,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RaftLogFollowerBehindCount: metric.NewGauge(metaRaftLogFollowerBehindCount),
RaftLogTruncated: metric.NewCounter(metaRaftLogTruncated),

RaftPausedFollowerCount: metric.NewGauge(metaRaftFollowerPaused),
RaftPausedFollowerCount: metric.NewGauge(metaRaftFollowerPaused),
RaftPausedFollowerDroppedMsgs: metric.NewCounter(metaRaftPausedFollowerDroppedMsgs),

// This Gauge measures the number of heartbeats queued up just before
// the queue is cleared, to avoid flapping wildly.
Expand Down
6 changes: 3 additions & 3 deletions pkg/kv/kvserver/replica_raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -1468,6 +1468,9 @@ func (r *Replica) sendRaftMessagesRaftMuLocked(
var lastAppResp raftpb.Message
for _, message := range messages {
_, drop := blocked[roachpb.ReplicaID(message.To)]
if drop {
r.store.Metrics().RaftPausedFollowerDroppedMsgs.Inc(1)
}
switch message.Type {
case raftpb.MsgApp:
if util.RaceEnabled {
Expand Down Expand Up @@ -1531,9 +1534,6 @@ func (r *Replica) sendRaftMessagesRaftMuLocked(
}
}

// TODO(tbg): record this to metrics.
//
// See: https://github.com/cockroachdb/cockroach/issues/83917
if !drop {
r.sendRaftMessageRaftMuLocked(ctx, message)
}
Expand Down
6 changes: 6 additions & 0 deletions pkg/ts/catalog/chart_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -572,6 +572,12 @@ var charts = []sectionDescription{
"admission.raft.paused_replicas",
},
},
{
Title: "Paused Followers Dropped Messages",
Metrics: []string{
"admission.raft.paused_replicas_dropped_msgs",
},
},
{
Title: "Operations",
Metrics: []string{
Expand Down

0 comments on commit 0bd35fb

Please sign in to comment.