Skip to content

Commit

Permalink
kv: Add stats for delegate snapshots
Browse files Browse the repository at this point in the history
Fixes: cockroachdb#98243
This PR adds two new stats for delegate snapshots to track failures when
sending snapshots. A failure can occur either before any data is transferred
or after the snapshot has been fully received.

Epic: none

Release note:
This commit adds two new stats which are useful for tracking the
efficiency of snapshot transfers. Some snapshots will always fail due to
system level "races", but the goal is to keep the number of failures as low as possible.
range.snapshots.recv-failed - The number of snapshot send attempts that
are initiated but not accepted by the recipient.
range.snapshots.recv-unusable - The number of snapshots that were fully
transmitted but not used.
  • Loading branch information
andrewbaptist committed Apr 19, 2023
1 parent 809b452 commit 8292a29
Show file tree
Hide file tree
Showing 8 changed files with 301 additions and 44 deletions.
16 changes: 16 additions & 0 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,18 @@ var (
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
// metaRangeSnapshotRecvFailed is the metadata for the
// range.snapshots.recv-failed metric, which counts range snapshot
// initialization messages that errored out on the recipient.
metaRangeSnapshotRecvFailed = metric.Metadata{
Name: "range.snapshots.recv-failed",
Help: "Number of range snapshot initialization messages that errored out on the recipient, typically before any data is transferred",
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}
// metaRangeSnapshotRecvUnusable is the metadata for the
// range.snapshots.recv-unusable metric, which counts range snapshots that
// were fully transmitted but then determined to be unnecessary or unusable.
metaRangeSnapshotRecvUnusable = metric.Metadata{
Name: "range.snapshots.recv-unusable",
Help: "Number of range snapshot that were fully transmitted but determined to be unnecessary or unusable",
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}
metaRangeSnapshotSendQueueLength = metric.Metadata{
Name: "range.snapshots.send-queue",
Help: "Number of snapshots queued to send",
Expand Down Expand Up @@ -2017,6 +2029,8 @@ type StoreMetrics struct {
RangeSnapshotRecoverySentBytes *metric.Counter
RangeSnapshotRebalancingRcvdBytes *metric.Counter
RangeSnapshotRebalancingSentBytes *metric.Counter
RangeSnapshotRecvFailed *metric.Counter
RangeSnapshotRecvUnusable *metric.Counter

// Range snapshot queue metrics.
RangeSnapshotSendQueueLength *metric.Gauge
Expand Down Expand Up @@ -2574,6 +2588,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RangeSnapshotRecoverySentBytes: metric.NewCounter(metaRangeSnapshotRecoverySentBytes),
RangeSnapshotRebalancingRcvdBytes: metric.NewCounter(metaRangeSnapshotRebalancingRcvdBytes),
RangeSnapshotRebalancingSentBytes: metric.NewCounter(metaRangeSnapshotRebalancingSentBytes),
RangeSnapshotRecvFailed: metric.NewCounter(metaRangeSnapshotRecvFailed),
RangeSnapshotRecvUnusable: metric.NewCounter(metaRangeSnapshotRecvUnusable),
RangeSnapshotSendQueueLength: metric.NewGauge(metaRangeSnapshotSendQueueLength),
RangeSnapshotRecvQueueLength: metric.NewGauge(metaRangeSnapshotRecvQueueLength),
RangeSnapshotSendInProgress: metric.NewGauge(metaRangeSnapshotSendInProgress),
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -2838,7 +2838,7 @@ func (r *Replica) sendSnapshotUsingDelegate(
ctx, 2, "delegating snapshot transmission attempt %v for %v to %v", n+1, recipient, sender,
)

selfDelegate := n == len(senders)-1
selfDelegate := sender.StoreID == r.StoreID()

// On the last attempt, always queue on the delegate to time out naturally.
if selfDelegate {
Expand Down
Loading

0 comments on commit 8292a29

Please sign in to comment.