Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
100762: kv: Add stats for delegate snapshots r=kvoli a=andrewbaptist

This PR adds two new stats for delegate snapshots to track failures when sending snapshots. A failure can occur either before any data is transferred or after the snapshot has been fully received; both stats are useful.

Epic: none

Release note: None

Release justification: Adds stats and testing only.

101692: rpc: Handle multiple tests on same node r=erikgrinaker a=andrewbaptist

Fixes: cockroachdb#101627
Network tests can interfere with each other. There isn't a good way to completely isolate the tests, so this commit adds an additional check to ignore invalid messages.

Epic: none

Release note: None

Release justification: Test only change.

Co-authored-by: Andrew Baptist <[email protected]>
  • Loading branch information
craig[bot] and andrewbaptist committed Apr 18, 2023
3 parents 1de7ba8 + d823361 + 0a86ee5 commit b29901b
Show file tree
Hide file tree
Showing 9 changed files with 304 additions and 46 deletions.
16 changes: 16 additions & 0 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,18 @@ var (
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
// metaRangeSnapshotRecvFailed is the metadata for the
// "range.snapshots.recv-failed" metric: a count of range snapshot
// initialization messages that errored out on the recipient.
metaRangeSnapshotRecvFailed = metric.Metadata{
Name: "range.snapshots.recv-failed",
Help: "Number of range snapshot initialization messages that errored out on the recipient, typically before any data is transferred",
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}
// metaRangeSnapshotRecvUnusable is the metadata for the
// "range.snapshots.recv-unusable" metric: a count of range snapshots that
// were fully transmitted but then found to be unnecessary or unusable.
metaRangeSnapshotRecvUnusable = metric.Metadata{
Name: "range.snapshots.recv-unusable",
Help: "Number of range snapshots that were fully transmitted but determined to be unnecessary or unusable",
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}
metaRangeSnapshotSendQueueLength = metric.Metadata{
Name: "range.snapshots.send-queue",
Help: "Number of snapshots queued to send",
Expand Down Expand Up @@ -2078,6 +2090,8 @@ type StoreMetrics struct {
RangeSnapshotRecoverySentBytes *metric.Counter
RangeSnapshotRebalancingRcvdBytes *metric.Counter
RangeSnapshotRebalancingSentBytes *metric.Counter
RangeSnapshotRecvFailed *metric.Counter
RangeSnapshotRecvUnusable *metric.Counter

// Range snapshot queue metrics.
RangeSnapshotSendQueueLength *metric.Gauge
Expand Down Expand Up @@ -2682,6 +2696,8 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RangeSnapshotRecoverySentBytes: metric.NewCounter(metaRangeSnapshotRecoverySentBytes),
RangeSnapshotRebalancingRcvdBytes: metric.NewCounter(metaRangeSnapshotRebalancingRcvdBytes),
RangeSnapshotRebalancingSentBytes: metric.NewCounter(metaRangeSnapshotRebalancingSentBytes),
RangeSnapshotRecvFailed: metric.NewCounter(metaRangeSnapshotRecvFailed),
RangeSnapshotRecvUnusable: metric.NewCounter(metaRangeSnapshotRecvUnusable),
RangeSnapshotSendQueueLength: metric.NewGauge(metaRangeSnapshotSendQueueLength),
RangeSnapshotRecvQueueLength: metric.NewGauge(metaRangeSnapshotRecvQueueLength),
RangeSnapshotSendInProgress: metric.NewGauge(metaRangeSnapshotSendInProgress),
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -2839,7 +2839,7 @@ func (r *Replica) sendSnapshotUsingDelegate(
ctx, 2, "delegating snapshot transmission attempt %v for %v to %v", n+1, recipient, sender,
)

selfDelegate := n == len(senders)-1
selfDelegate := sender.StoreID == r.StoreID()

// On the last attempt, always queue on the delegate to time out naturally.
if selfDelegate {
Expand Down
Loading

0 comments on commit b29901b

Please sign in to comment.