Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
96350: kv: Add stats for delegated snapshots r=AlexTalks a=andrewbaptist

Adds three new stats for delegated snapshots. These are not currently
exposed through the admin UI, but will be useful for determining the
effectiveness of using delegated snapshots in a multi-region setup.

The stats are:
range.snapshots.delegate.sent-bytes - Number of bytes sent by a delegate.
range.snapshot.delegate.successes - Successful delegation requests.
range.snapshot.delegate.failures - Failed delegation requests.

Note that the delegate success and failure stats intentionally do not
include self-delegated snapshots since those are already accounted for
by the standard snapshot stats. The cost savings from using delegated
snapshots can be directly determined by multiplying the sent-bytes
metric by the cost to send data across regions.

Release note (ops change): Adding additional stats described above.
Epic: none

Co-authored-by: Andrew Baptist <[email protected]>
  • Loading branch information
craig[bot] and andrewbaptist committed Feb 23, 2023
2 parents 69174d4 + 4a43cf6 commit f33f404
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 9 deletions.
54 changes: 46 additions & 8 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -698,49 +698,49 @@ var (
Name: "range.snapshots.rcvd-bytes",
Help: "Number of snapshot bytes received",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotSentBytes = metric.Metadata{
Name: "range.snapshots.sent-bytes",
Help: "Number of snapshot bytes sent",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotUnknownRcvdBytes = metric.Metadata{
Name: "range.snapshots.unknown.rcvd-bytes",
Help: "Number of unknown snapshot bytes received",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotUnknownSentBytes = metric.Metadata{
Name: "range.snapshots.unknown.sent-bytes",
Help: "Number of unknown snapshot bytes sent",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotRebalancingRcvdBytes = metric.Metadata{
Name: "range.snapshots.rebalancing.rcvd-bytes",
Help: "Number of rebalancing snapshot bytes received",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotRebalancingSentBytes = metric.Metadata{
Name: "range.snapshots.rebalancing.sent-bytes",
Help: "Number of rebalancing snapshot bytes sent",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotRecoveryRcvdBytes = metric.Metadata{
Name: "range.snapshots.recovery.rcvd-bytes",
Help: "Number of recovery snapshot bytes received",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotRecoverySentBytes = metric.Metadata{
Name: "range.snapshots.recovery.sent-bytes",
Help: "Number of recovery snapshot bytes sent",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotSendQueueLength = metric.Metadata{
Name: "range.snapshots.send-queue",
Expand Down Expand Up @@ -778,6 +778,7 @@ var (
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}

metaRangeRaftLeaderTransfers = metric.Metadata{
Name: "range.raftleadertransfers",
Help: "Number of raft leader transfers",
Expand All @@ -794,6 +795,35 @@ is located starts following the recovery.`,
Measurement: "Quorum Recoveries",
Unit: metric.Unit_COUNT,
}
// metaDelegateSnapshotSendBytes is the metadata for the counter of snapshot
// bytes sent by a delegate on behalf of a coordinator on a different node.
// NOTE(review): this name uses the "range.snapshots." prefix while the
// delegate success/failure metrics use "range.snapshot." — confirm whether
// the difference is intentional.
metaDelegateSnapshotSendBytes = metric.Metadata{
Name: "range.snapshots.delegate.sent-bytes",
Help: `Bytes sent using a delegate.
The number of bytes sent as a result of a delegate snapshot request
that was originated from a different node. This metric is useful in
evaluating the network savings of not sending cross region traffic.
`,
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
// metaDelegateSnapshotSuccesses is the metadata for the counter of snapshot
// delegation attempts that succeeded on a delegate other than the sender
// itself; self-delegated snapshots are excluded.
metaDelegateSnapshotSuccesses = metric.Metadata{
Name: "range.snapshot.delegate.successes",
Help: `Number of snapshots that were delegated to a different node and
resulted in success on that delegate. This does not count self delegated snapshots.
`,
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}
// metaDelegateSnapshotFailures is the metadata for the counter of snapshot
// delegation attempts that failed on a delegate other than the sender itself.
// Like the success counter, it is not incremented for self-delegated
// snapshots.
metaDelegateSnapshotFailures = metric.Metadata{
Name: "range.snapshot.delegate.failures",
Help: `Number of snapshots that were delegated to a different node and
resulted in failure on that delegate. There are numerous reasons a failure can
occur on a delegate such as timeout, the delegate Raft log being too far behind
or the delegate being too busy to send.
`,
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}

// Quota pool metrics.
metaRaftQuotaPoolPercentUsed = metric.Metadata{
Expand Down Expand Up @@ -1870,6 +1900,11 @@ type StoreMetrics struct {
RangeSnapshotSendTotalInProgress *metric.Gauge
RangeSnapshotRecvTotalInProgress *metric.Gauge

// Delegate snapshot metrics. These don't count self-delegated snapshots.
DelegateSnapshotSendBytes *metric.Counter
DelegateSnapshotSuccesses *metric.Counter
DelegateSnapshotFailures *metric.Counter

// Raft processing metrics.
RaftTicks *metric.Counter
RaftQuotaPoolPercentUsed metric.IHistogram
Expand Down Expand Up @@ -2399,6 +2434,9 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RangeSnapshotRecvTotalInProgress: metric.NewGauge(metaRangeSnapshotRecvTotalInProgress),
RangeRaftLeaderTransfers: metric.NewCounter(metaRangeRaftLeaderTransfers),
RangeLossOfQuorumRecoveries: metric.NewCounter(metaRangeLossOfQuorumRecoveries),
DelegateSnapshotSendBytes: metric.NewCounter(metaDelegateSnapshotSendBytes),
DelegateSnapshotSuccesses: metric.NewCounter(metaDelegateSnapshotSuccesses),
DelegateSnapshotFailures: metric.NewCounter(metaDelegateSnapshotFailures),

// Raft processing metrics.
RaftTicks: metric.NewCounter(metaRaftTicks),
Expand Down
15 changes: 14 additions & 1 deletion pkg/kv/kvserver/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -2748,6 +2748,7 @@ func (r *Replica) sendSnapshotUsingDelegate(
senderQueueName kvserverpb.SnapshotRequest_QueueName,
senderQueuePriority float64,
) (retErr error) {

defer func() {
// Report the snapshot status to Raft, which expects us to do this once we
// finish sending the snapshot.
Expand Down Expand Up @@ -2837,8 +2838,10 @@ func (r *Replica) sendSnapshotUsingDelegate(
ctx, 2, "delegating snapshot transmission attempt %v for %v to %v", n+1, recipient, sender,
)

selfDelegate := n == len(senders)-1

// On the last attempt, always queue on the delegate to time out naturally.
if n == len(senders)-1 {
if selfDelegate {
delegateRequest.QueueOnDelegateLen = -1
}

Expand All @@ -2850,8 +2853,14 @@ func (r *Replica) sendSnapshotUsingDelegate(
)
// Return once we have success.
if retErr == nil {
if !selfDelegate {
r.store.Metrics().DelegateSnapshotSuccesses.Inc(1)
}
return
} else {
if !selfDelegate {
r.store.Metrics().DelegateSnapshotFailures.Inc(1)
}
log.Warningf(ctx, "attempt %d: delegate snapshot %+v request failed %v", n+1, delegateRequest, retErr)
}
}
Expand Down Expand Up @@ -3110,6 +3119,10 @@ func (r *Replica) followerSendSnapshot(
}

recordBytesSent := func(inc int64) {
// Only counts for delegated bytes if we are not self-delegating.
if r.NodeID() != req.CoordinatorReplica.NodeID {
r.store.metrics.DelegateSnapshotSendBytes.Inc(inc)
}
r.store.metrics.RangeSnapshotSentBytes.Inc(inc)

switch header.Priority {
Expand Down
3 changes: 3 additions & 0 deletions pkg/ts/catalog/chart_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,8 @@ var charts = []sectionDescription{
"range.snapshots.applied-voter",
"range.snapshots.applied-initial",
"range.snapshots.applied-non-voter",
"range.snapshot.delegate.successes",
"range.snapshot.delegate.failures",
},
},
{
Expand All @@ -640,6 +642,7 @@ var charts = []sectionDescription{
"range.snapshots.rebalancing.sent-bytes",
"range.snapshots.unknown.rcvd-bytes",
"range.snapshots.unknown.sent-bytes",
"range.snapshots.delegate.sent-bytes",
},
},
},
Expand Down

0 comments on commit f33f404

Please sign in to comment.