Skip to content

Commit

Permalink
kv: Add stats for delegated snapshots
Browse files Browse the repository at this point in the history
Adds three new stats for delegated snapshots. These are not currently
exposed through the admin UI, but will be useful for determining the
effectiveness of using delegated snapshots in a multi-region setup.

The stats are:
range.snapshots.delegate.sent-bytes - Number of bytes sent by a delegate.
range.snapshot.delegate.successes - Successful delegation requests.
range.snapshot.delegate.failures - Failed delegation requests.

Note that the delegate success and failure stats intentionally do not
include self-delegated snapshots since those are already accounted for
by the standard snapshot stats. The cost savings from using delegated
snapshots can be directly determined by multiplying the sent-bytes
metric by the per-byte cost of sending data across regions.

Release note (ops change): Adding additional stats described above.
Epic: none
  • Loading branch information
andrewbaptist committed Feb 21, 2023
1 parent 95fcd95 commit 4a43cf6
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 9 deletions.
54 changes: 46 additions & 8 deletions pkg/kv/kvserver/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -679,49 +679,49 @@ var (
Name: "range.snapshots.rcvd-bytes",
Help: "Number of snapshot bytes received",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotSentBytes = metric.Metadata{
Name: "range.snapshots.sent-bytes",
Help: "Number of snapshot bytes sent",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotUnknownRcvdBytes = metric.Metadata{
Name: "range.snapshots.unknown.rcvd-bytes",
Help: "Number of unknown snapshot bytes received",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotUnknownSentBytes = metric.Metadata{
Name: "range.snapshots.unknown.sent-bytes",
Help: "Number of unknown snapshot bytes sent",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotRebalancingRcvdBytes = metric.Metadata{
Name: "range.snapshots.rebalancing.rcvd-bytes",
Help: "Number of rebalancing snapshot bytes received",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotRebalancingSentBytes = metric.Metadata{
Name: "range.snapshots.rebalancing.sent-bytes",
Help: "Number of rebalancing snapshot bytes sent",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotRecoveryRcvdBytes = metric.Metadata{
Name: "range.snapshots.recovery.rcvd-bytes",
Help: "Number of recovery snapshot bytes received",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotRecoverySentBytes = metric.Metadata{
Name: "range.snapshots.recovery.sent-bytes",
Help: "Number of recovery snapshot bytes sent",
Measurement: "Bytes",
Unit: metric.Unit_COUNT,
Unit: metric.Unit_BYTES,
}
metaRangeSnapshotSendQueueLength = metric.Metadata{
Name: "range.snapshots.send-queue",
Expand Down Expand Up @@ -759,6 +759,7 @@ var (
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}

metaRangeRaftLeaderTransfers = metric.Metadata{
Name: "range.raftleadertransfers",
Help: "Number of raft leader transfers",
Expand All @@ -775,6 +776,35 @@ is located starts following the recovery.`,
Measurement: "Quorum Recoveries",
Unit: metric.Unit_COUNT,
}
// metaDelegateSnapshotSendBytes describes the byte counter for snapshot data
// sent by this store on behalf of a coordinator located on a different node.
// The counter is only incremented when the sending node's ID differs from the
// coordinator replica's node ID, so self-delegated snapshots are excluded.
metaDelegateSnapshotSendBytes = metric.Metadata{
Name: "range.snapshots.delegate.sent-bytes",
Help: `Bytes sent using a delegate.
The number of bytes sent as a result of a delegate snapshot request
that was originated from a different node. This metric is useful in
evaluating the network savings of not sending cross region traffic.
`,
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
// metaDelegateSnapshotSuccesses describes the counter of delegated snapshot
// requests that completed without error. It is incremented by the coordinator
// in sendSnapshotUsingDelegate for every successful attempt except the final
// one in the sender list, which is treated as the self-delegated attempt.
//
// NOTE(review): this name uses the "range.snapshot." prefix while the
// delegate sent-bytes metric uses "range.snapshots." — confirm whether the
// difference is intentional before relying on prefix-based queries.
metaDelegateSnapshotSuccesses = metric.Metadata{
Name: "range.snapshot.delegate.successes",
Help: `Number of snapshots that were delegated to a different node and
resulted in success on that delegate. This does not count self delegated snapshots.
`,
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}
// metaDelegateSnapshotFailures describes the counter of delegated snapshot
// requests that returned an error. It is incremented by the coordinator in
// sendSnapshotUsingDelegate once per failed attempt, except for the final
// attempt in the sender list, which is treated as the self-delegated attempt.
//
// NOTE(review): this name uses the "range.snapshot." prefix while the
// delegate sent-bytes metric uses "range.snapshots." — confirm whether the
// difference is intentional before relying on prefix-based queries.
metaDelegateSnapshotFailures = metric.Metadata{
Name: "range.snapshot.delegate.failures",
Help: `Number of snapshots that were delegated to a different node and
resulted in failure on that delegate. There are numerous reasons a failure can
occur on a delegate such as timeout, the delegate Raft log being too far behind
or the delegate being too busy to send.
`,
Measurement: "Snapshots",
Unit: metric.Unit_COUNT,
}

// Quota pool metrics.
metaRaftQuotaPoolPercentUsed = metric.Metadata{
Expand Down Expand Up @@ -1848,6 +1878,11 @@ type StoreMetrics struct {
RangeSnapshotSendTotalInProgress *metric.Gauge
RangeSnapshotRecvTotalInProgress *metric.Gauge

// Delegate snapshot metrics. These don't count self-delegated snapshots.
DelegateSnapshotSendBytes *metric.Counter
DelegateSnapshotSuccesses *metric.Counter
DelegateSnapshotFailures *metric.Counter

// Raft processing metrics.
RaftTicks *metric.Counter
RaftQuotaPoolPercentUsed metric.IHistogram
Expand Down Expand Up @@ -2374,6 +2409,9 @@ func newStoreMetrics(histogramWindow time.Duration) *StoreMetrics {
RangeSnapshotRecvTotalInProgress: metric.NewGauge(metaRangeSnapshotRecvTotalInProgress),
RangeRaftLeaderTransfers: metric.NewCounter(metaRangeRaftLeaderTransfers),
RangeLossOfQuorumRecoveries: metric.NewCounter(metaRangeLossOfQuorumRecoveries),
DelegateSnapshotSendBytes: metric.NewCounter(metaDelegateSnapshotSendBytes),
DelegateSnapshotSuccesses: metric.NewCounter(metaDelegateSnapshotSuccesses),
DelegateSnapshotFailures: metric.NewCounter(metaDelegateSnapshotFailures),

// Raft processing metrics.
RaftTicks: metric.NewCounter(metaRaftTicks),
Expand Down
15 changes: 14 additions & 1 deletion pkg/kv/kvserver/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -2758,6 +2758,7 @@ func (r *Replica) sendSnapshotUsingDelegate(
senderQueueName kvserverpb.SnapshotRequest_QueueName,
senderQueuePriority float64,
) (retErr error) {

defer func() {
// Report the snapshot status to Raft, which expects us to do this once we
// finish sending the snapshot.
Expand Down Expand Up @@ -2847,8 +2848,10 @@ func (r *Replica) sendSnapshotUsingDelegate(
ctx, 2, "delegating snapshot transmission attempt %v for %v to %v", n+1, recipient, sender,
)

selfDelegate := n == len(senders)-1

// On the last attempt, always queue on the delegate to time out naturally.
if n == len(senders)-1 {
if selfDelegate {
delegateRequest.QueueOnDelegateLen = -1
}

Expand All @@ -2860,8 +2863,14 @@ func (r *Replica) sendSnapshotUsingDelegate(
)
// Return once we have success.
if retErr == nil {
if !selfDelegate {
r.store.Metrics().DelegateSnapshotSuccesses.Inc(1)
}
return
} else {
if !selfDelegate {
r.store.Metrics().DelegateSnapshotFailures.Inc(1)
}
log.Warningf(ctx, "attempt %d: delegate snapshot %+v request failed %v", n+1, delegateRequest, retErr)
}
}
Expand Down Expand Up @@ -3119,6 +3128,10 @@ func (r *Replica) followerSendSnapshot(
}

recordBytesSent := func(inc int64) {
// Only counts for delegated bytes if we are not self-delegating.
if r.NodeID() != req.CoordinatorReplica.NodeID {
r.store.metrics.DelegateSnapshotSendBytes.Inc(inc)
}
r.store.metrics.RangeSnapshotSentBytes.Inc(inc)

switch header.Priority {
Expand Down
3 changes: 3 additions & 0 deletions pkg/ts/catalog/chart_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,8 @@ var charts = []sectionDescription{
"range.snapshots.applied-voter",
"range.snapshots.applied-initial",
"range.snapshots.applied-non-voter",
"range.snapshot.delegate.successes",
"range.snapshot.delegate.failures",
},
},
{
Expand All @@ -640,6 +642,7 @@ var charts = []sectionDescription{
"range.snapshots.rebalancing.sent-bytes",
"range.snapshots.unknown.rcvd-bytes",
"range.snapshots.unknown.sent-bytes",
"range.snapshots.delegate.sent-bytes",
},
},
},
Expand Down

0 comments on commit 4a43cf6

Please sign in to comment.