Skip to content

Commit

Permalink
kvclient: fix rangefeed retry reason counters
Browse files Browse the repository at this point in the history
Previously, retriable errors would often be recorded in the wrong
retry counter because the code that initialised the metrics assumed a
stable ordering when iterating a map.

Given the amount of work that is about to happen when restarting a
rangefeed, I doubt the optimization of a slice lookup vs a map lookup
is that important here.

Further, we almost surely do not want to panic just because we
couldn't find a metric for the reason a node might have sent to us, so
now rather than panic'ing we have a fallback counter for unknown error
types.

Epic: none

Release note: Fix bug that could result in incorrect metrics related
to retriable rangefeed errors.
  • Loading branch information
stevendanna committed Oct 31, 2024
1 parent 56ff146 commit d772ac5
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 24 deletions.
1 change: 1 addition & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1012,6 +1012,7 @@
<tr><td>APPLICATION</td><td>distsender.rangefeed.retry.send</td><td>Number of ranges that encountered retryable send error</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rangefeed.retry.slow_consumer</td><td>Number of ranges that encountered retryable SLOW_CONSUMER error</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rangefeed.retry.store_not_found</td><td>Number of ranges that encountered retryable store not found error</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rangefeed.retry.unknown</td><td>Number of ranges that encountered retryable unknown error</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rangefeed.total_ranges</td><td>Number of ranges executing rangefeed<br/><br/>This counts the number of ranges with an active rangefeed.<br/></td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rangelookups</td><td>Number of range lookups</td><td>Range Lookups</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.addsstable.sent</td><td>Number of AddSSTable requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
Expand Down
35 changes: 11 additions & 24 deletions pkg/kv/kvclient/kvcoord/dist_sender.go
Original file line number Diff line number Diff line change
Expand Up @@ -511,24 +511,25 @@ func makeDistSenderCircuitBreakerMetrics() DistSenderCircuitBreakerMetrics {
type rangeFeedErrorCounters struct {
RangefeedRestartRanges *metric.Counter
RangefeedErrorCatchup *metric.Counter
RetryErrors []*metric.Counter
RetryErrors map[int32]*metric.Counter
SendErrors *metric.Counter
StoreNotFound *metric.Counter
NodeNotFound *metric.Counter
RangeNotFound *metric.Counter
RangeKeyMismatch *metric.Counter
RetryUnknown *metric.Counter
}

func makeRangeFeedErrorCounters() rangeFeedErrorCounters {
var retryCounters []*metric.Counter
for name := range kvpb.RangeFeedRetryError_Reason_value {
retryCounters := make(map[int32]*metric.Counter, len(kvpb.RangeFeedRetryError_Reason_value))
for name, idx := range kvpb.RangeFeedRetryError_Reason_value {
name = strings.TrimPrefix(name, "REASON_")
retryCounters = append(retryCounters, metric.NewCounter(metric.Metadata{
retryCounters[idx] = metric.NewCounter(metric.Metadata{
Name: fmt.Sprintf("distsender.rangefeed.retry.%s", strings.ToLower(name)),
Help: fmt.Sprintf(`Number of ranges that encountered retryable %s error`, name),
Measurement: "Ranges",
Unit: metric.Unit_COUNT,
}))
})
}

retryMeta := func(name string) metric.Metadata {
Expand All @@ -549,6 +550,7 @@ func makeRangeFeedErrorCounters() rangeFeedErrorCounters {
NodeNotFound: metric.NewCounter(retryMeta("node not found")),
RangeNotFound: metric.NewCounter(retryMeta("range not found")),
RangeKeyMismatch: metric.NewCounter(retryMeta("range key mismatch")),
RetryUnknown: metric.NewCounter(retryMeta("unknown")),
}
}

Expand All @@ -558,25 +560,10 @@ func makeRangeFeedErrorCounters() rangeFeedErrorCounters {
func (c rangeFeedErrorCounters) GetRangeFeedRetryCounter(
reason kvpb.RangeFeedRetryError_Reason,
) *metric.Counter {
// Normally, retry reason values are contiguous. One way gaps could be
// introduced, is if some retry reasons are retired (deletions are
// accomplished by reserving enum value to prevent its re-use), and then more
// reason added after. Then, we can't use reason value as an index into
// retryCounters. Because this scenario is believed to be very unlikely, we
// forego any fancy re-mapping schemes, and instead opt for explicit handling.
switch reason {
case kvpb.RangeFeedRetryError_REASON_REPLICA_REMOVED,
kvpb.RangeFeedRetryError_REASON_RANGE_SPLIT,
kvpb.RangeFeedRetryError_REASON_MANUAL_RANGE_SPLIT,
kvpb.RangeFeedRetryError_REASON_RANGE_MERGED,
kvpb.RangeFeedRetryError_REASON_RAFT_SNAPSHOT,
kvpb.RangeFeedRetryError_REASON_LOGICAL_OPS_MISSING,
kvpb.RangeFeedRetryError_REASON_SLOW_CONSUMER,
kvpb.RangeFeedRetryError_REASON_NO_LEASEHOLDER,
kvpb.RangeFeedRetryError_REASON_RANGEFEED_CLOSED:
return c.RetryErrors[reason]
default:
panic(errors.AssertionFailedf("unknown retry reason %d", reason))
if metric, ok := c.RetryErrors[int32(reason)]; ok {
return metric
} else {
return c.RetryUnknown
}
}

Expand Down

0 comments on commit d772ac5

Please sign in to comment.