Skip to content

Commit

Permalink
Merge #133991
Browse files Browse the repository at this point in the history
133991: kvclient: fix rangefeed retry reason counters r=stevendanna a=stevendanna

Previously, retriable errors would often be recorded in the wrong
retry counter because the code that initialised the metrics assumed a
stable ordering when iterating a map.

Given the amount of work that is about to happen when restarting a
rangefeed, I doubt the optimization of a slice lookup vs a map lookup
is that important here.

Further, we almost surely do not want to panic just because we
couldn't find a metric for the reason a node might have sent to us, so
now rather than panic'ing we have a fallback counter for unknown error
types.

Epic: none

Release note: Fix bug that could result in incorrect metrics related
to retriable rangefeed errors.

Co-authored-by: Steven Danna <[email protected]>
  • Loading branch information
craig[bot] and stevendanna committed Nov 1, 2024
2 parents c7ce9ca + d772ac5 commit a39ae7d
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 31 deletions.
1 change: 1 addition & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1012,6 +1012,7 @@
<tr><td>APPLICATION</td><td>distsender.rangefeed.retry.send</td><td>Number of ranges that encountered retryable send error</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rangefeed.retry.slow_consumer</td><td>Number of ranges that encountered retryable SLOW_CONSUMER error</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rangefeed.retry.store_not_found</td><td>Number of ranges that encountered retryable store not found error</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rangefeed.retry.unknown</td><td>Number of ranges that encountered retryable unknown error</td><td>Ranges</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rangefeed.total_ranges</td><td>Number of ranges executing rangefeed<br/><br/>This counts the number of ranges with an active rangefeed.<br/></td><td>Ranges</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rangelookups</td><td>Number of range lookups</td><td>Range Lookups</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.addsstable.sent</td><td>Number of AddSSTable requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
Expand Down
35 changes: 11 additions & 24 deletions pkg/kv/kvclient/kvcoord/dist_sender.go
Original file line number Diff line number Diff line change
Expand Up @@ -511,24 +511,25 @@ func makeDistSenderCircuitBreakerMetrics() DistSenderCircuitBreakerMetrics {
type rangeFeedErrorCounters struct {
RangefeedRestartRanges *metric.Counter
RangefeedErrorCatchup *metric.Counter
RetryErrors []*metric.Counter
RetryErrors map[int32]*metric.Counter
SendErrors *metric.Counter
StoreNotFound *metric.Counter
NodeNotFound *metric.Counter
RangeNotFound *metric.Counter
RangeKeyMismatch *metric.Counter
RetryUnknown *metric.Counter
}

func makeRangeFeedErrorCounters() rangeFeedErrorCounters {
var retryCounters []*metric.Counter
for name := range kvpb.RangeFeedRetryError_Reason_value {
retryCounters := make(map[int32]*metric.Counter, len(kvpb.RangeFeedRetryError_Reason_value))
for name, idx := range kvpb.RangeFeedRetryError_Reason_value {
name = strings.TrimPrefix(name, "REASON_")
retryCounters = append(retryCounters, metric.NewCounter(metric.Metadata{
retryCounters[idx] = metric.NewCounter(metric.Metadata{
Name: fmt.Sprintf("distsender.rangefeed.retry.%s", strings.ToLower(name)),
Help: fmt.Sprintf(`Number of ranges that encountered retryable %s error`, name),
Measurement: "Ranges",
Unit: metric.Unit_COUNT,
}))
})
}

retryMeta := func(name string) metric.Metadata {
Expand All @@ -549,6 +550,7 @@ func makeRangeFeedErrorCounters() rangeFeedErrorCounters {
NodeNotFound: metric.NewCounter(retryMeta("node not found")),
RangeNotFound: metric.NewCounter(retryMeta("range not found")),
RangeKeyMismatch: metric.NewCounter(retryMeta("range key mismatch")),
RetryUnknown: metric.NewCounter(retryMeta("unknown")),
}
}

Expand All @@ -558,25 +560,10 @@ func makeRangeFeedErrorCounters() rangeFeedErrorCounters {
func (c rangeFeedErrorCounters) GetRangeFeedRetryCounter(
reason kvpb.RangeFeedRetryError_Reason,
) *metric.Counter {
// Normally, retry reason values are contiguous. One way gaps could be
// introduced, is if some retry reasons are retired (deletions are
// accomplished by reserving enum value to prevent its re-use), and then more
// reason added after. Then, we can't use reason value as an index into
// retryCounters. Because this scenario is believed to be very unlikely, we
// forego any fancy re-mapping schemes, and instead opt for explicit handling.
switch reason {
case kvpb.RangeFeedRetryError_REASON_REPLICA_REMOVED,
kvpb.RangeFeedRetryError_REASON_RANGE_SPLIT,
kvpb.RangeFeedRetryError_REASON_MANUAL_RANGE_SPLIT,
kvpb.RangeFeedRetryError_REASON_RANGE_MERGED,
kvpb.RangeFeedRetryError_REASON_RAFT_SNAPSHOT,
kvpb.RangeFeedRetryError_REASON_LOGICAL_OPS_MISSING,
kvpb.RangeFeedRetryError_REASON_SLOW_CONSUMER,
kvpb.RangeFeedRetryError_REASON_NO_LEASEHOLDER,
kvpb.RangeFeedRetryError_REASON_RANGEFEED_CLOSED:
return c.RetryErrors[reason]
default:
panic(errors.AssertionFailedf("unknown retry reason %d", reason))
if metric, ok := c.RetryErrors[int32(reason)]; ok {
return metric
} else {
return c.RetryUnknown
}
}

Expand Down
20 changes: 13 additions & 7 deletions pkg/util/metric/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ func (r *Registry) AddMetricStruct(metricStruct interface{}) {
}
t := v.Type()

const allowNil = true
const disallowNil = false
for i := 0; i < v.NumField(); i++ {
vfield, tfield := v.Field(i), t.Field(i)
tname := tfield.Name
Expand All @@ -137,14 +139,18 @@ func (r *Registry) AddMetricStruct(metricStruct interface{}) {
for i := 0; i < vfield.Len(); i++ {
velem := vfield.Index(i)
telemName := fmt.Sprintf("%s[%d]", tname, i)
// Permit elements in the array to be nil.
const skipNil = true
r.addMetricValue(ctx, velem, telemName, skipNil, t)
r.addMetricValue(ctx, velem, telemName, allowNil, t)
}
case reflect.Map:
iter := vfield.MapRange()
for iter.Next() {
// telemName is only used for assertion errors.
telemName := iter.Key().String()
r.addMetricValue(ctx, iter.Value(), telemName, allowNil, t)
}
default:
// No metric fields should be nil.
const skipNil = false
r.addMetricValue(ctx, vfield, tname, skipNil, t)
r.addMetricValue(ctx, vfield, tname, disallowNil, t)
}
}
}
Expand Down Expand Up @@ -274,7 +280,7 @@ func checkFieldCanBeSkipped(
}

switch fieldType.Kind() {
case reflect.Array, reflect.Slice:
case reflect.Array, reflect.Slice, reflect.Map:
checkFieldCanBeSkipped(skipReason, fieldName, fieldType.Elem(), parentType)
case reflect.Struct:
containsMetrics := false
Expand Down Expand Up @@ -317,7 +323,7 @@ func containsMetricType(ft reflect.Type) bool {
}

switch ft.Kind() {
case reflect.Slice, reflect.Array:
case reflect.Slice, reflect.Array, reflect.Map:
return containsMetricType(ft.Elem())
case reflect.Struct:
for i := 0; i < ft.NumField(); i++ {
Expand Down
7 changes: 7 additions & 0 deletions pkg/util/metric/registry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ func TestRegistry(t *testing.T) {
StructHistogram IHistogram
NestedStructGauge NestedStruct
ArrayStructCounters [4]*Counter
MapOfCounters map[int32]*Counter
// Ensure that nil struct values in arrays are safe.
NestedStructArray [2]*NestedStruct
// A few extra ones: either not exported, or not metric objects.
Expand Down Expand Up @@ -125,6 +126,10 @@ func TestRegistry(t *testing.T) {
nil, // skipped
NewCounter(Metadata{Name: "array.struct.counter.3"}),
},
MapOfCounters: map[int32]*Counter{
1: NewCounter(Metadata{Name: "map.counter.0"}),
2: NewCounter(Metadata{Name: "map.counter.1"}),
},
NestedStructArray: [2]*NestedStruct{
0: nil, // skipped
1: {
Expand Down Expand Up @@ -154,6 +159,8 @@ func TestRegistry(t *testing.T) {
"array.struct.counter.0": {},
"array.struct.counter.1": {},
"array.struct.counter.3": {},
"map.counter.0": {},
"map.counter.1": {},
"nested.struct.array.1.gauge": {},
}
totalMetrics := len(expNames)
Expand Down

0 comments on commit a39ae7d

Please sign in to comment.