From bde1e381f41eb9e5922216b2f5b2cf9d068b96b3 Mon Sep 17 00:00:00 2001 From: Austen McClernon Date: Tue, 21 May 2024 18:56:36 -0400 Subject: [PATCH] kvserver: allow retrying scatter processing with more errors Previously, scatter processing would only be retried when encountering a snapshot error. Other errors that we expect to be transient and retryable also commonly occur, such as the range descriptor changing or lease transfers being rejected. The range descriptor change error is the most common, because clients tend to issue splits alongside scatter requests, and those splits update the range descriptor. Retry failed scatter replicate processing if the returned error is one that `IsRetriableReplicationChangeError` reports as retriable, similar to range splits. Note that the maximum number of retries remains at 5 for scatter. Resolves: #124522 Release note: None --- pkg/kv/kvserver/replica_command.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index 3d516e3ade4a..d186ad95e38f 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -4156,8 +4156,12 @@ func (r *Replica) adminScatter( ctx, r, desc, conf, true /* scatter */, false, /* dryRun */ ) if err != nil { - // TODO(tbg): can this use IsRetriableReplicationError? - if isSnapshotError(err) { + // If the error is expected to be transient, retry processing the range. + // This is most likely to occur when concurrent split and scatters are + // issued, in which case the scatter may fail due to the range split + // updating the descriptor while processing. + if IsRetriableReplicationChangeError(err) { + log.VEventf(ctx, 1, "retrying scatter process after retryable error: %v", err) continue } break