Skip to content

Commit

Permalink
Merge #98116
Browse files Browse the repository at this point in the history
98116: kv: consider ErrReplicaNotFound and ErrReplicaCannotHoldLease to be retryable r=shralex a=nvanbenschoten

Fixes #96746.
Fixes #100379.

This commit considers ErrReplicaNotFound and ErrReplicaCannotHoldLease to be retryable replication change errors when thrown by lease transfer requests. In doing so, these errors will be retried by the retry loop in `Replica.executeAdminCommandWithDescriptor`.

This avoids spurious errors when a split gets blocked behind a lateral replica move like we see in the following situation:
1. issue AdminSplit
2. range in joint config, first needs to leave (maybeLeaveAtomicChangeReplicas)
3. to leave, needs to transfer lease from voter_outgoing to voter_incoming 4(a). lease transfer request sent to replica that has not yet applied the
      replication change that added the voter_incoming to the range
4(b). lease transfer request delayed and delivered after voter_incoming has
      been transferred the lease, added to the range, then removed from the
      range.

In either case, retrying the AdminSplit operation on these errors will ensure that it eventually succeeds.

Release note (bug fix): Fixed a rare race that could allow large RESTOREs to fail with a `unable to find store` error.

Co-authored-by: Nathan VanBenschoten <[email protected]>
  • Loading branch information
craig[bot] and nvanbenschoten committed Apr 7, 2023
2 parents f0e7b78 + c3519fe commit 2c85208
Show file tree
Hide file tree
Showing 9 changed files with 104 additions and 81 deletions.
13 changes: 1 addition & 12 deletions pkg/kv/kvnemesis/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,17 +356,7 @@ func transferLeaseResultIsIgnorable(res Result) bool {
return false
}
return kvserver.IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err) ||
// Only VOTER (_FULL, _INCOMING, sometimes _OUTGOING) replicas can
// hold a range lease. Attempts to transfer to lease to any other
// replica type are rejected. See CheckCanReceiveLease.
resultIsErrorStr(res, `replica cannot hold lease`) ||
// Only replicas that are part of the range can be given
// the lease. This case is hit if a TransferLease op races
// with a ChangeReplicas op.
resultIsErrorStr(res, `replica not found in RangeDescriptor`) ||
// A lease transfer that races with a replica removal may find that
// the store it was targeting is no longer part of the range.
resultIsErrorStr(res, `unable to find store \d+ in range`) ||
kvserver.IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err) ||
// A lease transfer is not permitted while a range merge is in its
// critical phase.
resultIsErrorStr(res, `cannot transfer lease while merge in progress`) ||
Expand Down Expand Up @@ -773,7 +763,6 @@ func (v *validator) processOp(op Operation) {
ignore = kvserver.IsRetriableReplicationChangeError(err) ||
kvserver.IsIllegalReplicationChangeError(err) ||
kvserver.IsReplicationChangeInProgressError(err) ||
errors.Is(err, roachpb.ErrReplicaCannotHoldLease) ||
transferLeaseResultIsIgnorable(t.Result) // replication changes can transfer leases
}
if !ignore {
Expand Down
6 changes: 3 additions & 3 deletions pkg/kv/kvserver/batcheval/cmd_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,13 @@ func TestLeaseCommandLearnerReplica(t *testing.T) {
// Learners are not allowed to become leaseholders for now, see the comments
// in TransferLease and RequestLease.
_, err := TransferLease(ctx, nil, cArgs, nil)
require.EqualError(t, err, `replica cannot hold lease`)
require.EqualError(t, err, `lease target replica cannot hold lease`)

cArgs.Args = &kvpb.RequestLeaseRequest{}
_, err = RequestLease(ctx, nil, cArgs, nil)

const expForUnknown = `cannot replace lease <empty> with <empty>: ` +
`replica not found in RangeDescriptor`
`lease target replica not found in RangeDescriptor`
require.EqualError(t, err, expForUnknown)

cArgs.Args = &kvpb.RequestLeaseRequest{
Expand All @@ -169,7 +169,7 @@ func TestLeaseCommandLearnerReplica(t *testing.T) {

const expForLearner = `cannot replace lease <empty> ` +
`with repl=(n2,s2):2LEARNER seq=0 start=0,0 exp=<nil>: ` +
`replica cannot hold lease`
`lease target replica cannot hold lease`
require.EqualError(t, err, expForLearner)
}

Expand Down
18 changes: 4 additions & 14 deletions pkg/kv/kvserver/client_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,21 +247,10 @@ func TestCannotTransferLeaseToVoterDemoting(t *testing.T) {
wg.Add(1)
go func() {
defer wg.Done()
err := tc.Server(0).DB().AdminTransferLease(context.Background(),
err := tc.Server(0).DB().AdminTransferLease(ctx,
scratchStartKey, tc.Target(2).StoreID)
require.Error(t, err)
require.Regexp(t,
// The error generated during evaluation.
"replica cannot hold lease|"+
// If the lease transfer request has not yet made it to the latching
// phase by the time we close(ch) below, we can receive the following
// error due to the sanity checking which happens in
// AdminTransferLease before attempting to evaluate the lease
// transfer.
// We have a sleep loop below to try to encourage the lease transfer
// to make it past that sanity check prior to letting the change
// of replicas proceed.
"cannot transfer lease to replica of type VOTER_DEMOTING_LEARNER", err.Error())
require.True(t, errors.Is(err, roachpb.ErrReplicaCannotHoldLease))
}()
// Try really hard to make sure that our request makes it past the
// sanity check error to the evaluation error.
Expand Down Expand Up @@ -360,7 +349,8 @@ func TestTransferLeaseToVoterDemotingFails(t *testing.T) {
// (wasLastLeaseholder = false in CheckCanReceiveLease).
err = tc.Server(0).DB().AdminTransferLease(context.Background(),
scratchStartKey, tc.Target(2).StoreID)
require.EqualError(t, err, `replica cannot hold lease`)
require.Error(t, err)
require.True(t, errors.Is(err, roachpb.ErrReplicaCannotHoldLease))
// Make sure the lease is still on n1.
leaseHolder, err = tc.FindRangeLeaseHolder(desc, nil)
require.NoError(t, err)
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/client_replica_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1637,7 +1637,7 @@ func TestLeaseExpirationBasedRangeTransfer(t *testing.T) {

{
// An invalid target should result in an error.
const expected = "unable to find store .* in range"
const expected = "lease target replica not found in RangeDescriptor"
if err := l.replica0.AdminTransferLease(ctx, 1000, false /* bypassSafetyChecks */); !testutils.IsError(err, expected) {
t.Fatalf("expected %s, but found %v", expected, err)
}
Expand Down
11 changes: 11 additions & 0 deletions pkg/kv/kvserver/markers.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ var errMarkCanRetryReplicationChangeWithUpdatedDesc = errors.New("should retry w
func IsRetriableReplicationChangeError(err error) bool {
return errors.Is(err, errMarkCanRetryReplicationChangeWithUpdatedDesc) ||
IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err) ||
IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err) ||
isSnapshotError(err)
}

Expand Down Expand Up @@ -106,3 +107,13 @@ var errMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot = errors.New(
func IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err error) bool {
return errors.Is(err, errMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot)
}

// IsLeaseTransferRejectedBecauseTargetCannotReceiveLease returns true if err
// (assumed to have been emitted by the current leaseholder when processing a
// lease transfer request) indicates that the target replica is not qualified to
// receive the lease. This could be because the current leaseholder has an
// outdated view of the target replica's state.
func IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err error) bool {
return errors.Is(err, roachpb.ErrReplicaNotFound) ||
errors.Is(err, roachpb.ErrReplicaCannotHoldLease)
}
112 changes: 70 additions & 42 deletions pkg/kv/kvserver/replica_learner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -955,51 +955,79 @@ func TestSplitWithLearnerOrJointConfig(t *testing.T) {
func TestSplitRetriesOnFailedExitOfJointConfig(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()

var rangeIDAtomic int64
var rejectedCount int
const maxRejects = 3
reqFilter := func(ctx context.Context, ba *kvpb.BatchRequest) *kvpb.Error {
rangeID := roachpb.RangeID(atomic.LoadInt64(&rangeIDAtomic))
if ba.RangeID == rangeID && ba.IsSingleTransferLeaseRequest() && rejectedCount < maxRejects {
rejectedCount++
repl := ba.Requests[0].GetTransferLease().Lease.Replica
status := raftutil.ReplicaStateProbe
err := kvserver.NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(repl, status)
return kvpb.NewError(err)
}
return nil
testCases := []struct {
name string
errFn func(*kvpb.TransferLeaseRequest) error
}{
{
name: "targetMayNeedSnapshot",
errFn: func(req *kvpb.TransferLeaseRequest) error {
repl := req.Lease.Replica
status := raftutil.ReplicaStateProbe
return kvserver.NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(repl, status)
},
},
{
name: "replicaNotFound",
errFn: func(_ *kvpb.TransferLeaseRequest) error {
return roachpb.ErrReplicaNotFound
},
},
{
name: "replicaCannotHoldLease",
errFn: func(_ *kvpb.TransferLeaseRequest) error {
return roachpb.ErrReplicaCannotHoldLease
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
var rangeIDAtomic int64
var rejectedCount int
const maxRejects = 3
reqFilter := func(ctx context.Context, ba *kvpb.BatchRequest) *kvpb.Error {
rangeID := roachpb.RangeID(atomic.LoadInt64(&rangeIDAtomic))
if ba.RangeID == rangeID && ba.IsSingleTransferLeaseRequest() && rejectedCount < maxRejects {
rejectedCount++
req := ba.Requests[0].GetTransferLease()
err := tc.errFn(req)
return kvpb.NewError(err)
}
return nil
}

knobs, ltk := makeReplicationTestKnobs()
knobs.Store.(*kvserver.StoreTestingKnobs).TestingRequestFilter = reqFilter
tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{Knobs: knobs},
ReplicationMode: base.ReplicationManual,
})
defer tc.Stopper().Stop(ctx)

scratchStartKey := tc.ScratchRange(t)
scratchDesc := tc.LookupRangeOrFatal(t, scratchStartKey)
atomic.StoreInt64(&rangeIDAtomic, int64(scratchDesc.RangeID))

// Rebalance the range from one store to the other. This will enter a joint
// configuration and then stop because of the testing knobs.
atomic.StoreInt64(&ltk.replicationAlwaysUseJointConfig, 1)
atomic.StoreInt64(&ltk.replicaAddStopAfterJointConfig, 1)
tc.RebalanceVoterOrFatal(ctx, t, scratchStartKey, tc.Target(0), tc.Target(1))

// Perform a split of the range. This will auto-transitions us out of the
// joint conf before doing work. However, because of the filter we installed
// above, this will first run into a series of retryable errors when
// attempting to perform a lease transfer. The split should retry until the
// join configuration completes.
left, right := tc.SplitRangeOrFatal(t, scratchStartKey.Next())
require.False(t, left.Replicas().InAtomicReplicationChange(), left)
require.False(t, right.Replicas().InAtomicReplicationChange(), right)

require.Equal(t, maxRejects, rejectedCount)
ctx := context.Background()
knobs, ltk := makeReplicationTestKnobs()
knobs.Store.(*kvserver.StoreTestingKnobs).TestingRequestFilter = reqFilter
tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{Knobs: knobs},
ReplicationMode: base.ReplicationManual,
})
defer tc.Stopper().Stop(ctx)

scratchStartKey := tc.ScratchRange(t)
scratchDesc := tc.LookupRangeOrFatal(t, scratchStartKey)
atomic.StoreInt64(&rangeIDAtomic, int64(scratchDesc.RangeID))

// Rebalance the range from one store to the other. This will enter a joint
// configuration and then stop because of the testing knobs.
atomic.StoreInt64(&ltk.replicationAlwaysUseJointConfig, 1)
atomic.StoreInt64(&ltk.replicaAddStopAfterJointConfig, 1)
tc.RebalanceVoterOrFatal(ctx, t, scratchStartKey, tc.Target(0), tc.Target(1))

// Perform a split of the range. This will auto-transitions us out of the
// joint conf before doing work. However, because of the filter we installed
// above, this will first run into a series of retryable errors when
// attempting to perform a lease transfer. The split should retry until the
// joint configuration completes.
left, right := tc.SplitRangeOrFatal(t, scratchStartKey.Next())
require.False(t, left.Replicas().InAtomicReplicationChange(), left)
require.False(t, right.Replicas().InAtomicReplicationChange(), right)

require.Equal(t, maxRejects, rejectedCount)
})
}
}

func TestReplicateQueueSeesLearnerOrJointConfig(t *testing.T) {
Expand Down
10 changes: 6 additions & 4 deletions pkg/kv/kvserver/replica_raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -433,11 +433,13 @@ func (r *Replica) propose(
if err := roachpb.CheckCanReceiveLease(
lhDesc, proposedDesc.Replicas(), true, /* wasLastLeaseholder */
); err != nil {
e := errors.Mark(errors.Wrapf(err, "%v received invalid ChangeReplicasTrigger %s to "+
err = errors.Handled(err)
err = errors.Mark(err, errMarkInvalidReplicationChange)
err = errors.Wrapf(err, "%v received invalid ChangeReplicasTrigger %s to "+
"remove self (leaseholder); lhRemovalAllowed: %v; current desc: %v; proposed desc: %v",
lhDesc, crt, true /* lhRemovalAllowed */, r.Desc(), proposedDesc), errMarkInvalidReplicationChange)
log.Errorf(p.ctx, "%v", e)
return kvpb.NewError(e)
lhDesc, crt, true /* lhRemovalAllowed */, r.Desc(), proposedDesc)
log.Errorf(p.ctx, "%v", err)
return kvpb.NewError(err)
}
} else if p.command.ReplicatedEvalResult.AddSSTable != nil {
log.VEvent(p.ctx, 4, "sideloadable proposal detected")
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/replica_range_lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -875,7 +875,7 @@ func (r *Replica) AdminTransferLease(
// Verify the target is a replica of the range.
var ok bool
if nextLeaseHolder, ok = desc.GetReplicaDescriptor(target); !ok {
return nil, nil, errors.Errorf("unable to find store %d in range %+v", target, desc)
return nil, nil, roachpb.ErrReplicaNotFound
}

if nextLease, ok := r.mu.pendingLeaseRequest.RequestPending(); ok &&
Expand Down
11 changes: 7 additions & 4 deletions pkg/roachpb/metadata_replicas.go
Original file line number Diff line number Diff line change
Expand Up @@ -507,12 +507,15 @@ func (c ReplicaChangeType) IsRemoval() bool {
}
}

var errReplicaNotFound = errors.Errorf(`replica not found in RangeDescriptor`)
// ErrReplicaNotFound can be returned from CheckCanReceiveLease.
//
// See: https://github.com/cockroachdb/cockroach/issues/93163.
var ErrReplicaNotFound = errors.New(`lease target replica not found in RangeDescriptor`)

// ErrReplicaCannotHoldLease can be returned from CheckCanReceiveLease.
//
// See: https://github.com/cockroachdb/cockroach/issues/93163
var ErrReplicaCannotHoldLease = errors.Errorf("replica cannot hold lease")
// See: https://github.com/cockroachdb/cockroach/issues/93163.
var ErrReplicaCannotHoldLease = errors.New(`lease target replica cannot hold lease`)

// CheckCanReceiveLease checks whether `wouldbeLeaseholder` can receive a lease.
// Returns an error if the respective replica is not eligible.
Expand Down Expand Up @@ -543,7 +546,7 @@ func CheckCanReceiveLease(
) error {
repDesc, ok := replDescs.GetReplicaDescriptorByID(wouldbeLeaseholder.ReplicaID)
if !ok {
return errReplicaNotFound
return ErrReplicaNotFound
}
if !(repDesc.IsVoterNewConfig() ||
(repDesc.IsVoterOldConfig() && replDescs.containsVoterIncoming() && wasLastLeaseholder)) {
Expand Down

0 comments on commit 2c85208

Please sign in to comment.