kv: consider ErrReplicaNotFound and ErrReplicaCannotHoldLease to be retryable #98116

Merged
13 changes: 1 addition & 12 deletions pkg/kv/kvnemesis/validator.go
Original file line number Diff line number Diff line change
Expand Up @@ -356,17 +356,7 @@ func transferLeaseResultIsIgnorable(res Result) bool {
return false
}
return kvserver.IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err) ||
// Only VOTER (_FULL, _INCOMING, sometimes _OUTGOING) replicas can
// hold a range lease. Attempts to transfer the lease to any other
// replica type are rejected. See CheckCanReceiveLease.
resultIsErrorStr(res, `replica cannot hold lease`) ||
// Only replicas that are part of the range can be given
// the lease. This case is hit if a TransferLease op races
// with a ChangeReplicas op.
resultIsErrorStr(res, `replica not found in RangeDescriptor`) ||
// A lease transfer that races with a replica removal may find that
// the store it was targeting is no longer part of the range.
resultIsErrorStr(res, `unable to find store \d+ in range`) ||
kvserver.IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err) ||
// A lease transfer is not permitted while a range merge is in its
// critical phase.
resultIsErrorStr(res, `cannot transfer lease while merge in progress`) ||
@@ -773,7 +763,6 @@ func (v *validator) processOp(op Operation) {
ignore = kvserver.IsRetriableReplicationChangeError(err) ||
kvserver.IsIllegalReplicationChangeError(err) ||
kvserver.IsReplicationChangeInProgressError(err) ||
errors.Is(err, roachpb.ErrReplicaCannotHoldLease) ||
transferLeaseResultIsIgnorable(t.Result) // replication changes can transfer leases
}
if !ignore {
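
The validator change above replaces raw message matching (resultIsErrorStr) with the marker-based IsLeaseTransferRejectedBecauseTargetCannotReceiveLease check. As a rough, self-contained sketch of why identity checks hold up better than string matching, using github.com/cockroachdb/errors and a local stand-in for the roachpb sentinel (not the real value):

package main

import (
    "fmt"
    "strings"

    "github.com/cockroachdb/errors"
)

// errReplicaNotFound is a hypothetical stand-in for roachpb.ErrReplicaNotFound.
var errReplicaNotFound = errors.New("lease target replica not found in RangeDescriptor")

func main() {
    // AdminTransferLease used to build a bespoke message ("unable to find
    // store ... in range ..."); with this PR it returns the sentinel instead.
    err := errors.Wrapf(errReplicaNotFound, "transferring lease for r%d", 42)

    // Brittle: the old string/regexp check no longer matches the new wording.
    fmt.Println(strings.Contains(err.Error(), "unable to find store")) // false

    // Robust: identity survives wrapping and rewording.
    fmt.Println(errors.Is(err, errReplicaNotFound)) // true
}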
6 changes: 3 additions & 3 deletions pkg/kv/kvserver/batcheval/cmd_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,13 @@ func TestLeaseCommandLearnerReplica(t *testing.T) {
// Learners are not allowed to become leaseholders for now, see the comments
// in TransferLease and RequestLease.
_, err := TransferLease(ctx, nil, cArgs, nil)
require.EqualError(t, err, `replica cannot hold lease`)
require.EqualError(t, err, `lease target replica cannot hold lease`)

cArgs.Args = &kvpb.RequestLeaseRequest{}
_, err = RequestLease(ctx, nil, cArgs, nil)

const expForUnknown = `cannot replace lease <empty> with <empty>: ` +
`replica not found in RangeDescriptor`
`lease target replica not found in RangeDescriptor`
require.EqualError(t, err, expForUnknown)

cArgs.Args = &kvpb.RequestLeaseRequest{
@@ -169,7 +169,7 @@ func TestLeaseCommandLearnerReplica(t *testing.T) {

const expForLearner = `cannot replace lease <empty> ` +
`with repl=(n2,s2):2LEARNER seq=0 start=0,0 exp=<nil>: ` +
`replica cannot hold lease`
`lease target replica cannot hold lease`
require.EqualError(t, err, expForLearner)
}

pkg/kv/kvserver/client_lease_test.go (18 changes: 4 additions & 14 deletions)
@@ -247,21 +247,10 @@ func TestCannotTransferLeaseToVoterDemoting(t *testing.T) {
wg.Add(1)
go func() {
defer wg.Done()
err := tc.Server(0).DB().AdminTransferLease(context.Background(),
err := tc.Server(0).DB().AdminTransferLease(ctx,
scratchStartKey, tc.Target(2).StoreID)
require.Error(t, err)
require.Regexp(t,
// The error generated during evaluation.
"replica cannot hold lease|"+
// If the lease transfer request has not yet made it to the latching
// phase by the time we close(ch) below, we can receive the following
// error due to the sanity checking which happens in
// AdminTransferLease before attempting to evaluate the lease
// transfer.
// We have a sleep loop below to try to encourage the lease transfer
// to make it past that sanity check prior to letting the change
// of replicas proceed.
"cannot transfer lease to replica of type VOTER_DEMOTING_LEARNER", err.Error())
require.True(t, errors.Is(err, roachpb.ErrReplicaCannotHoldLease))
}()
// Try really hard to make sure that our request makes it past the
// sanity check error to the evaluation error.
@@ -360,7 +349,8 @@ func TestTransferLeaseToVoterDemotingFails(t *testing.T) {
// (wasLastLeaseholder = false in CheckCanReceiveLease).
err = tc.Server(0).DB().AdminTransferLease(context.Background(),
scratchStartKey, tc.Target(2).StoreID)
require.EqualError(t, err, `replica cannot hold lease`)
require.Error(t, err)
require.True(t, errors.Is(err, roachpb.ErrReplicaCannotHoldLease))
// Make sure the lease is still on n1.
leaseHolder, err = tc.FindRangeLeaseHolder(desc, nil)
require.NoError(t, err)
pkg/kv/kvserver/client_replica_test.go (2 changes: 1 addition & 1 deletion)
@@ -1637,7 +1637,7 @@ func TestLeaseExpirationBasedRangeTransfer(t *testing.T) {

{
// An invalid target should result in an error.
const expected = "unable to find store .* in range"
const expected = "lease target replica not found in RangeDescriptor"
if err := l.replica0.AdminTransferLease(ctx, 1000, false /* bypassSafetyChecks */); !testutils.IsError(err, expected) {
t.Fatalf("expected %s, but found %v", expected, err)
}
pkg/kv/kvserver/markers.go (11 changes: 11 additions & 0 deletions)
@@ -38,6 +38,7 @@ var errMarkCanRetryReplicationChangeWithUpdatedDesc = errors.New("should retry w
func IsRetriableReplicationChangeError(err error) bool {
return errors.Is(err, errMarkCanRetryReplicationChangeWithUpdatedDesc) ||
IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err) ||
IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err) ||
isSnapshotError(err)
}

@@ -106,3 +107,13 @@ var errMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot = errors.New(
func IsLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(err error) bool {
return errors.Is(err, errMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot)
}

// IsLeaseTransferRejectedBecauseTargetCannotReceiveLease returns true if err
// (assumed to have been emitted by the current leaseholder when processing a
// lease transfer request) indicates that the target replica is not qualified to
// receive the lease. This could be because the current leaseholder has an
// outdated view of the target replica's state.
func IsLeaseTransferRejectedBecauseTargetCannotReceiveLease(err error) bool {
return errors.Is(err, roachpb.ErrReplicaNotFound) ||
errors.Is(err, roachpb.ErrReplicaCannotHoldLease)
}
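
For a sense of how a predicate like IsRetriableReplicationChangeError is consumed, here is a minimal sketch of a retry loop that refreshes its view of the range between attempts; runChange, refreshDescriptor, and isRetryable are hypothetical stand-ins, not CockroachDB APIs:

// retryReplicationChange is a sketch only; isRetryable stands in for a
// classifier such as IsRetriableReplicationChangeError.
func retryReplicationChange(
    maxAttempts int,
    isRetryable func(error) bool,
    runChange func() error,
    refreshDescriptor func(),
) error {
    var err error
    for attempt := 0; attempt < maxAttempts; attempt++ {
        if err = runChange(); err == nil {
            return nil
        }
        if !isRetryable(err) {
            return err // permanent failure; surface it to the caller
        }
        // The rejection may reflect a stale view of the range (e.g. the
        // target was removed or is mid-change); refresh before retrying.
        refreshDescriptor()
    }
    return err
}

With the two errors.Is cases folded into IsRetriableReplicationChangeError, a lease transfer rejected because of a stale descriptor now lands in the retry branch instead of failing the surrounding operation.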
112 changes: 70 additions & 42 deletions pkg/kv/kvserver/replica_learner_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -955,51 +955,79 @@ func TestSplitWithLearnerOrJointConfig(t *testing.T) {
func TestSplitRetriesOnFailedExitOfJointConfig(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()

var rangeIDAtomic int64
var rejectedCount int
const maxRejects = 3
reqFilter := func(ctx context.Context, ba *kvpb.BatchRequest) *kvpb.Error {
rangeID := roachpb.RangeID(atomic.LoadInt64(&rangeIDAtomic))
if ba.RangeID == rangeID && ba.IsSingleTransferLeaseRequest() && rejectedCount < maxRejects {
rejectedCount++
repl := ba.Requests[0].GetTransferLease().Lease.Replica
status := raftutil.ReplicaStateProbe
err := kvserver.NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(repl, status)
return kvpb.NewError(err)
}
return nil
testCases := []struct {
name string
errFn func(*kvpb.TransferLeaseRequest) error
}{
{
name: "targetMayNeedSnapshot",
errFn: func(req *kvpb.TransferLeaseRequest) error {
repl := req.Lease.Replica
status := raftutil.ReplicaStateProbe
return kvserver.NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(repl, status)
},
},
{
name: "replicaNotFound",
errFn: func(_ *kvpb.TransferLeaseRequest) error {
return roachpb.ErrReplicaNotFound
},
},
{
name: "replicaCannotHoldLease",
errFn: func(_ *kvpb.TransferLeaseRequest) error {
return roachpb.ErrReplicaCannotHoldLease
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
var rangeIDAtomic int64
var rejectedCount int
const maxRejects = 3
reqFilter := func(ctx context.Context, ba *kvpb.BatchRequest) *kvpb.Error {
rangeID := roachpb.RangeID(atomic.LoadInt64(&rangeIDAtomic))
if ba.RangeID == rangeID && ba.IsSingleTransferLeaseRequest() && rejectedCount < maxRejects {
rejectedCount++
req := ba.Requests[0].GetTransferLease()
err := tc.errFn(req)
return kvpb.NewError(err)
}
return nil
}

knobs, ltk := makeReplicationTestKnobs()
knobs.Store.(*kvserver.StoreTestingKnobs).TestingRequestFilter = reqFilter
tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{Knobs: knobs},
ReplicationMode: base.ReplicationManual,
})
defer tc.Stopper().Stop(ctx)

scratchStartKey := tc.ScratchRange(t)
scratchDesc := tc.LookupRangeOrFatal(t, scratchStartKey)
atomic.StoreInt64(&rangeIDAtomic, int64(scratchDesc.RangeID))

// Rebalance the range from one store to the other. This will enter a joint
// configuration and then stop because of the testing knobs.
atomic.StoreInt64(&ltk.replicationAlwaysUseJointConfig, 1)
atomic.StoreInt64(&ltk.replicaAddStopAfterJointConfig, 1)
tc.RebalanceVoterOrFatal(ctx, t, scratchStartKey, tc.Target(0), tc.Target(1))

// Perform a split of the range. This will auto-transition us out of the
// joint conf before doing work. However, because of the filter we installed
// above, this will first run into a series of retryable errors when
// attempting to perform a lease transfer. The split should retry until the
// join configuration completes.
left, right := tc.SplitRangeOrFatal(t, scratchStartKey.Next())
require.False(t, left.Replicas().InAtomicReplicationChange(), left)
require.False(t, right.Replicas().InAtomicReplicationChange(), right)

require.Equal(t, maxRejects, rejectedCount)
ctx := context.Background()
knobs, ltk := makeReplicationTestKnobs()
knobs.Store.(*kvserver.StoreTestingKnobs).TestingRequestFilter = reqFilter
tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{Knobs: knobs},
ReplicationMode: base.ReplicationManual,
})
defer tc.Stopper().Stop(ctx)

scratchStartKey := tc.ScratchRange(t)
scratchDesc := tc.LookupRangeOrFatal(t, scratchStartKey)
atomic.StoreInt64(&rangeIDAtomic, int64(scratchDesc.RangeID))

// Rebalance the range from one store to the other. This will enter a joint
// configuration and then stop because of the testing knobs.
atomic.StoreInt64(&ltk.replicationAlwaysUseJointConfig, 1)
atomic.StoreInt64(&ltk.replicaAddStopAfterJointConfig, 1)
tc.RebalanceVoterOrFatal(ctx, t, scratchStartKey, tc.Target(0), tc.Target(1))

// Perform a split of the range. This will auto-transition us out of the
// joint conf before doing work. However, because of the filter we installed
// above, this will first run into a series of retryable errors when
// attempting to perform a lease transfer. The split should retry until the
// joint configuration completes.
left, right := tc.SplitRangeOrFatal(t, scratchStartKey.Next())
require.False(t, left.Replicas().InAtomicReplicationChange(), left)
require.False(t, right.Replicas().InAtomicReplicationChange(), right)

require.Equal(t, maxRejects, rejectedCount)
})
}
}

func TestReplicateQueueSeesLearnerOrJointConfig(t *testing.T) {
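
The test above now runs table-driven subtests, each injecting a different rejection through the TestingRequestFilter knob until maxRejects is reached. A stripped-down sketch of that injection pattern, independent of the CockroachDB test harness (the sentinels and filter below are illustrative stand-ins):

package sketch

import (
    "errors"
    "testing"
)

// Hypothetical stand-ins for the roachpb sentinels used by the real test.
var (
    errReplicaNotFound = errors.New("lease target replica not found in RangeDescriptor")
    errCannotHoldLease = errors.New("lease target replica cannot hold lease")
)

func TestRetriesOnInjectedLeaseTransferRejection(t *testing.T) {
    testCases := []struct {
        name   string
        inject error
    }{
        {name: "replicaNotFound", inject: errReplicaNotFound},
        {name: "replicaCannotHoldLease", inject: errCannotHoldLease},
    }
    for _, tc := range testCases {
        tc := tc
        t.Run(tc.name, func(t *testing.T) {
            const maxRejects = 3
            rejected := 0
            // filter stands in for the request-filter knob: reject the first
            // maxRejects attempts, then let the request through.
            filter := func() error {
                if rejected < maxRejects {
                    rejected++
                    return tc.inject
                }
                return nil
            }
            // The operation under test is expected to keep retrying on the
            // injected rejections until the filter stops returning errors.
            for filter() != nil {
            }
            if rejected != maxRejects {
                t.Fatalf("expected %d rejections, got %d", maxRejects, rejected)
            }
        })
    }
}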
10 changes: 6 additions & 4 deletions pkg/kv/kvserver/replica_raft.go
Original file line number Diff line number Diff line change
Expand Up @@ -433,11 +433,13 @@ func (r *Replica) propose(
if err := roachpb.CheckCanReceiveLease(
lhDesc, proposedDesc.Replicas(), true, /* wasLastLeaseholder */
); err != nil {
e := errors.Mark(errors.Wrapf(err, "%v received invalid ChangeReplicasTrigger %s to "+
err = errors.Handled(err)
err = errors.Mark(err, errMarkInvalidReplicationChange)
err = errors.Wrapf(err, "%v received invalid ChangeReplicasTrigger %s to "+
"remove self (leaseholder); lhRemovalAllowed: %v; current desc: %v; proposed desc: %v",
lhDesc, crt, true /* lhRemovalAllowed */, r.Desc(), proposedDesc), errMarkInvalidReplicationChange)
log.Errorf(p.ctx, "%v", e)
return kvpb.NewError(e)
lhDesc, crt, true /* lhRemovalAllowed */, r.Desc(), proposedDesc)
log.Errorf(p.ctx, "%v", err)
return kvpb.NewError(err)
}
} else if p.command.ReplicatedEvalResult.AddSSTable != nil {
log.VEvent(p.ctx, 4, "sideloadable proposal detected")
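
The propose change above calls errors.Handled before marking with errMarkInvalidReplicationChange, presumably so that the CheckCanReceiveLease sentinels (now treated as retryable elsewhere) do not leak through and cause an invalid ChangeReplicasTrigger to be retried. A minimal sketch of that Handled/Mark interaction with github.com/cockroachdb/errors, using local stand-ins for the sentinel and the mark:

package main

import (
    "fmt"

    "github.com/cockroachdb/errors"
)

// Local stand-ins for roachpb.ErrReplicaCannotHoldLease and
// errMarkInvalidReplicationChange.
var (
    errCannotHoldLease       = errors.New("lease target replica cannot hold lease")
    errMarkInvalidReplChange = errors.New("invalid replication change")
)

func main() {
    err := errors.Wrap(errCannotHoldLease, "removing leaseholder")
    fmt.Println(errors.Is(err, errCannotHoldLease)) // true: visible through the wrap

    // Handled erects a barrier: the message is kept, but the original cause
    // no longer satisfies errors.Is.
    err = errors.Handled(err)
    fmt.Println(errors.Is(err, errCannotHoldLease)) // false

    // Mark attaches a new reference error that errors.Is can find.
    err = errors.Mark(err, errMarkInvalidReplChange)
    fmt.Println(errors.Is(err, errMarkInvalidReplChange)) // true
}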
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/replica_range_lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -875,7 +875,7 @@ func (r *Replica) AdminTransferLease(
// Verify the target is a replica of the range.
var ok bool
if nextLeaseHolder, ok = desc.GetReplicaDescriptor(target); !ok {
return nil, nil, errors.Errorf("unable to find store %d in range %+v", target, desc)
return nil, nil, roachpb.ErrReplicaNotFound
}

if nextLease, ok := r.mu.pendingLeaseRequest.RequestPending(); ok &&
pkg/roachpb/metadata_replicas.go (11 changes: 7 additions & 4 deletions)
@@ -507,12 +507,15 @@ func (c ReplicaChangeType) IsRemoval() bool {
}
}

var errReplicaNotFound = errors.Errorf(`replica not found in RangeDescriptor`)
// ErrReplicaNotFound can be returned from CheckCanReceiveLease.
//
// See: https://github.com/cockroachdb/cockroach/issues/93163.
var ErrReplicaNotFound = errors.New(`lease target replica not found in RangeDescriptor`)

// ErrReplicaCannotHoldLease can be returned from CheckCanReceiveLease.
//
// See: https://github.com/cockroachdb/cockroach/issues/93163
var ErrReplicaCannotHoldLease = errors.Errorf("replica cannot hold lease")
// See: https://github.com/cockroachdb/cockroach/issues/93163.
var ErrReplicaCannotHoldLease = errors.New(`lease target replica cannot hold lease`)

// CheckCanReceiveLease checks whether `wouldbeLeaseholder` can receive a lease.
// Returns an error if the respective replica is not eligible.
@@ -543,7 +546,7 @@ func CheckCanReceiveLease(
) error {
repDesc, ok := replDescs.GetReplicaDescriptorByID(wouldbeLeaseholder.ReplicaID)
if !ok {
return errReplicaNotFound
return ErrReplicaNotFound
}
if !(repDesc.IsVoterNewConfig() ||
(repDesc.IsVoterOldConfig() && replDescs.containsVoterIncoming() && wasLastLeaseholder)) {