From cd806c15454155e3b6b07445348aeee0d3927d1f Mon Sep 17 00:00:00 2001 From: Andrew Werner Date: Thu, 31 Oct 2019 12:06:54 -0400 Subject: [PATCH] storage: check whether the acquiring replica can receive the lease Before this PR, when evaluating lease transfers or requests we'd check whether the evaluating replica could receive the lease rather than the acquiring replica. This patch fixes that bug by checking the state of the correct replica. Release note (bug fix): The system no longer erroneously transfers leases to replicas which are in the process of being removed which can lead to ranges being effectively unavailable due to an invalid lease. --- pkg/storage/batcheval/cmd_lease.go | 13 ++- pkg/storage/batcheval/cmd_lease_request.go | 2 +- pkg/storage/batcheval/cmd_lease_test.go | 32 ++++-- pkg/storage/batcheval/cmd_lease_transfer.go | 2 +- pkg/storage/client_lease_test.go | 110 ++++++++++++++++++++ pkg/storage/replica_test.go | 4 +- 6 files changed, 146 insertions(+), 17 deletions(-) diff --git a/pkg/storage/batcheval/cmd_lease.go b/pkg/storage/batcheval/cmd_lease.go index 1240f1f10665..96861bdf1b2f 100644 --- a/pkg/storage/batcheval/cmd_lease.go +++ b/pkg/storage/batcheval/cmd_lease.go @@ -42,11 +42,14 @@ func newFailedLeaseTrigger(isTransfer bool) result.Result { return trigger } -func checkCanReceiveLease(rec EvalContext) error { - repDesc, ok := rec.Desc().GetReplicaDescriptor(rec.StoreID()) +func checkCanReceiveLease(newLease *roachpb.Lease, rec EvalContext) error { + repDesc, ok := rec.Desc().GetReplicaDescriptorByID(newLease.Replica.ReplicaID) if !ok { - return errors.AssertionFailedf( - `could not find replica for store %s in %s`, rec.StoreID(), rec.Desc()) + if newLease.Replica.StoreID == rec.StoreID() { + return errors.AssertionFailedf( + `could not find replica for store %s in %s`, rec.StoreID(), rec.Desc()) + } + return errors.Errorf(`replica %s not found in %s`, newLease.Replica, rec.Desc()) } else if t := repDesc.GetType(); t != roachpb.VOTER_FULL { // NB: there's no harm in transferring the lease to a VOTER_INCOMING, // but we disallow it anyway. On the other hand, transferring to @@ -66,7 +69,7 @@ func checkCanReceiveLease(rec EvalContext) error { // of minProposedTS needs to be "reversible" (tricky) or we make the // lease evaluation succeed, though with a lease that's "invalid" so that // a new lease can be requested right after. - return errors.Errorf(`replica of type %s cannot hold lease`, t) + return errors.Errorf(`replica %s of type %s cannot hold lease`, repDesc, t) } return nil } diff --git a/pkg/storage/batcheval/cmd_lease_request.go b/pkg/storage/batcheval/cmd_lease_request.go index b1f45aae395a..4f6c8c9663bc 100644 --- a/pkg/storage/batcheval/cmd_lease_request.go +++ b/pkg/storage/batcheval/cmd_lease_request.go @@ -53,7 +53,7 @@ func RequestLease( // // If this check is removed at some point, the filtering of learners on the // sending side would have to be removed as well. - if err := checkCanReceiveLease(cArgs.EvalCtx); err != nil { + if err := checkCanReceiveLease(&args.Lease, cArgs.EvalCtx); err != nil { rErr.Message = err.Error() return newFailedLeaseTrigger(false /* isTransfer */), rErr } diff --git a/pkg/storage/batcheval/cmd_lease_test.go b/pkg/storage/batcheval/cmd_lease_test.go index 1c1bc3ea78ec..d2d12fd382b7 100644 --- a/pkg/storage/batcheval/cmd_lease_test.go +++ b/pkg/storage/batcheval/cmd_lease_test.go @@ -113,29 +113,45 @@ func TestLeaseCommandLearnerReplica(t *testing.T) { ctx := context.Background() const voterStoreID, learnerStoreID roachpb.StoreID = 1, 2 replicas := []roachpb.ReplicaDescriptor{ - {StoreID: voterStoreID, Type: roachpb.ReplicaTypeVoterFull()}, - {StoreID: learnerStoreID, Type: roachpb.ReplicaTypeLearner()}, + {NodeID: 1, StoreID: voterStoreID, Type: roachpb.ReplicaTypeVoterFull(), ReplicaID: 1}, + {NodeID: 2, StoreID: learnerStoreID, Type: roachpb.ReplicaTypeLearner(), ReplicaID: 2}, } desc := roachpb.RangeDescriptor{} desc.SetReplicas(roachpb.MakeReplicaDescriptors(replicas)) cArgs := CommandArgs{ EvalCtx: &mockEvalCtx{ - storeID: learnerStoreID, + storeID: voterStoreID, desc: &desc, }, - Args: &roachpb.TransferLeaseRequest{}, + Args: &roachpb.TransferLeaseRequest{ + Lease: roachpb.Lease{ + Replica: replicas[1], + }, + }, } // Learners are not allowed to become leaseholders for now, see the comments // in TransferLease and RequestLease. _, err := TransferLease(ctx, nil, cArgs, nil) - require.EqualError(t, err, `replica of type LEARNER cannot hold lease`) + require.EqualError(t, err, `replica (n2,s2):2LEARNER of type LEARNER cannot hold lease`) cArgs.Args = &roachpb.RequestLeaseRequest{} _, err = RequestLease(ctx, nil, cArgs, nil) - const exp = `cannot replace lease repl=(n0,s0):? seq=0 start=0.000000000,0 exp= ` + + const expForUnknown = `cannot replace lease repl=(n0,s0):? seq=0 start=0.000000000,0 exp= ` + `with repl=(n0,s0):? seq=0 start=0.000000000,0 exp=: ` + - `replica of type LEARNER cannot hold lease` - require.EqualError(t, err, exp) + `replica (n0,s0):? not found in r0:{-} [(n1,s1):1, (n2,s2):2LEARNER, next=0, gen=0?]` + require.EqualError(t, err, expForUnknown) + + cArgs.Args = &roachpb.RequestLeaseRequest{ + Lease: roachpb.Lease{ + Replica: replicas[1], + }, + } + _, err = RequestLease(ctx, nil, cArgs, nil) + + const expForLearner = `cannot replace lease repl=(n0,s0):? seq=0 start=0.000000000,0 exp= ` + + `with repl=(n2,s2):2LEARNER seq=0 start=0.000000000,0 exp=: ` + + `replica (n2,s2):2LEARNER of type LEARNER cannot hold lease` + require.EqualError(t, err, expForLearner) } diff --git a/pkg/storage/batcheval/cmd_lease_transfer.go b/pkg/storage/batcheval/cmd_lease_transfer.go index 946def98b443..5f35aab2e086 100644 --- a/pkg/storage/batcheval/cmd_lease_transfer.go +++ b/pkg/storage/batcheval/cmd_lease_transfer.go @@ -46,7 +46,7 @@ func TransferLease( // // If this check is removed at some point, the filtering of learners on the // sending side would have to be removed as well. - if err := checkCanReceiveLease(cArgs.EvalCtx); err != nil { + if err := checkCanReceiveLease(&args.Lease, cArgs.EvalCtx); err != nil { return newFailedLeaseTrigger(true /* isTransfer */), err } diff --git a/pkg/storage/client_lease_test.go b/pkg/storage/client_lease_test.go index 2562a49ce10e..643961e2667c 100644 --- a/pkg/storage/client_lease_test.go +++ b/pkg/storage/client_lease_test.go @@ -14,18 +14,26 @@ import ( "context" "errors" "fmt" + "runtime" + "sync" + "sync/atomic" "testing" + "time" + "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/config" "github.com/cockroachdb/cockroach/pkg/gossip" "github.com/cockroachdb/cockroach/pkg/internal/client" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/storage" + "github.com/cockroachdb/cockroach/pkg/storage/storagebase" "github.com/cockroachdb/cockroach/pkg/storage/storagepb" "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/stretchr/testify/require" ) // TestStoreRangeLease verifies that regular ranges (not some special ones at @@ -301,3 +309,105 @@ func TestGossipNodeLivenessOnLeaseChange(t *testing.T) { return nil }) } + +// TestCannotTransferLeaseToVoterOutgoing ensures that the evaluation of lease +// requests for nodes which are already in the VOTER_OUTGOING state will fail. +func TestCannotTransferLeaseToVoterOutgoing(t *testing.T) { + defer leaktest.AfterTest(t)() + ctx := context.Background() + + knobs, ltk := makeReplicationTestKnobs() + // Add a testing knob to allow us to block the change replicas command + // while it is being proposed. When we detect that the change replicas + // command to move n3 to VOTER_OUTGOING has been evaluated, we'll send + // the request to transfer the lease to n3. The hope is that it will + // get past the sanity above latch acquisition prior to change replicas + // command committing. + var scratchRangeID atomic.Value + scratchRangeID.Store(roachpb.RangeID(0)) + changeReplicasChan := make(chan chan struct{}, 1) + shouldBlock := func(args storagebase.ProposalFilterArgs) bool { + // Block if a ChangeReplicas command is removing a node from our range. + return args.Req.RangeID == scratchRangeID.Load().(roachpb.RangeID) && + args.Cmd.ReplicatedEvalResult.ChangeReplicas != nil && + len(args.Cmd.ReplicatedEvalResult.ChangeReplicas.Removed()) > 0 + } + blockIfShould := func(args storagebase.ProposalFilterArgs) { + if shouldBlock(args) { + ch := make(chan struct{}) + changeReplicasChan <- ch + <-ch + } + } + knobs.Store.(*storage.StoreTestingKnobs).TestingProposalFilter = func(args storagebase.ProposalFilterArgs) *roachpb.Error { + blockIfShould(args) + return nil + } + tc := testcluster.StartTestCluster(t, 4, base.TestClusterArgs{ + ServerArgs: base.TestServerArgs{Knobs: knobs}, + ReplicationMode: base.ReplicationManual, + }) + defer tc.Stopper().Stop(ctx) + + scratchStartKey := tc.ScratchRange(t) + desc := tc.AddReplicasOrFatal(t, scratchStartKey, tc.Targets(1, 2)...) + scratchRangeID.Store(desc.RangeID) + // Make sure n1 has the lease to start with. + err := tc.Server(0).DB().AdminTransferLease(context.TODO(), + scratchStartKey, tc.Target(0).StoreID) + require.NoError(t, err) + + // The test proceeds as follows: + // + // - Send an AdminChangeReplicasRequest to remove n3 and add n4 + // - Block the step that moves n3 to VOTER_OUTGOING on changeReplicasChan + // - Send an AdminLeaseTransfer to make n3 the leaseholder + // - Try really hard to make sure that the lease transfer at least gets to + // latch acquisition before unblocking the ChangeReplicas. + // - Unblock the ChangeReplicas. + // - Make sure the lease transfer fails. + + ltk.withStopAfterJointConfig(func() { + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + _, err = tc.Server(0).DB().AdminChangeReplicas( + client.ChangeReplicasCanMixAddAndRemoveContext(ctx), + scratchStartKey, desc, []roachpb.ReplicationChange{ + {ChangeType: roachpb.REMOVE_REPLICA, Target: tc.Target(2)}, + {ChangeType: roachpb.ADD_REPLICA, Target: tc.Target(3)}, + }) + require.NoError(t, err) + }() + ch := <-changeReplicasChan + wg.Add(1) + go func() { + defer wg.Done() + err := tc.Server(0).DB().AdminTransferLease(context.TODO(), + scratchStartKey, tc.Target(2).StoreID) + require.Error(t, err) + require.Regexp(t, + // The error generated during evaluation. + "replica.*of type VOTER_OUTGOING cannot hold lease|"+ + // If the lease transfer request has not yet made it to the latching + // phase by the time we close(ch) below, we can receive the following + // error due to the sanity checking which happens in + // AdminTransferLease before attempting to evaluate the lease + // transfer. + // We have a sleep loop below to try to encourage the lease transfer + // to make it past that sanity check prior to letting the change + // of replicas proceed. + "cannot transfer lease to replica of type VOTER_OUTGOING", err.Error()) + }() + // Try really hard to make sure that our request makes it past the + // sanity check error to the evaluation error. + for i := 0; i < 100; i++ { + runtime.Gosched() + time.Sleep(time.Microsecond) + } + close(ch) + wg.Wait() + }) + +} diff --git a/pkg/storage/replica_test.go b/pkg/storage/replica_test.go index 39e42f468299..dc8eefcc4bd8 100644 --- a/pkg/storage/replica_test.go +++ b/pkg/storage/replica_test.go @@ -1057,7 +1057,7 @@ func TestReplicaLease(t *testing.T) { Args: &roachpb.RequestLeaseRequest{ Lease: lease, }, - }, &roachpb.RequestLeaseResponse{}); !testutils.IsError(err, "illegal lease") { + }, &roachpb.RequestLeaseResponse{}); !testutils.IsError(err, "replica \\(n0,s0\\):\\? not found in r1") { t.Fatalf("unexpected error: %+v", err) } } @@ -1462,7 +1462,7 @@ func TestReplicaLeaseRejectUnknownRaftNodeID(t *testing.T) { // Remove ambiguity about where the "replica not found" error comes from. pErr = (<-ch).Err } - if !testutils.IsPError(pErr, "replica not found") { + if !testutils.IsPError(pErr, "replica.*not found") { t.Errorf("unexpected error obtaining lease for invalid store: %v", pErr) } }