Skip to content

Commit

Permalink
Merge pull request #42079 from ajwerner/backport19.2-42068
Browse files Browse the repository at this point in the history
release-19.2: storage: check whether the acquiring replica can receive the lease
  • Loading branch information
ajwerner authored Nov 1, 2019
2 parents 262e6f2 + cd806c1 commit 62801ce
Show file tree
Hide file tree
Showing 6 changed files with 146 additions and 17 deletions.
13 changes: 8 additions & 5 deletions pkg/storage/batcheval/cmd_lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,14 @@ func newFailedLeaseTrigger(isTransfer bool) result.Result {
return trigger
}

func checkCanReceiveLease(rec EvalContext) error {
repDesc, ok := rec.Desc().GetReplicaDescriptor(rec.StoreID())
func checkCanReceiveLease(newLease *roachpb.Lease, rec EvalContext) error {
repDesc, ok := rec.Desc().GetReplicaDescriptorByID(newLease.Replica.ReplicaID)
if !ok {
return errors.AssertionFailedf(
`could not find replica for store %s in %s`, rec.StoreID(), rec.Desc())
if newLease.Replica.StoreID == rec.StoreID() {
return errors.AssertionFailedf(
`could not find replica for store %s in %s`, rec.StoreID(), rec.Desc())
}
return errors.Errorf(`replica %s not found in %s`, newLease.Replica, rec.Desc())
} else if t := repDesc.GetType(); t != roachpb.VOTER_FULL {
// NB: there's no harm in transferring the lease to a VOTER_INCOMING,
// but we disallow it anyway. On the other hand, transferring to
Expand All @@ -66,7 +69,7 @@ func checkCanReceiveLease(rec EvalContext) error {
// of minProposedTS needs to be "reversible" (tricky) or we make the
// lease evaluation succeed, though with a lease that's "invalid" so that
// a new lease can be requested right after.
return errors.Errorf(`replica of type %s cannot hold lease`, t)
return errors.Errorf(`replica %s of type %s cannot hold lease`, repDesc, t)
}
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/storage/batcheval/cmd_lease_request.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func RequestLease(
//
// If this check is removed at some point, the filtering of learners on the
// sending side would have to be removed as well.
if err := checkCanReceiveLease(cArgs.EvalCtx); err != nil {
if err := checkCanReceiveLease(&args.Lease, cArgs.EvalCtx); err != nil {
rErr.Message = err.Error()
return newFailedLeaseTrigger(false /* isTransfer */), rErr
}
Expand Down
32 changes: 24 additions & 8 deletions pkg/storage/batcheval/cmd_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,29 +113,45 @@ func TestLeaseCommandLearnerReplica(t *testing.T) {
ctx := context.Background()
const voterStoreID, learnerStoreID roachpb.StoreID = 1, 2
replicas := []roachpb.ReplicaDescriptor{
{StoreID: voterStoreID, Type: roachpb.ReplicaTypeVoterFull()},
{StoreID: learnerStoreID, Type: roachpb.ReplicaTypeLearner()},
{NodeID: 1, StoreID: voterStoreID, Type: roachpb.ReplicaTypeVoterFull(), ReplicaID: 1},
{NodeID: 2, StoreID: learnerStoreID, Type: roachpb.ReplicaTypeLearner(), ReplicaID: 2},
}
desc := roachpb.RangeDescriptor{}
desc.SetReplicas(roachpb.MakeReplicaDescriptors(replicas))
cArgs := CommandArgs{
EvalCtx: &mockEvalCtx{
storeID: learnerStoreID,
storeID: voterStoreID,
desc: &desc,
},
Args: &roachpb.TransferLeaseRequest{},
Args: &roachpb.TransferLeaseRequest{
Lease: roachpb.Lease{
Replica: replicas[1],
},
},
}

// Learners are not allowed to become leaseholders for now, see the comments
// in TransferLease and RequestLease.
_, err := TransferLease(ctx, nil, cArgs, nil)
require.EqualError(t, err, `replica of type LEARNER cannot hold lease`)
require.EqualError(t, err, `replica (n2,s2):2LEARNER of type LEARNER cannot hold lease`)

cArgs.Args = &roachpb.RequestLeaseRequest{}
_, err = RequestLease(ctx, nil, cArgs, nil)

const exp = `cannot replace lease repl=(n0,s0):? seq=0 start=0.000000000,0 exp=<nil> ` +
const expForUnknown = `cannot replace lease repl=(n0,s0):? seq=0 start=0.000000000,0 exp=<nil> ` +
`with repl=(n0,s0):? seq=0 start=0.000000000,0 exp=<nil>: ` +
`replica of type LEARNER cannot hold lease`
require.EqualError(t, err, exp)
`replica (n0,s0):? not found in r0:{-} [(n1,s1):1, (n2,s2):2LEARNER, next=0, gen=0?]`
require.EqualError(t, err, expForUnknown)

cArgs.Args = &roachpb.RequestLeaseRequest{
Lease: roachpb.Lease{
Replica: replicas[1],
},
}
_, err = RequestLease(ctx, nil, cArgs, nil)

const expForLearner = `cannot replace lease repl=(n0,s0):? seq=0 start=0.000000000,0 exp=<nil> ` +
`with repl=(n2,s2):2LEARNER seq=0 start=0.000000000,0 exp=<nil>: ` +
`replica (n2,s2):2LEARNER of type LEARNER cannot hold lease`
require.EqualError(t, err, expForLearner)
}
2 changes: 1 addition & 1 deletion pkg/storage/batcheval/cmd_lease_transfer.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func TransferLease(
//
// If this check is removed at some point, the filtering of learners on the
// sending side would have to be removed as well.
if err := checkCanReceiveLease(cArgs.EvalCtx); err != nil {
if err := checkCanReceiveLease(&args.Lease, cArgs.EvalCtx); err != nil {
return newFailedLeaseTrigger(true /* isTransfer */), err
}

Expand Down
110 changes: 110 additions & 0 deletions pkg/storage/client_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,26 @@ import (
"context"
"errors"
"fmt"
"runtime"
"sync"
"sync/atomic"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/config"
"github.com/cockroachdb/cockroach/pkg/gossip"
"github.com/cockroachdb/cockroach/pkg/internal/client"
"github.com/cockroachdb/cockroach/pkg/keys"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/storagebase"
"github.com/cockroachdb/cockroach/pkg/storage/storagepb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/stretchr/testify/require"
)

// TestStoreRangeLease verifies that regular ranges (not some special ones at
Expand Down Expand Up @@ -301,3 +309,105 @@ func TestGossipNodeLivenessOnLeaseChange(t *testing.T) {
return nil
})
}

// TestCannotTransferLeaseToVoterOutgoing ensures that the evaluation of lease
// requests for nodes which are already in the VOTER_OUTGOING state will fail.
func TestCannotTransferLeaseToVoterOutgoing(t *testing.T) {
defer leaktest.AfterTest(t)()
ctx := context.Background()

knobs, ltk := makeReplicationTestKnobs()
// Add a testing knob to allow us to block the change replicas command
// while it is being proposed. When we detect that the change replicas
// command to move n3 to VOTER_OUTGOING has been evaluated, we'll send
// the request to transfer the lease to n3. The hope is that it will
// get past the sanity above latch acquisition prior to change replicas
// command committing.
var scratchRangeID atomic.Value
scratchRangeID.Store(roachpb.RangeID(0))
changeReplicasChan := make(chan chan struct{}, 1)
shouldBlock := func(args storagebase.ProposalFilterArgs) bool {
// Block if a ChangeReplicas command is removing a node from our range.
return args.Req.RangeID == scratchRangeID.Load().(roachpb.RangeID) &&
args.Cmd.ReplicatedEvalResult.ChangeReplicas != nil &&
len(args.Cmd.ReplicatedEvalResult.ChangeReplicas.Removed()) > 0
}
blockIfShould := func(args storagebase.ProposalFilterArgs) {
if shouldBlock(args) {
ch := make(chan struct{})
changeReplicasChan <- ch
<-ch
}
}
knobs.Store.(*storage.StoreTestingKnobs).TestingProposalFilter = func(args storagebase.ProposalFilterArgs) *roachpb.Error {
blockIfShould(args)
return nil
}
tc := testcluster.StartTestCluster(t, 4, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{Knobs: knobs},
ReplicationMode: base.ReplicationManual,
})
defer tc.Stopper().Stop(ctx)

scratchStartKey := tc.ScratchRange(t)
desc := tc.AddReplicasOrFatal(t, scratchStartKey, tc.Targets(1, 2)...)
scratchRangeID.Store(desc.RangeID)
// Make sure n1 has the lease to start with.
err := tc.Server(0).DB().AdminTransferLease(context.TODO(),
scratchStartKey, tc.Target(0).StoreID)
require.NoError(t, err)

// The test proceeds as follows:
//
// - Send an AdminChangeReplicasRequest to remove n3 and add n4
// - Block the step that moves n3 to VOTER_OUTGOING on changeReplicasChan
// - Send an AdminLeaseTransfer to make n3 the leaseholder
// - Try really hard to make sure that the lease transfer at least gets to
// latch acquisition before unblocking the ChangeReplicas.
// - Unblock the ChangeReplicas.
// - Make sure the lease transfer fails.

ltk.withStopAfterJointConfig(func() {
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
_, err = tc.Server(0).DB().AdminChangeReplicas(
client.ChangeReplicasCanMixAddAndRemoveContext(ctx),
scratchStartKey, desc, []roachpb.ReplicationChange{
{ChangeType: roachpb.REMOVE_REPLICA, Target: tc.Target(2)},
{ChangeType: roachpb.ADD_REPLICA, Target: tc.Target(3)},
})
require.NoError(t, err)
}()
ch := <-changeReplicasChan
wg.Add(1)
go func() {
defer wg.Done()
err := tc.Server(0).DB().AdminTransferLease(context.TODO(),
scratchStartKey, tc.Target(2).StoreID)
require.Error(t, err)
require.Regexp(t,
// The error generated during evaluation.
"replica.*of type VOTER_OUTGOING cannot hold lease|"+
// If the lease transfer request has not yet made it to the latching
// phase by the time we close(ch) below, we can receive the following
// error due to the sanity checking which happens in
// AdminTransferLease before attempting to evaluate the lease
// transfer.
// We have a sleep loop below to try to encourage the lease transfer
// to make it past that sanity check prior to letting the change
// of replicas proceed.
"cannot transfer lease to replica of type VOTER_OUTGOING", err.Error())
}()
// Try really hard to make sure that our request makes it past the
// sanity check error to the evaluation error.
for i := 0; i < 100; i++ {
runtime.Gosched()
time.Sleep(time.Microsecond)
}
close(ch)
wg.Wait()
})

}
4 changes: 2 additions & 2 deletions pkg/storage/replica_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1057,7 +1057,7 @@ func TestReplicaLease(t *testing.T) {
Args: &roachpb.RequestLeaseRequest{
Lease: lease,
},
}, &roachpb.RequestLeaseResponse{}); !testutils.IsError(err, "illegal lease") {
}, &roachpb.RequestLeaseResponse{}); !testutils.IsError(err, "replica \\(n0,s0\\):\\? not found in r1") {
t.Fatalf("unexpected error: %+v", err)
}
}
Expand Down Expand Up @@ -1462,7 +1462,7 @@ func TestReplicaLeaseRejectUnknownRaftNodeID(t *testing.T) {
// Remove ambiguity about where the "replica not found" error comes from.
pErr = (<-ch).Err
}
if !testutils.IsPError(pErr, "replica not found") {
if !testutils.IsPError(pErr, "replica.*not found") {
t.Errorf("unexpected error obtaining lease for invalid store: %v", pErr)
}
}
Expand Down

0 comments on commit 62801ce

Please sign in to comment.