diff --git a/pkg/kv/db.go b/pkg/kv/db.go index 898e4ae1a6a0..345f83f4c036 100644 --- a/pkg/kv/db.go +++ b/pkg/kv/db.go @@ -624,7 +624,13 @@ func (db *DB) AdminTransferLease( ) error { b := &Batch{} b.adminTransferLease(key, target) - return getOneErr(db.Run(ctx, b), b) + err := db.Run(ctx, b) + if err == nil { + log.Infof(ctx, "Transferring lease to StoreID %s succeeded", target.String()) + } else { + log.Infof(ctx, "Transferring lease to StoreID %s failed with error: %s", target.String(), err) + } + return getOneErr(err, b) } // AdminChangeReplicas adds or removes a set of replicas for a range. diff --git a/pkg/kv/kvserver/allocator.go b/pkg/kv/kvserver/allocator.go index 83a48310d684..848fc4bcaec6 100644 --- a/pkg/kv/kvserver/allocator.go +++ b/pkg/kv/kvserver/allocator.go @@ -1378,7 +1378,8 @@ func (a *Allocator) TransferLeaseTarget( } fallthrough case decideWithoutStats: - if !a.shouldTransferLeaseForLeaseCountConvergence(ctx, sl, source, existing) { + if !opts.disableLeaseCountConvergenceChecks && + !a.shouldTransferLeaseForLeaseCountConvergence(ctx, sl, source, existing) { return roachpb.ReplicaDescriptor{} } case shouldTransfer: diff --git a/pkg/kv/kvserver/client_lease_test.go b/pkg/kv/kvserver/client_lease_test.go index 94449910f5e1..173454213fbe 100644 --- a/pkg/kv/kvserver/client_lease_test.go +++ b/pkg/kv/kvserver/client_lease_test.go @@ -16,7 +16,6 @@ import ( "math" "runtime" "strconv" - "strings" "sync" "sync/atomic" "testing" @@ -568,6 +567,223 @@ func TestLeasePreferencesRebalance(t *testing.T) { }) } +// Tests that when leaseholder is relocated, the lease can be transferred directly to new node +// This verifies https://github.com/cockroachdb/cockroach/issues/67740 +func TestLeaseholderLocalRelocate(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + stickyRegistry := server.NewStickyInMemEnginesRegistry() + defer stickyRegistry.CloseAllStickyInMemEngines() + ctx := context.Background() + manualClock := hlc.NewHybridManualClock() + zcfg := zonepb.DefaultZoneConfig() + zcfg.LeasePreferences = []zonepb.LeasePreference{ + { + Constraints: []zonepb.Constraint{ + {Type: zonepb.Constraint_REQUIRED, Key: "region", Value: "us"}, + }, + }, + } + + serverArgs := make(map[int]base.TestServerArgs) + locality := func(region string) roachpb.Locality { + return roachpb.Locality{ + Tiers: []roachpb.Tier{ + {Key: "region", Value: region}, + }, + } + } + localities := []roachpb.Locality{ + locality("eu"), + locality("eu"), + locality("us"), + locality("us"), + } + + const numStores = 4 + const numNodes = 4 + for i := 0; i < numNodes; i++ { + serverArgs[i] = base.TestServerArgs{ + Locality: localities[i], + Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + ClockSource: manualClock.UnixNano, + DefaultZoneConfigOverride: &zcfg, + StickyEngineRegistry: stickyRegistry, + }, + }, + StoreSpecs: []base.StoreSpec{ + { + InMemory: true, + StickyInMemoryEngineID: strconv.FormatInt(int64(i), 10), + }, + }, + } + } + tc := testcluster.StartTestCluster(t, numNodes, + base.TestClusterArgs{ + ReplicationMode: base.ReplicationManual, + ServerArgsPerNode: serverArgs, + }) + defer tc.Stopper().Stop(ctx) + + _, rhsDesc := tc.SplitRangeOrFatal(t, keys.UserTableDataMin) + tc.AddVotersOrFatal(t, rhsDesc.StartKey.AsRawKey(), tc.Targets(1, 2)...) + + // We start with having the range under test on (1,2,3). + db := tc.ServerConn(0) + + // Manually move lease out of preference. 
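+	// (Node 1 is in region "eu" while the lease preference requires "us", so
+	// this moves the lease out of preference; the replicate queue run below is
+	// then expected to move it back to the "us" node.)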
+ tc.TransferRangeLeaseOrFatal(t, rhsDesc, tc.Target(0)) + + // Check that the lease is on 1 + leaseHolder, err := tc.FindRangeLeaseHolder(rhsDesc, nil) + require.NoError(t, err) + require.Equal(t, tc.Target(0), leaseHolder) + + // Cause the lease to moved based on lease preferences (to the only us node, 3) + tc.GetFirstStoreFromServer(t, 0).SetReplicateQueueActive(true) + require.NoError(t, tc.GetFirstStoreFromServer(t, 0).ForceReplicationScanAndProcess()) + + // Check that the lease moved to 3 + leaseHolder, err = tc.FindRangeLeaseHolder(rhsDesc, nil) + require.NoError(t, err) + require.Equal(t, tc.Target(2), leaseHolder) + + gossipLiveness(t, tc) + + // Relocate range 3 -> 4. + _, err = db.Exec("ALTER RANGE " + rhsDesc.RangeID.String() + " RELOCATE FROM 3 TO 4") + require.NoError(t, err) + + // Make sure lease moved to 4 + leaseHolder, err = tc.FindRangeLeaseHolder(rhsDesc, nil) + require.NoError(t, err) + require.Equal(t, tc.Target(3), leaseHolder) + + // Double check that lease moved directly 3 -> 4 + repl := tc.GetFirstStoreFromServer(t, 3).LookupReplica(roachpb.RKey(rhsDesc.StartKey.AsRawKey())) + history := repl.GetLeaseHistory() + require.Equal(t, tc.Target(2).NodeID, history[len(history)-2].Replica.NodeID) + require.Equal(t, tc.Target(3).NodeID, history[len(history)-1].Replica.NodeID) +} + +func gossipLiveness(t *testing.T, tc *testcluster.TestCluster) { + for i := range tc.Servers { + testutils.SucceedsSoon(t, tc.Servers[i].HeartbeatNodeLiveness) + } + // Make sure that all store pools have seen liveness heartbeats from everyone. + testutils.SucceedsSoon(t, func() error { + for i := range tc.Servers { + for j := range tc.Servers { + live, err := tc.GetFirstStoreFromServer(t, i).GetStoreConfig().StorePool.IsLive(tc.Target(j).StoreID) + if err != nil { + return err + } + if !live { + return errors.Errorf("Expected server %d to be suspect on server %d", j, i) + } + } + } + return nil + }) +} + +// Tests that when leaseholder is relocated, the lease is transferred directly to one of the +// preferred nodes. 
This verifies https://github.com/cockroachdb/cockroach/issues/67740 +func TestLeaseholderLocalRelocateNonPreferred(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + stickyRegistry := server.NewStickyInMemEnginesRegistry() + defer stickyRegistry.CloseAllStickyInMemEngines() + ctx := context.Background() + manualClock := hlc.NewHybridManualClock() + zcfg := zonepb.DefaultZoneConfig() + zcfg.LeasePreferences = []zonepb.LeasePreference{ + { + Constraints: []zonepb.Constraint{ + {Type: zonepb.Constraint_REQUIRED, Key: "region", Value: "eu"}, + }, + }, + } + + serverArgs := make(map[int]base.TestServerArgs) + locality := func(region string) roachpb.Locality { + return roachpb.Locality{ + Tiers: []roachpb.Tier{ + {Key: "region", Value: region}, + }, + } + } + localities := []roachpb.Locality{ + locality("eu"), + locality("eu"), + locality("us"), + locality("us"), + } + + const numStores = 4 + const numNodes = 4 + for i := 0; i < numNodes; i++ { + serverArgs[i] = base.TestServerArgs{ + Locality: localities[i], + Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + ClockSource: manualClock.UnixNano, + DefaultZoneConfigOverride: &zcfg, + StickyEngineRegistry: stickyRegistry, + }, + }, + StoreSpecs: []base.StoreSpec{ + { + InMemory: true, + StickyInMemoryEngineID: strconv.FormatInt(int64(i), 10), + }, + }, + } + } + tc := testcluster.StartTestCluster(t, numNodes, + base.TestClusterArgs{ + ReplicationMode: base.ReplicationManual, + ServerArgsPerNode: serverArgs, + }) + defer tc.Stopper().Stop(ctx) + + _, rhsDesc := tc.SplitRangeOrFatal(t, keys.UserTableDataMin) + tc.AddVotersOrFatal(t, rhsDesc.StartKey.AsRawKey(), tc.Targets(1, 2)...) + + // We start with having the range under test on (1,2,3). + db := tc.ServerConn(0) + + // Manually move lease out of preference. + tc.TransferRangeLeaseOrFatal(t, rhsDesc, tc.Target(2)) + + // Check that the lease is on 1 + leaseHolder, err := tc.FindRangeLeaseHolder(rhsDesc, nil) + require.NoError(t, err) + require.Equal(t, tc.Target(2), leaseHolder) + + gossipLiveness(t, tc) + + // Relocate range 3 -> 4. + _, err = db.Exec("ALTER RANGE " + rhsDesc.RangeID.String() + " RELOCATE FROM 3 TO 4") + require.NoError(t, err) + + // Make sure lease moved to 0 or 1, not 4 + leaseHolder, err = tc.FindRangeLeaseHolder(rhsDesc, nil) + require.NoError(t, err) + require.True(t, + leaseHolder.Equal(tc.Target(0)) || leaseHolder.Equal(tc.Target(1))) + + // Double check that lease moved directly + repl := tc.GetFirstStoreFromServer(t, 3).LookupReplica(roachpb.RKey(rhsDesc.StartKey.AsRawKey())) + history := repl.GetLeaseHistory() + require.Equal(t, tc.Target(2).NodeID, history[len(history)-2].Replica.NodeID) + require.Equal(t, leaseHolder.NodeID, history[len(history)-1].Replica.NodeID) +} + // This test replicates the behavior observed in // https://github.com/cockroachdb/cockroach/issues/62485. We verify that // when a dc with the leaseholder is lost, a node in a dc that does not have the @@ -686,29 +902,10 @@ func TestLeasePreferencesDuringOutage(t *testing.T) { return nil }) - _, processError, enqueueError := tc.GetFirstStoreFromServer(t, 0). + _, _, enqueueError := tc.GetFirstStoreFromServer(t, 0). 
ManuallyEnqueue(ctx, "replicate", repl, true) + require.NoError(t, enqueueError) - if processError != nil { - log.Infof(ctx, "a US replica stole lease, manually moving it to the EU.") - if !strings.Contains(processError.Error(), "does not have the range lease") { - t.Fatal(processError) - } - // The us replica ended up stealing the lease, so we need to manually - // transfer the lease and then do another run through the replicate queue - // to move it to the us. - tc.TransferRangeLeaseOrFatal(t, *repl.Desc(), tc.Target(0)) - testutils.SucceedsSoon(t, func() error { - if !repl.OwnsValidLease(ctx, tc.Servers[0].Clock().NowAsClockTimestamp()) { - return errors.Errorf("Expected lease to transfer to server 0") - } - return nil - }) - _, processError, enqueueError = tc.GetFirstStoreFromServer(t, 0). - ManuallyEnqueue(ctx, "replicate", repl, true) - require.NoError(t, enqueueError) - require.NoError(t, processError) - } var newLeaseHolder roachpb.ReplicationTarget testutils.SucceedsSoon(t, func() error { @@ -717,22 +914,26 @@ func TestLeasePreferencesDuringOutage(t *testing.T) { return err }) + // Check that the leaseholder is in the US srv, err := tc.FindMemberServer(newLeaseHolder.StoreID) require.NoError(t, err) region, ok := srv.Locality().Find("region") require.True(t, ok) require.Equal(t, "us", region) - require.Equal(t, 3, len(repl.Desc().Replicas().Voters().VoterDescriptors())) + // Validate that we upreplicated outside of SF. - for _, replDesc := range repl.Desc().Replicas().Voters().VoterDescriptors() { + replicas := repl.Desc().Replicas().Voters().VoterDescriptors() + require.Equal(t, 3, len(replicas)) + for _, replDesc := range replicas { serv, err := tc.FindMemberServer(replDesc.StoreID) require.NoError(t, err) dc, ok := serv.Locality().Find("dc") require.True(t, ok) require.NotEqual(t, "sf", dc) } - history := repl.GetLeaseHistory() + // make sure we see the eu node as a lease holder in the second to last position. + history := repl.GetLeaseHistory() require.Equal(t, tc.Target(0).NodeID, history[len(history)-2].Replica.NodeID) } @@ -811,7 +1012,7 @@ func TestLeasesDontThrashWhenNodeBecomesSuspect(t *testing.T) { _, rhsDesc := tc.SplitRangeOrFatal(t, keys.UserTableDataMin) tc.AddVotersOrFatal(t, rhsDesc.StartKey.AsRawKey(), tc.Targets(1, 2, 3)...) - tc.RemoveLeaseHolderOrFatal(t, rhsDesc, tc.Target(0), tc.Target(1)) + tc.RemoveLeaseHolderOrFatal(t, rhsDesc, tc.Target(0)) startKeys := make([]roachpb.Key, 20) startKeys[0] = rhsDesc.StartKey.AsRawKey() diff --git a/pkg/kv/kvserver/client_metrics_test.go b/pkg/kv/kvserver/client_metrics_test.go index 10f1808d0ed0..badf807c3fb7 100644 --- a/pkg/kv/kvserver/client_metrics_test.go +++ b/pkg/kv/kvserver/client_metrics_test.go @@ -337,7 +337,7 @@ func TestStoreMetrics(t *testing.T) { // Verify stats after addition. 
verifyStats(t, tc, 1, 2) checkGauge(t, "store 0", tc.GetFirstStoreFromServer(t, 0).Metrics().ReplicaCount, initialCount+1) - tc.RemoveLeaseHolderOrFatal(t, desc, tc.Target(0), tc.Target(1)) + tc.RemoveLeaseHolderOrFatal(t, desc, tc.Target(0)) testutils.SucceedsSoon(t, func() error { _, err := tc.GetFirstStoreFromServer(t, 0).GetReplica(desc.RangeID) if err == nil { diff --git a/pkg/kv/kvserver/client_raft_test.go b/pkg/kv/kvserver/client_raft_test.go index 5cfc9e598630..13a33e269235 100644 --- a/pkg/kv/kvserver/client_raft_test.go +++ b/pkg/kv/kvserver/client_raft_test.go @@ -3800,6 +3800,7 @@ func TestReplicateReAddAfterDown(t *testing.T) { tc.WaitForValues(t, key, []int64{16, 16, 16}) } + // TestLeaseHolderRemoveSelf verifies that a lease holder cannot remove itself // without encountering an error. func TestLeaseHolderRemoveSelf(t *testing.T) { @@ -3818,11 +3819,22 @@ func TestLeaseHolderRemoveSelf(t *testing.T) { tc.SplitRangeOrFatal(t, key) tc.AddVotersOrFatal(t, key, tc.Target(1)) - // Attempt to remove the replica from first store. - expectedErr := "invalid ChangeReplicasTrigger" - if _, err := tc.RemoveVoters(key, tc.Target(0)); !testutils.IsError(err, expectedErr) { - t.Fatalf("expected %q error trying to remove leaseholder replica; got %v", expectedErr, err) + // Remove the replica from first store. + _, err := tc.RemoveVoters(key, tc.Target(0)) + require.NoError(t, err) + + // Check that lease moved to server 2 + leaseInfo := getLeaseInfoOrFatal(t, context.Background(), tc.Servers[1].DB(), key) + rangeDesc, err := tc.LookupRange(key) + if err != nil { + t.Fatal(err) + } + replica, ok := rangeDesc.GetReplicaDescriptor(tc.Servers[1].GetFirstStoreID()) + if !ok { + t.Fatalf("expected to find replica in server 2") } + require.Equal(t, leaseInfo.Lease.Replica, replica) + leaseHolder = tc.GetFirstStoreFromServer(t, 1) // Expect that we can still successfully do a get on the range. getArgs := getArgs(key) @@ -3832,6 +3844,7 @@ func TestLeaseHolderRemoveSelf(t *testing.T) { } } + // TestRemovedReplicaError verifies that a replica that has been removed from a // range returns a RangeNotFoundError if it receives a request for that range // (not RaftGroupDeletedError, and even before the ReplicaGCQueue has run). diff --git a/pkg/kv/kvserver/client_relocate_range_test.go b/pkg/kv/kvserver/client_relocate_range_test.go index b40f43f62ec7..aaad8b31d3a1 100644 --- a/pkg/kv/kvserver/client_relocate_range_test.go +++ b/pkg/kv/kvserver/client_relocate_range_test.go @@ -236,12 +236,10 @@ func TestAdminRelocateRange(t *testing.T) { relocateAndCheck(t, tc, k, tc.Targets(4), nil /* nonVoterTargets */) }) } - // s5 (LH) ---> s3 (LH) - // Lateral movement while at replication factor one. In this case atomic - // replication changes cannot be used; we add-then-remove instead. + // Lateral movement while at replication factor one. 
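+	// Now that the lease can be handed off while exiting the joint config,
+	// this is a single atomic replication change instead of the previous
+	// add-then-remove, hence requireNumAtomic(1, 0, ...) below.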
{ - requireNumAtomic(0, 2, func() { + requireNumAtomic(1, 0, func() { relocateAndCheck(t, tc, k, tc.Targets(2), nil /* nonVoterTargets */) }) } diff --git a/pkg/kv/kvserver/client_replica_test.go b/pkg/kv/kvserver/client_replica_test.go index 2a265fbb3acc..7e24a566772f 100644 --- a/pkg/kv/kvserver/client_replica_test.go +++ b/pkg/kv/kvserver/client_replica_test.go @@ -1530,10 +1530,23 @@ func TestLeaseExtensionNotBlockedByRead(t *testing.T) { readBlocked <- struct{}{} } } +func validateLeaseholderSoon( + t *testing.T, db *kv.DB, key roachpb.Key, replica roachpb.ReplicaDescriptor, isTarget bool) { + testutils.SucceedsSoon(t, func() error { + leaseInfo := getLeaseInfoOrFatal(t, context.Background(), db, key) + if isTarget && leaseInfo.Lease.Replica != replica { + return fmt.Errorf("lease holder should be replica %+v, but is: %+v", + replica, leaseInfo.Lease.Replica) + } else if !isTarget && leaseInfo.Lease.Replica == replica { + return fmt.Errorf("lease holder still on replica: %+v", replica) + } + return nil + }) +} -func getLeaseInfo( - ctx context.Context, db *kv.DB, key roachpb.Key, -) (*roachpb.LeaseInfoResponse, error) { +func getLeaseInfoOrFatal( + t *testing.T, ctx context.Context, db *kv.DB, key roachpb.Key, +) *roachpb.LeaseInfoResponse { header := roachpb.Header{ // INCONSISTENT read with a NEAREST routing policy, since we want to make // sure that the node used to send this is the one that processes the @@ -1544,19 +1557,18 @@ func getLeaseInfo( leaseInfoReq := &roachpb.LeaseInfoRequest{RequestHeader: roachpb.RequestHeader{Key: key}} reply, pErr := kv.SendWrappedWith(ctx, db.NonTransactionalSender(), header, leaseInfoReq) if pErr != nil { - return nil, pErr.GoError() + t.Fatal(pErr) } - return reply.(*roachpb.LeaseInfoResponse), nil + return reply.(*roachpb.LeaseInfoResponse) } -func TestLeaseInfoRequest(t *testing.T) { +func TestRemoveLeaseholder(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{}) defer tc.Stopper().Stop(context.Background()) kvDB0 := tc.Servers[0].DB() - kvDB1 := tc.Servers[1].DB() key := []byte("a") rangeDesc, err := tc.LookupRange(key) @@ -1571,12 +1583,43 @@ func TestLeaseInfoRequest(t *testing.T) { t.Fatalf("expected to find replica in server %d", i) } } - mustGetLeaseInfo := func(db *kv.DB) *roachpb.LeaseInfoResponse { - resp, err := getLeaseInfo(context.Background(), db, rangeDesc.StartKey.AsRawKey()) - if err != nil { - t.Fatal(err) + + // Transfer the lease to Servers[0] so we start in a known state. Otherwise, + // there might be already a lease owned by a random node. + err = tc.TransferRangeLease(rangeDesc, tc.Target(0)) + if err != nil { + t.Fatal(err) + } + + // Now test the LeaseInfo. We might need to loop until the node we query has applied the lease. 
+ validateLeaseholderSoon(t, kvDB0, rangeDesc.StartKey.AsRawKey(), replicas[0], true) + + tc.RemoveLeaseHolderOrFatal(t, rangeDesc, tc.Target(0)) + + // Check that the lease is no longer on 0 + validateLeaseholderSoon(t, kvDB0, rangeDesc.StartKey.AsRawKey(), replicas[0], false) +} + +func TestLeaseInfoRequest(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{}) + defer tc.Stopper().Stop(context.Background()) + + kvDB0 := tc.Servers[0].DB() + + key := []byte("a") + rangeDesc, err := tc.LookupRange(key) + if err != nil { + t.Fatal(err) + } + replicas := make([]roachpb.ReplicaDescriptor, 3) + for i := 0; i < 3; i++ { + var ok bool + replicas[i], ok = rangeDesc.GetReplicaDescriptor(tc.Servers[i].GetFirstStoreID()) + if !ok { + t.Fatalf("expected to find replica in server %d", i) } - return resp } // Transfer the lease to Servers[0] so we start in a known state. Otherwise, @@ -1586,16 +1629,8 @@ func TestLeaseInfoRequest(t *testing.T) { t.Fatal(err) } - // Now test the LeaseInfo. We might need to loop until the node we query has - // applied the lease. - testutils.SucceedsSoon(t, func() error { - leaseHolderReplica := mustGetLeaseInfo(kvDB0).Lease.Replica - if leaseHolderReplica != replicas[0] { - return fmt.Errorf("lease holder should be replica %+v, but is: %+v", - replicas[0], leaseHolderReplica) - } - return nil - }) + // Now test the LeaseInfo. We might need to loop until the node we query has applied the lease. + validateLeaseholderSoon(t, kvDB0, rangeDesc.StartKey.AsRawKey(), replicas[0], true) // Transfer the lease to Server 1 and check that LeaseInfoRequest gets the // right answer. @@ -1606,26 +1641,19 @@ func TestLeaseInfoRequest(t *testing.T) { // An inconsistent LeaseInfoReqeust on the old lease holder should give us the // right answer immediately, since the old holder has definitely applied the // transfer before TransferRangeLease returned. - leaseHolderReplica := mustGetLeaseInfo(kvDB0).Lease.Replica - if !leaseHolderReplica.Equal(replicas[1]) { + leaseInfo := getLeaseInfoOrFatal(t, context.Background(), kvDB0, rangeDesc.StartKey.AsRawKey()) + if !leaseInfo.Lease.Replica.Equal(replicas[1]) { t.Fatalf("lease holder should be replica %+v, but is: %+v", - replicas[1], leaseHolderReplica) + replicas[1], leaseInfo.Lease.Replica) } // A read on the new lease holder does not necessarily succeed immediately, // since it might take a while for it to apply the transfer. - testutils.SucceedsSoon(t, func() error { - // We can't reliably do a CONSISTENT read here, even though we're reading - // from the supposed lease holder, because this node might initially be - // unaware of the new lease and so the request might bounce around for a - // while (see #8816). - leaseHolderReplica = mustGetLeaseInfo(kvDB1).Lease.Replica - if !leaseHolderReplica.Equal(replicas[1]) { - return errors.Errorf("lease holder should be replica %+v, but is: %+v", - replicas[1], leaseHolderReplica) - } - return nil - }) + // We can't reliably do a CONSISTENT read here, even though we're reading + // from the supposed lease holder, because this node might initially be + // unaware of the new lease and so the request might bounce around for a + // while (see #8816). + validateLeaseholderSoon(t, kvDB0, rangeDesc.StartKey.AsRawKey(), replicas[1], true) // Transfer the lease to Server 2 and check that LeaseInfoRequest gets the // right answer. 
@@ -1657,10 +1685,9 @@ func TestLeaseInfoRequest(t *testing.T) { t.Fatal(pErr) } resp := *(reply.(*roachpb.LeaseInfoResponse)) - leaseHolderReplica = resp.Lease.Replica - if !leaseHolderReplica.Equal(replicas[2]) { - t.Fatalf("lease holder should be replica %s, but is: %s", replicas[2], leaseHolderReplica) + if !resp.Lease.Replica.Equal(replicas[2]) { + t.Fatalf("lease holder should be replica %s, but is: %s", replicas[2], resp.Lease.Replica) } // TODO(andrei): test the side-effect of LeaseInfoRequest when there's no @@ -1991,8 +2018,8 @@ func TestClearRange(t *testing.T) { t.Fatal(err) } var actualKeys []roachpb.Key - for _, kv := range kvs { - actualKeys = append(actualKeys, kv.Key.Key) + for _, keyValue := range kvs { + actualKeys = append(actualKeys, keyValue.Key.Key) } if !reflect.DeepEqual(expectedKeys, actualKeys) { t.Fatalf("expected %v, but got %v", expectedKeys, actualKeys) @@ -2253,7 +2280,7 @@ func TestRandomConcurrentAdminChangeReplicasRequests(t *testing.T) { ctx := context.Background() defer tc.Stopper().Stop(ctx) const actors = 10 - errors := make([]error, actors) + errs := make([]error, actors) var wg sync.WaitGroup key := roachpb.Key("a") db := tc.Servers[0].DB() @@ -2288,11 +2315,11 @@ func TestRandomConcurrentAdminChangeReplicasRequests(t *testing.T) { } wg.Add(actors) for i := 0; i < actors; i++ { - go func(i int) { errors[i] = addReplicas(); wg.Done() }(i) + go func(i int) { errs[i] = addReplicas(); wg.Done() }(i) } wg.Wait() var gotSuccess bool - for _, err := range errors { + for _, err := range errs { if err != nil { require.Truef(t, kvserver.IsRetriableReplicationChangeError(err), "%s; desc: %v", err, rangeInfo.Desc) } else if gotSuccess { @@ -2318,7 +2345,7 @@ func TestChangeReplicasSwapVoterWithNonVoter(t *testing.T) { key := tc.ScratchRange(t) // NB: The test cluster starts with firstVoter having a voting replica (and // the lease) for all ranges. - firstVoter, secondVoter, nonVoter := tc.Target(0), tc.Target(1), tc.Target(3) + firstVoter, nonVoter := tc.Target(0), tc.Target(1) firstStore, err := tc.Server(0).GetStores().(*kvserver.Stores).GetStore(tc.Server(0).GetFirstStoreID()) require.NoError(t, err) firstRepl := firstStore.LookupReplica(roachpb.RKey(key)) @@ -2326,17 +2353,8 @@ func TestChangeReplicasSwapVoterWithNonVoter(t *testing.T) { " replica for the ScratchRange") tc.AddNonVotersOrFatal(t, key, nonVoter) - // TODO(aayush): Trying to swap the last voting replica with a non-voter hits - // the safeguard inside Replica.propose() as the last voting replica is always - // the leaseholder. There are a bunch of subtleties around getting a - // leaseholder to remove itself without another voter to immediately transfer - // the lease to. See #40333. - _, err = tc.SwapVoterWithNonVoter(key, firstVoter, nonVoter) - require.Regexp(t, "received invalid ChangeReplicasTrigger", err) - - tc.AddVotersOrFatal(t, key, secondVoter) - - tc.SwapVoterWithNonVoterOrFatal(t, key, secondVoter, nonVoter) + // Swap the only voting replica (leaseholder) with a non-voter + tc.SwapVoterWithNonVoterOrFatal(t, key, firstVoter, nonVoter) } // TestReplicaTombstone ensures that tombstones are written when we expect diff --git a/pkg/kv/kvserver/merge_queue.go b/pkg/kv/kvserver/merge_queue.go index 76d7d29eb041..d93bf479551a 100644 --- a/pkg/kv/kvserver/merge_queue.go +++ b/pkg/kv/kvserver/merge_queue.go @@ -285,13 +285,13 @@ func (mq *mergeQueue) process( // side and AdminRelocateRange removes any on the range it operates on. // For the sake of obviousness, just fix this all upfront. 
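+	// The LHS call passes the local replica so that, if its leaseholder is
+	// being removed while leaving a joint config, the lease can be handed off
+	// first; the RHS call passes nil since we do not act through a local RHS
+	// replica here.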
var err error - lhsDesc, err = maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, store, lhsDesc) + lhsDesc, err = maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, store, lhsDesc, lhsRepl) if err != nil { log.VEventf(ctx, 2, `%v`, err) return false, err } - rhsDesc, err = maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, store, rhsDesc) + rhsDesc, err = maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, store, rhsDesc, nil) if err != nil { log.VEventf(ctx, 2, `%v`, err) return false, err diff --git a/pkg/kv/kvserver/replica_command.go b/pkg/kv/kvserver/replica_command.go index 37128af6e541..93236cf75d0a 100644 --- a/pkg/kv/kvserver/replica_command.go +++ b/pkg/kv/kvserver/replica_command.go @@ -15,7 +15,6 @@ import ( "context" "fmt" "math/rand" - "sort" "time" "github.com/cockroachdb/cockroach/pkg/base" @@ -302,7 +301,7 @@ func (r *Replica) adminSplitWithDescriptor( // The split queue doesn't care about the set of replicas, so if we somehow // are being handed one that's in a joint state, finalize that before // continuing. - desc, err = maybeLeaveAtomicChangeReplicas(ctx, r.store, desc) + desc, err = maybeLeaveAtomicChangeReplicas(ctx, r.store, desc, r) if err != nil { return roachpb.AdminSplitResponse{}, err } @@ -973,7 +972,7 @@ func (r *Replica) ChangeReplicas( return nil, errors.New("must disable replicate queue to use ChangeReplicas manually") } } - return r.changeReplicasImpl(ctx, desc, priority, reason, details, chgs) + return r.changeReplicasImpl(ctx, desc, priority, reason, details, chgs, r) } func (r *Replica) changeReplicasImpl( @@ -983,12 +982,13 @@ func (r *Replica) changeReplicasImpl( reason kvserverpb.RangeLogEventReason, details string, chgs roachpb.ReplicationChanges, + repl *Replica, ) (updatedDesc *roachpb.RangeDescriptor, _ error) { var err error // If in a joint config, clean up. The assumption here is that the caller // of ChangeReplicas didn't even realize that they were holding on to a // joint descriptor and would rather not have to deal with that fact. - desc, err = maybeLeaveAtomicChangeReplicas(ctx, r.store, desc) + desc, err = maybeLeaveAtomicChangeReplicas(ctx, r.store, desc, repl) if err != nil { return nil, err } @@ -1066,7 +1066,7 @@ func (r *Replica) changeReplicasImpl( // If the error occurred while transitioning out of an atomic replication // change, try again here with a fresh descriptor; this is a noop // otherwise. - if _, err := maybeLeaveAtomicChangeReplicas(ctx, r.store, r.Desc()); err != nil { + if _, err := maybeLeaveAtomicChangeReplicas(ctx, r.store, r.Desc(), r); err != nil { return nil, err } if fn := r.store.cfg.TestingKnobs.ReplicaAddSkipLearnerRollback; fn != nil && fn() { @@ -1125,7 +1125,7 @@ func (r *Replica) changeReplicasImpl( // If we demoted or swapped any voters with non-voters, we likely are in a // joint config or have learners on the range. Let's exit the joint config // and remove the learners. - return maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, r.store, desc) + return maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, r.store, desc, r) } return desc, nil } @@ -1169,17 +1169,57 @@ func synthesizeTargetsByChangeType( // descriptor returned from this method will contain replicas of type LEARNER, // NON_VOTER, and VOTER_FULL only. 
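+// If a non-nil Replica is supplied and it is no longer among the voters of
+// the new configuration (i.e. the current leaseholder is being removed), the
+// lease is first transferred to another voter while still in the joint
+// configuration.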
func maybeLeaveAtomicChangeReplicas( - ctx context.Context, s *Store, desc *roachpb.RangeDescriptor, + ctx context.Context, s *Store, desc *roachpb.RangeDescriptor, r *Replica, ) (*roachpb.RangeDescriptor, error) { // We want execChangeReplicasTxn to be able to make sure it's only tasked // with leaving a joint state when it's in one, so make sure we don't call // it if we're not. - if !desc.Replicas().InAtomicReplicationChange() { + if !desc.Replicas().InAtomicReplicationChange(){ return desc, nil } // NB: this is matched on in TestMergeQueueSeesLearner. log.Eventf(ctx, "transitioning out of joint configuration %s", desc) + voters := desc.Replicas().VoterDescriptors() + if r != nil { + // Current leaseholder is being removed + beingRemoved := true + for _, v := range voters { + if v.ReplicaID == r.ReplicaID() { + beingRemoved = false + break + } + } + if beingRemoved { + target := s.allocator.TransferLeaseTarget( + ctx, + r.SpanConfig(), + voters, + r, + r.leaseholderStats, + true, /* forceDecisionWithoutStats */ + transferLeaseOptions{ + goal: followTheWorkload, + checkTransferLeaseSource: true, + checkCandidateFullness: false, + disableLeaseCountConvergenceChecks: true, + dryRun: false, + }, + ) + if target == (roachpb.ReplicaDescriptor{}) { + log.Infof(ctx, "could not find a better lease transfer target for r%d", + desc.RangeID) + } else { + log.Infof(ctx, "current leaseholder %v is being removed. Transferring lease to %v", + r.String(), target.String()) + err := s.DB().AdminTransferLease(ctx, r.startKey, target.StoreID) + if err != nil { + return nil, err + } + } + } + } + // NB: reason and detail won't be used because no range log event will be // emitted. // @@ -1199,9 +1239,9 @@ func maybeLeaveAtomicChangeReplicas( // config (if there is one), and then removes all learners. After this function // returns, all remaining replicas will be of type VOTER_FULL or NON_VOTER. func maybeLeaveAtomicChangeReplicasAndRemoveLearners( - ctx context.Context, store *Store, desc *roachpb.RangeDescriptor, + ctx context.Context, store *Store, desc *roachpb.RangeDescriptor, repl *Replica, ) (*roachpb.RangeDescriptor, error) { - desc, err := maybeLeaveAtomicChangeReplicas(ctx, store, desc) + desc, err := maybeLeaveAtomicChangeReplicas(ctx, store, desc, repl) if err != nil { return nil, err } @@ -1772,7 +1812,7 @@ func (r *Replica) execReplicationChangesForVoters( // Leave the joint config if we entered one. Also, remove any learners we // might have picked up due to removal-via-demotion. - return maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, r.store, desc) + return maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, r.store, desc, r) } // tryRollbackRaftLearner attempts to remove a learner specified by the target. @@ -2653,7 +2693,7 @@ func updateRangeDescriptor( // // This is best-effort; it's possible that the replicate queue on the // leaseholder could take action at the same time, causing errors. -func (s *Store) AdminRelocateRange( +func (r *Replica) AdminRelocateRange( ctx context.Context, rangeDesc roachpb.RangeDescriptor, voterTargets, nonVoterTargets []roachpb.ReplicationTarget, @@ -2679,14 +2719,14 @@ func (s *Store) AdminRelocateRange( // Remove learners so we don't have to think about relocating them, and leave // the joint config if we're in one. 
- newDesc, err := maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, s, &rangeDesc) + newDesc, err := maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, r.store, &rangeDesc, r) if err != nil { log.Warningf(ctx, "%v", err) return err } rangeDesc = *newDesc - rangeDesc, err = s.relocateReplicas(ctx, rangeDesc, voterTargets, nonVoterTargets) + rangeDesc, err = r.store.relocateReplicas(ctx, rangeDesc, voterTargets, nonVoterTargets) if err != nil { return err } @@ -2744,20 +2784,20 @@ func (s *Store) relocateReplicas( return rangeDesc, err } - ops, leaseTarget, err := s.relocateOne(ctx, &rangeDesc, voterTargets, nonVoterTargets) + ops, err := s.relocateOne(ctx, &rangeDesc, voterTargets, nonVoterTargets) if err != nil { return rangeDesc, err } - if leaseTarget != nil { - // NB: we may need to transfer even if there are no ops, to make - // sure the attempt is made to make the first target the final - // leaseholder. - if err := transferLease(*leaseTarget); err != nil { - return rangeDesc, err - } - } if len(ops) == 0 { - // Done. + if len(voterTargets) > 0 { + // NB: we may need to transfer even if there are no ops, to make + // sure the attempt is made to make the first target the final + // leaseholder. If there are ops, lease transfer will happen during joint config exit, + // if leader is removed. + if err := transferLease(voterTargets[0]); err != nil { + return rangeDesc, err + } + } return rangeDesc, ctx.Err() } @@ -2780,7 +2820,7 @@ func (s *Store) relocateReplicas( } if success { if fn := s.cfg.TestingKnobs.OnRelocatedOne; fn != nil { - fn(ops, leaseTarget) + fn(ops, &voterTargets[0]) } break @@ -2833,21 +2873,21 @@ func (s *Store) relocateOne( ctx context.Context, desc *roachpb.RangeDescriptor, voterTargets, nonVoterTargets []roachpb.ReplicationTarget, -) ([]roachpb.ReplicationChange, *roachpb.ReplicationTarget, error) { +) ([]roachpb.ReplicationChange, error) { if repls := desc.Replicas(); len(repls.VoterFullAndNonVoterDescriptors()) != len(repls.Descriptors()) { // The caller removed all the learners and left the joint config, so there // shouldn't be anything but voters and non_voters. - return nil, nil, errors.AssertionFailedf( + return nil, errors.AssertionFailedf( `range %s was either in a joint configuration or had learner replicas: %v`, desc, desc.Replicas()) } confReader, err := s.GetConfReader() if err != nil { - return nil, nil, errors.Wrap(err, "can't relocate range") + return nil, errors.Wrap(err, "can't relocate range") } conf, err := confReader.GetSpanConfigForKey(ctx, desc.StartKey) if err != nil { - return nil, nil, err + return nil, err } storeList, _, _ := s.allocator.storePool.getStoreList(storeFilterNone) @@ -2863,7 +2903,7 @@ func (s *Store) relocateOne( existingReplicas := desc.Replicas().Descriptors() var additionTarget, removalTarget roachpb.ReplicationTarget - var shouldAdd, shouldRemove, canPromoteNonVoter, canDemoteVoter bool + var canPromoteNonVoter, canDemoteVoter bool if len(args.targetsToAdd()) > 0 { // Each iteration, pick the most desirable replica to add. 
However, // prefer the first target because it's the one that should hold the @@ -2882,7 +2922,7 @@ func (s *Store) relocateOne( for _, candidate := range candidateTargets { store, ok := storeMap[candidate.StoreID] if !ok { - return nil, nil, fmt.Errorf( + return nil, fmt.Errorf( "cannot up-replicate to s%d; missing gossiped StoreDescriptor"+ " (the store is likely dead, draining or decommissioning)", candidate.StoreID, ) @@ -2905,7 +2945,7 @@ func (s *Store) relocateOne( args.targetType, ) if targetStore == nil { - return nil, nil, fmt.Errorf( + return nil, fmt.Errorf( "none of the remaining %ss %v are legal additions to %v", args.targetType, args.targetsToAdd(), desc.Replicas(), ) @@ -2951,10 +2991,8 @@ func (s *Store) relocateOne( }, ) } - shouldAdd = true } - var transferTarget *roachpb.ReplicationTarget if len(args.targetsToRemove()) > 0 { // Pick a replica to remove. Note that existingVoters/existingNonVoters may // already reflect a replica we're adding in the current round. This is the @@ -2974,7 +3012,7 @@ func (s *Store) relocateOne( s.allocator.scorerOptions(), ) if err != nil { - return nil, nil, errors.Wrapf( + return nil, errors.Wrapf( err, "unable to select removal target from %v; current replicas %v", args.targetsToRemove(), existingReplicas, ) @@ -2993,10 +3031,8 @@ func (s *Store) relocateOne( liReq.Key = desc.StartKey.AsRawKey() b.AddRawRequest(liReq) if err := s.DB().Run(ctx, &b); err != nil { - return nil, nil, errors.Wrap(err, "looking up lease") + return nil, errors.Wrap(err, "looking up lease") } - curLeaseholder := b.RawResponse().Responses[0].GetLeaseInfo().Lease.Replica - shouldRemove = curLeaseholder.StoreID != removalTarget.StoreID if args.targetType == voterTarget { // If the voter being removed is about to be added as a non-voter, then we // can just demote it. @@ -3005,55 +3041,24 @@ func (s *Store) relocateOne( canDemoteVoter = true } } - if !shouldRemove { - // Pick a voting replica that we can give the lease to. We sort the first - // target to the beginning (if it's there) because that's where the lease - // needs to be in the end. We also exclude the last voter if it was - // added by the add branch above (in which case it doesn't exist yet). - added := 0 - if shouldAdd { - added++ - } - sortedTargetReplicas := append( - []roachpb.ReplicaDescriptor(nil), - existingVoters[:len(existingVoters)-added]..., - ) - sort.Slice( - sortedTargetReplicas, func(i, j int) bool { - sl := sortedTargetReplicas - // finalRelocationTargets[0] goes to the front (if it's present). 
- return sl[i].StoreID == args.finalRelocationTargets()[0].StoreID - }, - ) - for _, rDesc := range sortedTargetReplicas { - if rDesc.StoreID != curLeaseholder.StoreID { - transferTarget = &roachpb.ReplicationTarget{ - NodeID: rDesc.NodeID, - StoreID: rDesc.StoreID, - } - shouldRemove = true - break - } - } - } } } var ops []roachpb.ReplicationChange - if shouldAdd && shouldRemove { + if len(args.targetsToAdd()) > 0 && len(args.targetsToRemove()) > 0 { ops, _, err = replicationChangesForRebalance( ctx, desc, len(existingVoters), additionTarget, removalTarget, args.targetType, ) if err != nil { - return nil, nil, err + return nil, err } - } else if shouldAdd { + } else if len(args.targetsToAdd()) > 0 { if canPromoteNonVoter { ops = roachpb.ReplicationChangesForPromotion(additionTarget) } else { ops = roachpb.MakeReplicationChanges(args.targetType.AddChangeType(), additionTarget) } - } else if shouldRemove { + } else if len(args.targetsToRemove()) > 0 { // Carry out the removal only if there was no lease problem above. If there // was, we're not going to do a swap in this round but just do the addition. // (Note that !shouldRemove implies that we're trying to remove the last @@ -3065,12 +3070,7 @@ func (s *Store) relocateOne( } } - if len(ops) == 0 { - // Make sure that the first target is the final leaseholder, as - // AdminRelocateRange specifies. - transferTarget = &voterTargets[0] - } - return ops, transferTarget, nil + return ops, nil } func getRelocationArgs( diff --git a/pkg/kv/kvserver/replica_learner_test.go b/pkg/kv/kvserver/replica_learner_test.go index d1859b786d60..a1b87552042b 100644 --- a/pkg/kv/kvserver/replica_learner_test.go +++ b/pkg/kv/kvserver/replica_learner_test.go @@ -837,8 +837,8 @@ func TestLearnerNoAcceptLease(t *testing.T) { } } -// TestJointConfigLease verifies that incoming and outgoing voters can't have the -// lease transferred to them. +// TestJointConfigLease verifies that incoming voters can have the +// lease transferred to them, and outgoing voters cannot. func TestJointConfigLease(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) @@ -857,14 +857,14 @@ func TestJointConfigLease(t *testing.T) { require.True(t, desc.Replicas().InAtomicReplicationChange(), desc) err := tc.TransferRangeLease(desc, tc.Target(1)) - exp := `replica cannot hold lease` - require.True(t, testutils.IsError(err, exp), err) + require.NoError(t, err) // NB: we don't have to transition out of the previous joint config first // because this is done automatically by ChangeReplicas before it does what // it's asked to do. - desc = tc.RemoveVotersOrFatal(t, k, tc.Target(1)) - err = tc.TransferRangeLease(desc, tc.Target(1)) + desc = tc.RemoveVotersOrFatal(t, k, tc.Target(0)) + err = tc.TransferRangeLease(desc, tc.Target(0)) + exp := `replica cannot hold lease` require.True(t, testutils.IsError(err, exp), err) } @@ -1186,7 +1186,7 @@ func TestMergeQueueSeesLearnerOrJointConfig(t *testing.T) { require.False(t, desc.Replicas().InAtomicReplicationChange(), desc) // Repeat the game, except now we start with two replicas and we're - // giving the RHS a VOTER_OUTGOING. + // giving the RHS a VOTER_DEMOTING_LEARNER. 
 	desc = splitAndUnsplit()
 	ltk.withStopAfterJointConfig(func() {
 		descRight := tc.RemoveVotersOrFatal(t, desc.EndKey.AsRawKey(), tc.Target(1))
diff --git a/pkg/kv/kvserver/replica_raft.go b/pkg/kv/kvserver/replica_raft.go
index a0fb7e8ce8c8..636e1ea02487 100644
--- a/pkg/kv/kvserver/replica_raft.go
+++ b/pkg/kv/kvserver/replica_raft.go
@@ -299,27 +299,36 @@ func (r *Replica) propose(
 		log.Infof(p.ctx, "proposing %s", crt)
 		prefix = false
 
-		// Ensure that we aren't trying to remove ourselves from the range without
-		// having previously given up our lease, since the range won't be able
-		// to make progress while the lease is owned by a removed replica (and
-		// leases can stay in such a state for a very long time when using epoch-
-		// based range leases). This shouldn't happen often, but has been seen
-		// before (#12591).
+		// The following deals with removing a leaseholder. A voter can be removed in two ways.
+		// 1) Simple (old style) where there is a reconfiguration turning a voter into a LEARNER /
+		// NON-VOTER. 2) Through an intermediate joint configuration, where the replica remains in
+		// the descriptor, but as VOTER_{OUTGOING, DEMOTING}. When leaving the JOINT config (a second
+		// Raft operation), the removed replica transitions to a LEARNER / NON-VOTER.
 		//
-		// Note that due to atomic replication changes, when a removal is initiated,
-		// the replica remains in the descriptor, but as VOTER_{OUTGOING,DEMOTING}.
-		// We want to block it from getting into that state in the first place,
-		// since there's no stopping the actual removal/demotion once it's there.
-		// The Removed() field has contains these replicas when this first
-		// transition is initiated, so its use here is copacetic.
+		// In case (1) the lease needs to be transferred out before a removal is proposed (cooperative
+		// transfer). The code below permits leaseholder removal only if entering a joint configuration
+		// (option 2 above) in which the leaseholder is (any kind of) a voter. In this case, the lease is
+		// transferred to a different voter (potentially incoming) in maybeLeaveAtomicChangeReplicas
+		// right before we exit the joint configuration.
+		//
+		// When the leaseholder is replaced by a new replica, transferring the lease in the joint config
+		// allows transferring directly from old to new, since both are active in the joint config,
+		// without going through a third node or adding the new node before transferring, which might
+		// reduce fault tolerance. For example, consider v1 in region1 (leaseholder), v2 in region2 and
+		// v3 in region3. We want to relocate v1 to a new node v4 in region1. We add v4 as a LEARNER.
+		// At this point we can't transfer the lease to v4, so we could transfer it to v2 first, but
+		// this is likely to hurt application performance. We could instead add v4 as a VOTER first
+		// and then transfer the lease directly to v4, but this would change the number of replicas to 4,
+		// and if region1 goes down, we lose quorum. Instead, we move to a joint config where v1
+		// (VOTER_DEMOTING_LEARNER) transfers the lease to v4 (VOTER_INCOMING) directly.
+		// See also https://github.com/cockroachdb/cockroach/issues/67740.
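+		//
+		// Illustrative sequence for the example above:
+		//   initial:     v1 (VOTER_FULL, leaseholder), v2, v3
+		//   enter joint: v1 (VOTER_DEMOTING_LEARNER, leaseholder), v4 (VOTER_INCOMING), v2, v3
+		//   transfer:    lease moves v1 -> v4 while both are voters of the joint config
+		//   leave joint: v4 (VOTER_FULL, leaseholder), v2, v3; v1 drops to LEARNER and is removed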
replID := r.ReplicaID() - for _, rDesc := range crt.Removed() { - if rDesc.ReplicaID == replID { - err := errors.Mark(errors.Newf("received invalid ChangeReplicasTrigger %s to remove self (leaseholder)", crt), - errMarkInvalidReplicationChange) - log.Errorf(p.ctx, "%v", err) - return roachpb.NewError(err) - } + rDesc, ok := p.command.ReplicatedEvalResult.State.Desc.GetReplicaDescriptorByID(replID) + if !ok || !rDesc.IsAnyVoter() { + err := errors.Mark(errors.Newf("received invalid ChangeReplicasTrigger %s to remove self (leaseholder)", crt), + errMarkInvalidReplicationChange) + log.Errorf(p.ctx, "%v", err) + return roachpb.NewError(err) } } else if p.command.ReplicatedEvalResult.AddSSTable != nil { log.VEvent(p.ctx, 4, "sideloadable proposal detected") diff --git a/pkg/kv/kvserver/replica_send.go b/pkg/kv/kvserver/replica_send.go index 38e860590a9c..8d8305593de6 100644 --- a/pkg/kv/kvserver/replica_send.go +++ b/pkg/kv/kvserver/replica_send.go @@ -809,7 +809,7 @@ func (r *Replica) executeAdminBatch( } case *roachpb.AdminRelocateRangeRequest: - err := r.store.AdminRelocateRange(ctx, *r.Desc(), tArgs.VoterTargets, tArgs.NonVoterTargets) + err := r.AdminRelocateRange(ctx, *r.Desc(), tArgs.VoterTargets, tArgs.NonVoterTargets) pErr = roachpb.NewError(err) resp = &roachpb.AdminRelocateRangeResponse{} diff --git a/pkg/kv/kvserver/replicate_queue.go b/pkg/kv/kvserver/replicate_queue.go index 75e12ffcaa48..cfa3248df453 100644 --- a/pkg/kv/kvserver/replicate_queue.go +++ b/pkg/kv/kvserver/replicate_queue.go @@ -483,7 +483,7 @@ func (rq *replicateQueue) processOneChange( case AllocatorConsiderRebalance: return rq.considerRebalance(ctx, repl, voterReplicas, nonVoterReplicas, canTransferLeaseFrom, dryRun) case AllocatorFinalizeAtomicReplicationChange: - _, err := maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, repl.store, repl.Desc()) + _, err := maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, repl.store, repl.Desc(), repl) // Requeue because either we failed to transition out of a joint state // (bad) or we did and there might be more to do for that range. return true, err @@ -1106,13 +1106,6 @@ func (rq *replicateQueue) considerRebalance( if !ok { log.VEventf(ctx, 1, "no suitable rebalance target for non-voters") - } else if done, err := rq.maybeTransferLeaseAway( - ctx, repl, removeTarget.StoreID, dryRun, canTransferLeaseFrom, - ); err != nil { - log.VEventf(ctx, 1, "want to remove self, but failed to transfer lease away: %s", err) - } else if done { - // Lease is now elsewhere, so we're not in charge any more. - return false, nil } else { // If we have a valid rebalance action (ok == true) and we haven't // transferred our lease away, execute the rebalance. @@ -1292,6 +1285,8 @@ type transferLeaseOptions struct { // checkCandidateFullness, when false, tells `TransferLeaseTarget` // to disregard the existing lease counts on candidates. checkCandidateFullness bool + // when true, ignores lease count convergence considerations + disableLeaseCountConvergenceChecks bool dryRun bool } @@ -1390,7 +1385,8 @@ func (rq *replicateQueue) changeReplicas( // NB: this calls the impl rather than ChangeReplicas because // the latter traps tests that try to call it while the replication // queue is active. 
- if _, err := repl.changeReplicasImpl(ctx, desc, priority, reason, details, chgs); err != nil { + if _, err := repl.changeReplicasImpl( + ctx, desc, priority, reason, details, chgs, repl); err != nil { return err } rangeUsageInfo := rangeUsageInfoForRepl(repl) diff --git a/pkg/kv/kvserver/store_rebalancer.go b/pkg/kv/kvserver/store_rebalancer.go index d28f86450a8a..43a661a5898e 100644 --- a/pkg/kv/kvserver/store_rebalancer.go +++ b/pkg/kv/kvserver/store_rebalancer.go @@ -345,7 +345,7 @@ func (sr *StoreRebalancer) rebalanceStore( timeout := sr.rq.processTimeoutFunc(sr.st, replWithStats.repl) if err := contextutil.RunWithTimeout(ctx, "relocate range", timeout, func(ctx context.Context) error { - return sr.rq.store.AdminRelocateRange(ctx, *descBeforeRebalance, voterTargets, nonVoterTargets) + return replWithStats.repl.AdminRelocateRange(ctx, *descBeforeRebalance, voterTargets, nonVoterTargets) }); err != nil { log.Errorf(ctx, "unable to relocate range to %v: %+v", voterTargets, err) continue diff --git a/pkg/roachpb/metadata.go b/pkg/roachpb/metadata.go index 63b21b88558c..8cc5c558b799 100644 --- a/pkg/roachpb/metadata.go +++ b/pkg/roachpb/metadata.go @@ -485,6 +485,15 @@ func (r ReplicaDescriptor) IsVoterNewConfig() bool { } } +func (r ReplicaDescriptor) IsAnyVoter() bool { + switch r.GetType() { + case VOTER_FULL, VOTER_INCOMING, VOTER_OUTGOING, VOTER_DEMOTING_NON_VOTER, VOTER_DEMOTING_LEARNER: + return true + default: + return false + } +} + // PercentilesFromData derives percentiles from a slice of data points. // Sorts the input data if it isn't already sorted. func PercentilesFromData(data []float64) Percentiles { diff --git a/pkg/roachpb/metadata_replicas.go b/pkg/roachpb/metadata_replicas.go index 2262f32ff955..e82c057833f1 100644 --- a/pkg/roachpb/metadata_replicas.go +++ b/pkg/roachpb/metadata_replicas.go @@ -524,17 +524,7 @@ func CheckCanReceiveLease(wouldbeLeaseholder ReplicaDescriptor, rngDesc *RangeDe repDesc, ok := rngDesc.GetReplicaDescriptorByID(wouldbeLeaseholder.ReplicaID) if !ok { return errReplicaNotFound - } else if t := repDesc.GetType(); t != VOTER_FULL { - // NB: there's no harm in transferring the lease to a VOTER_INCOMING, - // but we disallow it anyway. On the other hand, transferring to - // VOTER_OUTGOING would be a pretty bad idea since those voters are - // dropped when transitioning out of the joint config, which then - // amounts to removing the leaseholder without any safety precautions. - // This would either wedge the range or allow illegal reads to be - // served. - // - // Since the leaseholder can't remove itself and is a VOTER_FULL, we - // also know that in any configuration there's at least one VOTER_FULL. 
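+	// Replicas that will be voters in the new configuration (e.g.
+	// VOTER_INCOMING) may now receive the lease; outgoing and demoting
+	// voters, learners and non-voters still may not.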
+	} else if !repDesc.IsVoterNewConfig() {
 		return errReplicaCannotHoldLease
 	}
 	return nil
diff --git a/pkg/testutils/testcluster/testcluster.go b/pkg/testutils/testcluster/testcluster.go
index 5e48ee123a21..e364f7871c71 100644
--- a/pkg/testutils/testcluster/testcluster.go
+++ b/pkg/testutils/testcluster/testcluster.go
@@ -883,7 +883,7 @@ func (tc *TestCluster) TransferRangeLeaseOrFatal(
 	t testing.TB, rangeDesc roachpb.RangeDescriptor, dest roachpb.ReplicationTarget,
 ) {
 	if err := tc.TransferRangeLease(rangeDesc, dest); err != nil {
-		t.Fatalf(`could transfer lease for range %s error is %+v`, rangeDesc, err)
+		t.Fatalf(`could not transfer lease for range %s: %+v`, rangeDesc, err)
 	}
 }
 
@@ -892,12 +892,8 @@ func (tc *TestCluster) RemoveLeaseHolderOrFatal(
 	t testing.TB,
 	rangeDesc roachpb.RangeDescriptor,
 	src roachpb.ReplicationTarget,
-	dest roachpb.ReplicationTarget,
 ) {
 	testutils.SucceedsSoon(t, func() error {
-		if err := tc.TransferRangeLease(rangeDesc, dest); err != nil {
-			return err
-		}
 		if _, err := tc.RemoveVoters(rangeDesc.StartKey.AsRawKey(), src); err != nil {
 			if strings.Contains(err.Error(), "to remove self (leaseholder)") {
 				return err
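Taken together, the tests above exercise the new behavior end to end. A condensed recap of the direct-lease-transfer check from TestLeaseholderLocalRelocate (abbreviated, built only from helpers that appear in this diff; not part of the patch):

	// Relocate the leaseholder's replica from store 3 to store 4. The lease is
	// expected to move directly from the old replica to the new one, without an
	// intermediate hop through another node.
	_, err = db.Exec("ALTER RANGE " + rhsDesc.RangeID.String() + " RELOCATE FROM 3 TO 4")
	require.NoError(t, err)

	// The last two lease-history entries on the new leaseholder are the old and
	// the new node, in that order.
	repl := tc.GetFirstStoreFromServer(t, 3).LookupReplica(roachpb.RKey(rhsDesc.StartKey.AsRawKey()))
	history := repl.GetLeaseHistory()
	require.Equal(t, tc.Target(2).NodeID, history[len(history)-2].Replica.NodeID)
	require.Equal(t, tc.Target(3).NodeID, history[len(history)-1].Replica.NodeID)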