Skip to content

Commit

Permalink
kv: add TestTransferLeaseDuringJointConfigWithDeadIncomingVoter
Browse files Browse the repository at this point in the history
This commit adds a new test which ensures that the lease transfer
performed during a joint config replication change that is replacing the
existing leaseholder does not get stuck even if the existing leaseholder
cannot prove that the incoming leaseholder is caught up on its log. It
does so by killing the incoming leaseholder before it receives the lease
and ensuring that the range is able to exit the joint configuration.

Currently, the range exits by bypassing safety checks during the lease
transfer, sending the lease to the dead incoming voter, letting the
lease expire, acquiring the lease on one of the non-demoting voters, and
exiting. The details here may change in the future, but the goal of this
test will not.
  • Loading branch information
nvanbenschoten committed Oct 5, 2022
1 parent 3ed2295 commit ec37e5e
Show file tree
Hide file tree
Showing 3 changed files with 107 additions and 0 deletions.
96 changes: 96 additions & 0 deletions pkg/kv/kvserver/client_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
"github.com/cockroachdb/errors"
"github.com/stretchr/testify/require"
"go.etcd.io/etcd/raft/v3"
"go.etcd.io/etcd/raft/v3/tracker"
)

// TestStoreRangeLease verifies that regular ranges (not some special ones at
Expand Down Expand Up @@ -369,6 +371,100 @@ func TestTransferLeaseToVoterDemotingWithIncoming(t *testing.T) {
})
}

// TestTransferLeaseDuringJointConfigWithDeadIncomingVoter ensures that the
// lease transfer performed during a joint config replication change that is
// replacing the existing leaseholder does not get stuck even if the existing
// leaseholder cannot prove that the incoming leaseholder is caught up on its
// log. It does so by killing the incoming leaseholder before it receives the
// lease and ensuring that the range is able to exit the joint configuration.
//
// Currently, the range exits by bypassing safety checks during the lease
// transfer, sending the lease to the dead incoming voter, letting the lease
// expire, acquiring the lease on one of the non-demoting voters, and exiting.
// The details here may change in the future, but the goal of this test will
// not.
func TestTransferLeaseDuringJointConfigWithDeadIncomingVoter(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()

// The lease request timeout depends on the Raft election timeout, so we set
// it low to get faster lease expiration (800 ms) and speed up the test.
var raftCfg base.RaftConfig
raftCfg.SetDefaults()
raftCfg.RaftHeartbeatIntervalTicks = 1
raftCfg.RaftElectionTimeoutTicks = 2

knobs, ltk := makeReplicationTestKnobs()
tc := testcluster.StartTestCluster(t, 4, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{
RaftConfig: raftCfg,
Knobs: knobs,
},
ReplicationMode: base.ReplicationManual,
})
defer tc.Stopper().Stop(ctx)

key := tc.ScratchRange(t)
desc := tc.AddVotersOrFatal(t, key, tc.Targets(1, 2)...)
// Make sure n1 has the lease to start with.
err := tc.Server(0).DB().AdminTransferLease(ctx, key, tc.Target(0).StoreID)
require.NoError(t, err)
store0, repl0 := getFirstStoreReplica(t, tc.Server(0), key)

// The test proceeds as follows:
//
// - Send an AdminChangeReplicasRequest to remove n1 (leaseholder) and add n4
// - Stop the replication change after entering the joint configuration
// - Kill n4 and wait until n1 notices
// - Complete the replication change

// Enter joint config.
ltk.withStopAfterJointConfig(func() {
tc.RebalanceVoterOrFatal(ctx, t, key, tc.Target(0), tc.Target(3))
})
desc = tc.LookupRangeOrFatal(t, key)
require.Len(t, desc.Replicas().Descriptors(), 4)
require.True(t, desc.Replicas().InAtomicReplicationChange(), desc)

// Kill n4.
tc.StopServer(3)

// Wait for n1 to notice.
testutils.SucceedsSoon(t, func() error {
// Manually report n4 as unreachable to speed up the test.
require.NoError(t, repl0.RaftReportUnreachable(4))
// Check the Raft progress.
s := repl0.RaftStatus()
require.Equal(t, raft.StateLeader, s.RaftState)
p := s.Progress
require.Len(t, p, 4)
require.Contains(t, p, uint64(4))
if p[4].State != tracker.StateProbe {
return errors.Errorf("dead replica not state probe")
}
return nil
})

// Run the range through the replicate queue on n1.
trace, processErr, err := store0.Enqueue(
ctx, "replicate", repl0, true /* skipShouldQueue */, false /* async */)
require.NoError(t, err)
require.NoError(t, processErr)
formattedTrace := trace.String()
expectedMessages := []string{
`transitioning out of joint configuration`,
`leaseholder .* is being removed through an atomic replication change, transferring lease to`,
`lease transfer to .* complete`,
}
require.NoError(t, testutils.MatchInOrder(formattedTrace, expectedMessages...))

// Verify that the joint configuration has completed.
desc = tc.LookupRangeOrFatal(t, key)
require.Len(t, desc.Replicas().VoterDescriptors(), 3)
require.False(t, desc.Replicas().InAtomicReplicationChange(), desc)
}

// TestTransferLeaseFailureDuringJointConfig reproduces
// https://github.com/cockroachdb/cockroach/issues/83687
// and makes sure that if lease transfer fails during a joint configuration
Expand Down
4 changes: 4 additions & 0 deletions pkg/kv/kvserver/closed_timestamp_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1104,6 +1104,10 @@ func countNotLeaseHolderErrors(ba roachpb.BatchRequest, repls []*kvserver.Replic
const testingTargetDuration = 300 * time.Millisecond

const testingSideTransportInterval = 100 * time.Millisecond

// TODO(nvanbenschoten): this is a pretty bad variable name to leak into the
// global scope of the kvserver_test package. At least one test was using it
// unintentionally. Remove it.
const numNodes = 3

func replsForRange(
Expand Down
7 changes: 7 additions & 0 deletions pkg/kv/kvserver/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,13 @@ func (r *Replica) RaftUnlock() {
r.raftMu.Unlock()
}

func (r *Replica) RaftReportUnreachable(id roachpb.ReplicaID) error {
return r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) {
raftGroup.ReportUnreachable(uint64(id))
return false /* unquiesceAndWakeLeader */, nil
})
}

// LastAssignedLeaseIndexRLocked returns the last assigned lease index.
func (r *Replica) LastAssignedLeaseIndex() uint64 {
r.mu.RLock()
Expand Down

0 comments on commit ec37e5e

Please sign in to comment.