kvserver: replace multiTestContext with TestCluster in client_raft_test
Makes progress on cockroachdb#8299
Fixes cockroachdb#40351

multiTestContext is a legacy construct that is deprecated in favor of running
tests via TestCluster. This is one PR out of many to remove the usage of
multiTestContext in the client_raft test cases. It does not remove all the
uses of mtc, just the simple ones; the more complex use cases are left for a later PR.

With this switch we can also clean up some TestingKnobs and TestServer interfaces:
    - The DisablePeriodicGossips flag is removed; it does not work with TestCluster
      and is no longer used.
    - The DontPreventUseOfOldLeaseOnStart flag is removed; it did not work consistently
      in TestCluster. This flag tried to keep the lease on the same node after a
      restart, but CRDB makes no such guarantee in the real world, and artificially
      testing it does not prove anything. The affected tests were reworked to
      not rely on this condition and can deal with the lease holder moving on a restart.

Release note: None
lunevalex committed Nov 20, 2020
1 parent 189bdaa commit 748ee9a
Showing 9 changed files with 1,290 additions and 1,177 deletions.
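For orientation, the migrated tests follow the standard TestCluster pattern instead of assembling a multiTestContext by hand. The sketch below shows that pattern in minimal form; it is not code from this diff, the test name and replication targets are illustrative, and it assumes the testcluster helpers ScratchRange and AddVotersOrFatal behave as they do at this commit.

package kvserver_test

import (
	"context"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
)

func TestExampleWithTestCluster(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	ctx := context.Background()
	// Start a 3-node in-memory cluster and manage replication manually,
	// which is the role multiTestContext used to play.
	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
		ReplicationMode: base.ReplicationManual,
	})
	defer tc.Stopper().Stop(ctx)

	// Split off a scratch range and replicate it to the other two nodes,
	// replacing the old mtc.replicateRange helper.
	key := tc.ScratchRange(t)
	tc.AddVotersOrFatal(t, key, tc.Target(1), tc.Target(2))
}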
2,293 changes: 1,157 additions & 1,136 deletions pkg/kv/kvserver/client_raft_test.go

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions pkg/kv/kvserver/node_liveness_test.go
@@ -25,6 +25,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
@@ -66,6 +67,18 @@ func pauseNodeLivenessHeartbeatLoops(mtc *multiTestContext) func() {
}
}

func pauseNodeLivenessHeartbeatLoopsTC(tc *testcluster.TestCluster) func() {
var enableFns []func()
for _, server := range tc.Servers {
enableFns = append(enableFns, server.NodeLiveness().(*liveness.NodeLiveness).PauseHeartbeatLoopForTest())
}
return func() {
for _, fn := range enableFns {
fn()
}
}
}

func TestNodeLiveness(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
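The TestCluster variant of the pause helper is used the same way as the mtc one: call it once to pause the heartbeat loop on every server, then invoke the returned closure to resume them. A hypothetical call site inside a test in this package (the cluster setup is assumed and not part of this diff; base, context, and testcluster are assumed to be imported):

	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{})
	defer tc.Stopper().Stop(context.Background())

	// Pause liveness heartbeats on every server so the test can observe a
	// quiescent liveness state, then resume them on the way out.
	resumeHeartbeats := pauseNodeLivenessHeartbeatLoopsTC(tc)
	defer resumeHeartbeats()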
4 changes: 0 additions & 4 deletions pkg/kv/kvserver/replica_gossip.go
@@ -294,10 +294,6 @@ func (r *Replica) maybeGossipFirstRange(ctx context.Context) *roachpb.Error {
log.Errorf(ctx, "failed to gossip cluster ID: %+v", err)
}

if r.store.cfg.TestingKnobs.DisablePeriodicGossips {
return nil
}

hasLease, pErr := r.getLeaseForGossip(ctx)
if pErr != nil {
return pErr
26 changes: 10 additions & 16 deletions pkg/kv/kvserver/replica_init.go
@@ -172,22 +172,16 @@ func (r *Replica) loadRaftMuLockedReplicaMuLocked(desc *roachpb.RangeDescriptor)

r.setDescLockedRaftMuLocked(ctx, desc)

// Init the minLeaseProposedTS such that we won't use an existing lease (if
// any). This is so that, after a restart, we don't propose under old leases.
// If the replica is being created through a split, this value will be
// overridden.
if !r.store.cfg.TestingKnobs.DontPreventUseOfOldLeaseOnStart {
// Only do this if there was a previous lease. This shouldn't be important
// to do but consider that the first lease which is obtained is back-dated
// to a zero start timestamp (and this de-flakes some tests). If we set the
// min proposed TS here, this lease could not be renewed (by the semantics
// of minLeaseProposedTS); and since minLeaseProposedTS is copied on splits,
// this problem would multiply to a number of replicas at cluster bootstrap.
// Instead, we make the first lease special (which is OK) and the problem
// disappears.
if r.mu.state.Lease.Sequence > 0 {
r.mu.minLeaseProposedTS = r.Clock().Now()
}
// Only do this if there was a previous lease. This shouldn't be important
// to do but consider that the first lease which is obtained is back-dated
// to a zero start timestamp (and this de-flakes some tests). If we set the
// min proposed TS here, this lease could not be renewed (by the semantics
// of minLeaseProposedTS); and since minLeaseProposedTS is copied on splits,
// this problem would multiply to a number of replicas at cluster bootstrap.
// Instead, we make the first lease special (which is OK) and the problem
// disappears.
if r.mu.state.Lease.Sequence > 0 {
r.mu.minLeaseProposedTS = r.Clock().Now()
}

ssBase := r.Engine().GetAuxiliaryDir()
16 changes: 10 additions & 6 deletions pkg/kv/kvserver/store.go
@@ -790,6 +790,16 @@ func (sc *StoreConfig) LeaseExpiration() int64 {
return 2 * (sc.RangeLeaseActiveDuration() + maxOffset).Nanoseconds()
}

// RaftElectionTimeoutTicks is exposed for testing.
func (s *Store) RaftElectionTimeoutTicks() int {
return s.cfg.RaftElectionTimeoutTicks
}

// CoalescedHeartbeatsInterval is exposed for testing.
func (s *Store) CoalescedHeartbeatsInterval() time.Duration {
return s.cfg.CoalescedHeartbeatsInterval
}

// NewStore returns a new instance of a store.
func NewStore(
ctx context.Context, cfg StoreConfig, eng storage.Engine, nodeDesc *roachpb.NodeDescriptor,
@@ -1632,12 +1642,6 @@ func (s *Store) startGossip() {
return pErr.GoError()
}

if s.cfg.TestingKnobs.DisablePeriodicGossips {
wakeReplica = func(context.Context, *Replica) error {
return errPeriodicGossipsDisabled
}
}

gossipFns := []struct {
key roachpb.Key
fn func(context.Context, *Replica) error
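The two new accessors let TestCluster-based tests derive their timing from the store configuration instead of reaching into a shared StoreConfig the way mtc tests did. A hypothetical fragment inside a test (GetFirstStoreFromServer comes from the testcluster changes below; the sleep multiplier is illustrative and the time package is assumed to be imported):

	store := tc.GetFirstStoreFromServer(t, 0)
	// Give coalesced heartbeats a chance to be flushed at least once before
	// asserting on raft state.
	time.Sleep(2 * store.CoalescedHeartbeatsInterval())
	t.Logf("election timeout is %d ticks", store.RaftElectionTimeoutTicks())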
7 changes: 0 additions & 7 deletions pkg/kv/kvserver/testing_knobs.go
@@ -89,11 +89,6 @@ type StoreTestingKnobs struct {
// should get rid of such practices once we make TestServer take a
// ManualClock.
DisableMaxOffsetCheck bool
// DontPreventUseOfOldLeaseOnStart disables the initialization of
// replica.mu.minLeaseProposedTS on replica.Init(). This has the effect of
// allowing the replica to use the lease that it had in a previous life (in
// case the tests persisted the engine used in said previous life).
DontPreventUseOfOldLeaseOnStart bool
// DisableAutomaticLeaseRenewal enables turning off the background worker
// that attempts to automatically renew expiration-based leases.
DisableAutomaticLeaseRenewal bool
@@ -131,8 +126,6 @@ type StoreTestingKnobs struct {
DisableConsistencyQueue bool
// DisableScanner disables the replica scanner.
DisableScanner bool
// DisablePeriodicGossips disables periodic gossiping.
DisablePeriodicGossips bool
// DisableLeaderFollowsLeaseholder disables attempts to transfer raft
// leadership when it diverges from the range's leaseholder.
DisableLeaderFollowsLeaseholder bool
8 changes: 8 additions & 0 deletions pkg/server/testserver.go
@@ -399,6 +399,14 @@ func (ts *TestServer) RaftTransport() *kvserver.RaftTransport {
return nil
}

// NodeDialer returns the NodeDialer used by the TestServer.
func (ts *TestServer) NodeDialer() *nodedialer.Dialer {
if ts != nil {
return ts.nodeDialer
}
return nil
}

// Start starts the TestServer by bootstrapping an in-memory store
// (defaults to maximum of 100M). The server is started, launching the
// node RPC server and all HTTP endpoints. Use the value of
1 change: 1 addition & 0 deletions pkg/testutils/testcluster/BUILD.bazel
@@ -26,6 +26,7 @@ go_library(
"//pkg/util/timeutil",
"//vendor/github.com/cockroachdb/errors",
"//vendor/github.com/cockroachdb/logtags",
"//vendor/go.etcd.io/etcd/raft/v3:raft",
],
)

99 changes: 91 additions & 8 deletions pkg/testutils/testcluster/testcluster.go
@@ -40,6 +40,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
"go.etcd.io/etcd/raft/v3"
)

// TestCluster represents a set of TestServers. The hope is that it can be used
Expand Down Expand Up @@ -123,6 +124,14 @@ func (tc *TestCluster) StopServer(idx int) {
}
}

// ServerStopped determines if a server has been explicitly
// stopped by StopServer(s).
func (tc *TestCluster) ServerStopped(idx int) bool {
tc.mu.Lock()
defer tc.mu.Unlock()
return tc.mu.serverStoppers[idx] == nil
}

// StartTestCluster creates and starts up a TestCluster made up of `nodes`
// in-memory testing servers.
// The cluster should be stopped using TestCluster.Stopper().Stop().
@@ -340,7 +349,11 @@ func checkServerArgsForCluster(
// cluster's ReplicationMode.
func (tc *TestCluster) AddAndStartServer(t testing.TB, serverArgs base.TestServerArgs) {
if serverArgs.JoinAddr == "" && len(tc.Servers) > 0 {
serverArgs.JoinAddr = tc.Servers[0].ServingRPCAddr()
serv := tc.getFirstLiveServer()
if serv == nil {
serv = tc.Servers[0]
}
serverArgs.JoinAddr = serv.ServingRPCAddr()
}
_, err := tc.AddServer(serverArgs)
if err != nil {
@@ -708,7 +721,7 @@ func (tc *TestCluster) RemoveNonVoters(
func (tc *TestCluster) TransferRangeLease(
rangeDesc roachpb.RangeDescriptor, dest roachpb.ReplicationTarget,
) error {
err := tc.Servers[0].DB().AdminTransferLease(context.TODO(),
err := tc.getFirstLiveServer().DB().AdminTransferLease(context.TODO(),
rangeDesc.StartKey.AsRawKey(), dest.StoreID)
if err != nil {
return errors.Wrapf(err, "%q: transfer lease unexpected error", rangeDesc.StartKey)
@@ -746,8 +759,8 @@ func (tc *TestCluster) FindRangeLease(
// Find the server indicated by the hint and send a LeaseInfoRequest through
// it.
var hintServer *server.TestServer
for _, s := range tc.Servers {
if s.GetNode().Descriptor.NodeID == hint.NodeID {
for i, s := range tc.Servers {
if s.GetNode().Descriptor.NodeID == hint.NodeID && !tc.ServerStopped(i) {
hintServer = s
break
}
@@ -991,10 +1004,14 @@ func (tc *TestCluster) ToggleReplicateQueues(active bool) {

// readIntFromStores reads the current integer value at the given key
// from all configured engines, filling in zeros when the value is not
// found.
// found. This method ignores all the explicitly stopped servers and will
// only return values for live ones.
func (tc *TestCluster) readIntFromStores(key roachpb.Key) []int64 {
results := make([]int64, 0, len(tc.Servers))
for _, server := range tc.Servers {
var results []int64
for i, server := range tc.Servers {
if tc.ServerStopped(i) {
continue
}
err := server.Stores().VisitStores(func(s *kvserver.Store) error {
val, _, err := storage.MVCCGet(context.Background(), s.Engine(), key,
server.Clock().Now(), storage.MVCCGetOptions{})
@@ -1020,7 +1037,8 @@ func (tc *TestCluster) readIntFromStores(key roachpb.Key) []int64 {

// WaitForValues waits up to the given duration for the integer values
// at the given key to match the expected slice (across all stores).
// Fails the test if they do not match.
// Fails the test if they do not match. This method ignores all the explicitly
// stopped servers and will only return values for live ones.
func (tc *TestCluster) WaitForValues(t testing.TB, key roachpb.Key, expected []int64) {
t.Helper()
testutils.SucceedsSoon(t, func() error {
@@ -1042,6 +1060,71 @@ func (tc *TestCluster) GetFirstStoreFromServer(t testing.TB, server int) *kvserver.Store {
return store
}

// getFirstLiveServer returns the first server in the list that has not been
// explicitly stopped. If all the servers are stopped, it returns nil.
func (tc *TestCluster) getFirstLiveServer() *server.TestServer {
for i := range tc.Servers {
if !tc.ServerStopped(i) {
return tc.Servers[i]
}
}
return nil
}

// GetRaftLeader returns the replica that is the current raft leader for the
// specified key.
func (tc *TestCluster) GetRaftLeader(t testing.TB, key roachpb.RKey) *kvserver.Replica {
t.Helper()
var raftLeaderRepl *kvserver.Replica
testutils.SucceedsSoon(t, func() error {
var latestTerm uint64
for i := range tc.Servers {
err := tc.Servers[i].Stores().VisitStores(func(store *kvserver.Store) error {
repl := store.LookupReplica(key)
if repl == nil {
// Replica does not exist on this store or there is no raft
// status yet.
return nil
}
raftStatus := repl.RaftStatus()
if raftStatus.Term > latestTerm || (raftLeaderRepl == nil && raftStatus.Term == latestTerm) {
// If we find any newer term, it means any previous election is
// invalid.
raftLeaderRepl = nil
latestTerm = raftStatus.Term
if raftStatus.RaftState == raft.StateLeader {
raftLeaderRepl = repl
}
}
return nil
})
if err != nil {
return err
}
}
if latestTerm == 0 || raftLeaderRepl == nil {
return errors.Errorf("could not find a raft leader for key %s", key)
}
return nil
})
return raftLeaderRepl
}

// Restart stops and then starts all the servers in the cluster.
func (tc *TestCluster) Restart(t testing.TB, serverArgsPerNode map[int]base.TestServerArgs) {
for i := range tc.Servers {
tc.StopServer(i)
}
for _, args := range serverArgsPerNode {
for _, specs := range args.StoreSpecs {
if specs.StickyInMemoryEngineID == "" {
t.Fatalf("Restart can only be used when the servers were started with a sticky engine")
}
}
tc.AddAndStartServer(t, args)
}
}

type testClusterFactoryImpl struct{}

// TestClusterFactory can be passed to serverutils.InitTestClusterFactory
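Taken together, the new helpers cover the restart-heavy tests that used to depend on mtc and on DontPreventUseOfOldLeaseOnStart: servers are started on sticky in-memory engines, the whole cluster is bounced with Restart, and the test then re-discovers the raft leader rather than assuming it stayed on the same node. A sketch of that flow, assuming the sticky engine registry exposed through server.TestingKnobs (the StickyEngineRegistry knob and StickyInMemoryEngineID store-spec field follow the API as of this commit; the 3-node setup, scratch key, and log line are illustrative):

	ctx := context.Background()
	stickyEngineRegistry := server.NewStickyInMemEnginesRegistry()
	defer stickyEngineRegistry.CloseAllStickyInMemEngines()

	serverArgs := make(map[int]base.TestServerArgs)
	for i := 0; i < 3; i++ {
		serverArgs[i] = base.TestServerArgs{
			StoreSpecs: []base.StoreSpec{
				{InMemory: true, StickyInMemoryEngineID: strconv.Itoa(i)},
			},
			Knobs: base.TestingKnobs{
				Server: &server.TestingKnobs{StickyEngineRegistry: stickyEngineRegistry},
			},
		}
	}

	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
		ReplicationMode:   base.ReplicationManual,
		ServerArgsPerNode: serverArgs,
	})
	defer tc.Stopper().Stop(ctx)

	key := tc.ScratchRange(t)

	// ... exercise the cluster, then bounce every node on its original data.
	tc.Restart(t, serverArgs)

	// After a restart the lease holder and raft leader may have moved, so the
	// test locates the leader again instead of assuming node 0 kept it.
	leaderRepl := tc.GetRaftLeader(t, roachpb.RKey(key))
	t.Logf("raft leader after restart is on store %d", leaderRepl.StoreID())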
