From 2c899155d0db2a7a775f5c2c90ce6819f147323b Mon Sep 17 00:00:00 2001 From: Arul Ajmani Date: Mon, 25 Nov 2024 20:57:01 -0500 Subject: [PATCH 1/6] kvserver: use leader leases in TestFlowControlCrashedNode This test sets a very large number for RaftElectionTimeoutTicks to keep leadership sticky. Naively, this prevents us from using leader leases in this test, as replicas won't campaign if they don't have StoreLiveness support, which isn't kicked off until the first tick. Conveniently, raft fortification is yet another way to keep leadership sticky -- so explicitly using leader leases should achieve the same goal. References https://github.com/cockroachdb/cockroach/issues/133763 Release note: None --- pkg/kv/kvserver/flow_control_integration_test.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pkg/kv/kvserver/flow_control_integration_test.go b/pkg/kv/kvserver/flow_control_integration_test.go index c4e63185e10c..ba7441e7e9a4 100644 --- a/pkg/kv/kvserver/flow_control_integration_test.go +++ b/pkg/kv/kvserver/flow_control_integration_test.go @@ -34,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" "github.com/cockroachdb/cockroach/pkg/util/admission" "github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/randutil" @@ -592,18 +593,16 @@ func TestFlowControlCrashedNode(t *testing.T) { // mechanism below, and for quiesced ranges, that can effectively disable // the last-updated mechanism since quiesced ranges aren't being ticked, and // we only check the last-updated state when ticked. So we disable range - // quiescence. - kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, true) - kvflowcontrol.Enabled.Override(ctx, &st.SV, true) - + // quiescence by turning on leader leases, regardless of any metamorphism. + kvserver.OverrideDefaultLeaseType(ctx, &st.SV, roachpb.LeaseLeader) + // Using a manual clock here ensures that StoreLiveness support, once + // established, never expires. By extension, leadership should stay sticky. + manualClock := hlc.NewHybridManualClock() tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{ ReplicationMode: base.ReplicationManual, ServerArgs: base.TestServerArgs{ Settings: st, RaftConfig: base.RaftConfig{ - // Suppress timeout-based elections. This test doesn't want to - // deal with leadership changing hands. - RaftElectionTimeoutTicks: 1000000, // Reduce the RangeLeaseDuration to speeds up failure detection // below. RangeLeaseDuration: time.Second, @@ -628,6 +627,9 @@ func TestFlowControlCrashedNode(t *testing.T) { return true }, }, + Server: &server.TestingKnobs{ + WallClock: manualClock, + }, }, }, }) From 37956d53dec8217d28014e17ae8917d1eea8f17e Mon Sep 17 00:00:00 2001 From: Arul Ajmani Date: Mon, 25 Nov 2024 21:00:46 -0500 Subject: [PATCH 2/6] kvserver: use leader leases in TestFlowControlCrashedNode2 This test sets a very large number for RaftElectionTimeoutTicks to keep leadership sticky. Naively, this prevents us from using leader leases in this test, as replicas won't campaign if they don't have StoreLiveness support, which isn't kicked off until the first tick. Conveniently, raft fortification is yet another way to keep leadership sticky -- so explicitly using leader leases should achieve the same goal. References https://github.com/cockroachdb/cockroach/issues/133763 Release note: None --- pkg/kv/kvserver/flow_control_integration_test.go | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pkg/kv/kvserver/flow_control_integration_test.go b/pkg/kv/kvserver/flow_control_integration_test.go index ba7441e7e9a4..7b197b7460e8 100644 --- a/pkg/kv/kvserver/flow_control_integration_test.go +++ b/pkg/kv/kvserver/flow_control_integration_test.go @@ -2695,15 +2695,19 @@ func TestFlowControlCrashedNodeV2(t *testing.T) { }, func(t *testing.T, mode kvflowcontrol.ModeT) { ctx := context.Background() settings := cluster.MakeTestingClusterSettings() - kvserver.ExpirationLeasesOnly.Override(ctx, &settings.SV, true) + // This test doesn't want leadership changing hands, and leader leases (by + // virtue of raft fortification) help ensure this. Override to disable any + // metamorphosis. + kvserver.OverrideDefaultLeaseType(ctx, &settings.SV, roachpb.LeaseLeader) + // Using a manual clock here ensures that StoreLiveness support, once + // established, never expires. By extension, leadership should stay + // sticky. + manualClock := hlc.NewHybridManualClock() tc := testcluster.StartTestCluster(t, 2, base.TestClusterArgs{ ReplicationMode: base.ReplicationManual, ServerArgs: base.TestServerArgs{ Settings: settings, RaftConfig: base.RaftConfig{ - // Suppress timeout-based elections. This test doesn't want to - // deal with leadership changing hands. - RaftElectionTimeoutTicks: 1000000, // Reduce the RangeLeaseDuration to speeds up failure detection // below. RangeLeaseDuration: time.Second, @@ -2723,6 +2727,9 @@ func TestFlowControlCrashedNodeV2(t *testing.T) { return true }, }, + Server: &server.TestingKnobs{ + WallClock: manualClock, + }, }, }, }) From 0f1d0a0d5342522a2054d33d52b1031b0fd484da Mon Sep 17 00:00:00 2001 From: Arul Ajmani Date: Mon, 25 Nov 2024 21:05:11 -0500 Subject: [PATCH 3/6] kvserver: use leader leases for TestFlowControlRaftSnapshotV2 This test sets a very large number for RaftElectionTimeoutTicks to keep leadership sticky. Naively, this prevents us from using leader leases in this test, as replicas won't campaign if they don't have StoreLiveness support, which isn't kicked off until the first tick. Conveniently, raft fortification is yet another way to keep leadership sticky -- so explicitly using leader leases should achieve the same goal. References https://github.com/cockroachdb/cockroach/issues/133763 Release note: None --- pkg/kv/kvserver/flow_control_integration_test.go | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pkg/kv/kvserver/flow_control_integration_test.go b/pkg/kv/kvserver/flow_control_integration_test.go index 7b197b7460e8..db4e768b0321 100644 --- a/pkg/kv/kvserver/flow_control_integration_test.go +++ b/pkg/kv/kvserver/flow_control_integration_test.go @@ -2817,6 +2817,14 @@ func TestFlowControlRaftSnapshotV2(t *testing.T) { bypassReplicaUnreachable.Store(false) ctx := context.Background() settings := cluster.MakeTestingClusterSettings() + // This test doesn't want leadership changing hands, and leader leases (by + // virtue of raft fortification) help ensure this. Override to disable any + // metamorphosis. + kvserver.OverrideDefaultLeaseType(ctx, &settings.SV, roachpb.LeaseLeader) + // Using a manual clock here ensures that StoreLiveness support, once + // established, never expires. By extension, leadership should stay + // sticky. + manualClock := hlc.NewHybridManualClock() for i := 0; i < numServers; i++ { stickyServerArgs[i] = base.TestServerArgs{ Settings: settings, @@ -2826,14 +2834,10 @@ func TestFlowControlRaftSnapshotV2(t *testing.T) { StickyVFSID: strconv.FormatInt(int64(i), 10), }, }, - RaftConfig: base.RaftConfig{ - // Suppress timeout-based elections. This test doesn't want to - // deal with leadership changing hands. - RaftElectionTimeoutTicks: 1000000, - }, Knobs: base.TestingKnobs{ Server: &server.TestingKnobs{ StickyVFSRegistry: fs.NewStickyRegistry(), + WallClock: manualClock, }, Store: &kvserver.StoreTestingKnobs{ RaftReportUnreachableBypass: func(_ roachpb.ReplicaID) bool { From 9decbb617f4ebe55475ca532cccb0d4a8579fa1f Mon Sep 17 00:00:00 2001 From: Arul Ajmani Date: Mon, 25 Nov 2024 21:10:13 -0500 Subject: [PATCH 4/6] kvserver: use leader leases for TestFlowControlRaftMembershipV2 This test sets a very large number for RaftElectionTimeoutTicks to keep leadership sticky. Naively, this prevents us from using leader leases in this test, as replicas won't campaign if they don't have StoreLiveness support, which isn't kicked off until the first tick. Conveniently, raft fortification is yet another way to keep leadership sticky -- so explicitly using leader leases should achieve the same goal. References https://github.com/cockroachdb/cockroach/issues/133763 Release note: None --- pkg/kv/kvserver/flow_control_integration_test.go | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pkg/kv/kvserver/flow_control_integration_test.go b/pkg/kv/kvserver/flow_control_integration_test.go index db4e768b0321..419dd413d577 100644 --- a/pkg/kv/kvserver/flow_control_integration_test.go +++ b/pkg/kv/kvserver/flow_control_integration_test.go @@ -3087,15 +3087,18 @@ func TestFlowControlRaftMembershipV2(t *testing.T) { settings := cluster.MakeTestingClusterSettings() var disableWorkQueueGranting atomic.Bool disableWorkQueueGranting.Store(true) + // This test doesn't want leadership changing hands, and leader leases (by + // virtue of raft fortification) help ensure this. Override to disable any + // metamorphosis. + kvserver.OverrideDefaultLeaseType(ctx, &settings.SV, roachpb.LeaseLeader) + // Using a manual clock here ensures that StoreLiveness support, once + // established, never expires. By extension, leadership should stay + // sticky. + manualClock := hlc.NewHybridManualClock() tc := testcluster.StartTestCluster(t, 5, base.TestClusterArgs{ ReplicationMode: base.ReplicationManual, ServerArgs: base.TestServerArgs{ Settings: settings, - RaftConfig: base.RaftConfig{ - // Suppress timeout-based elections. This test doesn't want to deal - // with leadership changing hands unless intentional. - RaftElectionTimeoutTicks: 1000000, - }, Knobs: base.TestingKnobs{ Store: &kvserver.StoreTestingKnobs{ FlowControlTestingKnobs: &kvflowcontrol.TestingKnobs{ @@ -3111,6 +3114,9 @@ func TestFlowControlRaftMembershipV2(t *testing.T) { return disableWorkQueueGranting.Load() }, }, + Server: &server.TestingKnobs{ + WallClock: manualClock, + }, }, }, }) From 3013817e2d47dc3dfb9da776b739765ad304127c Mon Sep 17 00:00:00 2001 From: Arul Ajmani Date: Mon, 25 Nov 2024 21:13:02 -0500 Subject: [PATCH 5/6] kvserver: use epoch based leases in tests that want quiesced ranges We'll soon enable leader leases metamorphically. There's a few tests that want to always use a lease type that allows for quiescence -- so instead of disabling expiration based leases, they should instead explicitly set its lease type to be epoch based. The tests are: - TestFlowControlUnquiescedRangeV2 - TestFlowControlUnquiescedRange - TestFlowControlQuiescedRange References https://github.com/cockroachdb/cockroach/issues/133763 Release note: None --- pkg/kv/kvserver/flow_control_integration_test.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/kv/kvserver/flow_control_integration_test.go b/pkg/kv/kvserver/flow_control_integration_test.go index 419dd413d577..0afd6cfd1522 100644 --- a/pkg/kv/kvserver/flow_control_integration_test.go +++ b/pkg/kv/kvserver/flow_control_integration_test.go @@ -1596,7 +1596,7 @@ func TestFlowControlQuiescedRange(t *testing.T) { disableFallbackTokenDispatch.Store(true) st := cluster.MakeTestingClusterSettings() - kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, false) // override metamorphism + kvserver.OverrideDefaultLeaseType(ctx, &st.SV, roachpb.LeaseEpoch) // override metamorphism kvflowcontrol.Enabled.Override(ctx, &st.SV, true) tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{ @@ -1736,7 +1736,7 @@ func TestFlowControlUnquiescedRange(t *testing.T) { disablePiggybackTokenDispatch.Store(true) st := cluster.MakeTestingClusterSettings() - kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, false) // override metamorphism + kvserver.OverrideDefaultLeaseType(ctx, &st.SV, roachpb.LeaseEpoch) // override metamorphism kvflowcontrol.Enabled.Override(ctx, &st.SV, true) tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{ @@ -3472,7 +3472,7 @@ func TestFlowControlUnquiescedRangeV2(t *testing.T) { settings := cluster.MakeTestingClusterSettings() // Override metamorphism to allow range quiescence. - kvserver.ExpirationLeasesOnly.Override(ctx, &settings.SV, false) + kvserver.OverrideDefaultLeaseType(ctx, &settings.SV, roachpb.LeaseEpoch) tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ ReplicationMode: base.ReplicationManual, ServerArgs: base.TestServerArgs{ From 4d9dbf38c249dbf92eaeb257bd0557c8a26e79c0 Mon Sep 17 00:00:00 2001 From: Arul Ajmani Date: Mon, 25 Nov 2024 21:19:53 -0500 Subject: [PATCH 6/6] kvserver: use leader leases in some flow control tests Much like the previous commits, a bunch of other tests were setting a very large number for RaftElectionTimeoutTicks to keep leadership sticky. Naively, this prevents us from using leader leases in these tests, as replicas won't campaign unless they have StoreLiveness support, which they won't until the first tick kicks off a request. Conveniently, raft fortification is yet another way to keep leadership sticky -- so explicitly using leader leases should achieve the same goal. The tests modified here are: - TestFlowControlSendQueue - TestFlowControlRepeatedlySwitchMode - TestFlowControlRaftSnapshot - TestFlowControlRaftTransportBreak - TestFlowControlAdmissionPostSplitMergeV2 - TestFlowControlV1ToV2Transition References https://github.com/cockroachdb/cockroach/issues/133763 Release note: None --- .../kvserver/flow_control_integration_test.go | 90 ++++++++++++------- 1 file changed, 57 insertions(+), 33 deletions(-) diff --git a/pkg/kv/kvserver/flow_control_integration_test.go b/pkg/kv/kvserver/flow_control_integration_test.go index 0afd6cfd1522..6dd659f27203 100644 --- a/pkg/kv/kvserver/flow_control_integration_test.go +++ b/pkg/kv/kvserver/flow_control_integration_test.go @@ -714,7 +714,13 @@ func TestFlowControlRaftSnapshot(t *testing.T) { st := cluster.MakeTestingClusterSettings() kvflowcontrol.Enabled.Override(ctx, &st.SV, true) kvflowcontrol.Mode.Override(ctx, &st.SV, kvflowcontrol.ApplyToAll) - + // This test doesn't want leadership changing hands, and leader leases (by + // virtue of raft fortification) help ensure this. Override to disable any + // metamorphosis. + kvserver.OverrideDefaultLeaseType(ctx, &st.SV, roachpb.LeaseLeader) + // Using a manual clock here ensures that StoreLiveness support, once + // established, never expires. By extension, leadership should stay sticky. + manualClock := hlc.NewHybridManualClock() for i := 0; i < numServers; i++ { stickyServerArgs[i] = base.TestServerArgs{ Settings: st, @@ -724,14 +730,10 @@ func TestFlowControlRaftSnapshot(t *testing.T) { StickyVFSID: strconv.FormatInt(int64(i), 10), }, }, - RaftConfig: base.RaftConfig{ - // Suppress timeout-based elections. This test doesn't want to - // deal with leadership changing hands. - RaftElectionTimeoutTicks: 1000000, - }, Knobs: base.TestingKnobs{ Server: &server.TestingKnobs{ StickyVFSRegistry: fs.NewStickyRegistry(), + WallClock: manualClock, }, Store: &kvserver.StoreTestingKnobs{ FlowControlTestingKnobs: &kvflowcontrol.TestingKnobs{ @@ -1005,17 +1007,21 @@ func TestFlowControlRaftTransportBreak(t *testing.T) { st := cluster.MakeTestingClusterSettings() kvflowcontrol.Enabled.Override(ctx, &st.SV, true) - + // This test doesn't want leadership changing hands, and leader leases (by + // virtue of raft fortification) help ensure this. Override to disable any + // metamorphosis. + kvserver.OverrideDefaultLeaseType(ctx, &st.SV, roachpb.LeaseLeader) + // Using a manual clock here ensures that StoreLiveness support, once + // established, never expires. By extension, leadership should stay sticky. + manualClock := hlc.NewHybridManualClock() tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{ ReplicationMode: base.ReplicationManual, ServerArgs: base.TestServerArgs{ Settings: st, - RaftConfig: base.RaftConfig{ - // Suppress timeout-based elections. This test doesn't want to - // deal with leadership changing hands. - RaftElectionTimeoutTicks: 1000000, - }, Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + WallClock: manualClock, + }, Store: &kvserver.StoreTestingKnobs{ FlowControlTestingKnobs: &kvflowcontrol.TestingKnobs{ UseOnlyForScratchRanges: true, @@ -2543,16 +2549,21 @@ func TestFlowControlAdmissionPostSplitMergeV2(t *testing.T) { var disableWorkQueueGranting atomic.Bool disableWorkQueueGranting.Store(true) settings := cluster.MakeTestingClusterSettings() + // This test doesn't want leadership changing hands, and leader leases (by + // virtue of raft fortification) help ensure this. Override to disable any + // metamorphosis. + kvserver.OverrideDefaultLeaseType(ctx, &settings.SV, roachpb.LeaseLeader) + // Using a manual clock here ensures that StoreLiveness support, once + // established, never expires. By extension, leadership should stay + // sticky. + manualClock := hlc.NewHybridManualClock() tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ ReplicationMode: base.ReplicationManual, ServerArgs: base.TestServerArgs{ - Settings: settings, - RaftConfig: base.RaftConfig{ - // Suppress timeout-based elections. This test doesn't want to - // deal with leadership changing hands. - RaftElectionTimeoutTicks: 1000000, - }, Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + WallClock: manualClock, + }, Store: &kvserver.StoreTestingKnobs{ FlowControlTestingKnobs: &kvflowcontrol.TestingKnobs{ UseOnlyForScratchRanges: true, @@ -3930,18 +3941,23 @@ func TestFlowControlV1ToV2Transition(t *testing.T) { settings := cluster.MakeTestingClusterSettings() argsPerServer := make(map[int]base.TestServerArgs) + // This test doesn't want leadership changing hands, and leader leases (by + // virtue of raft fortification) help ensure this. Override to disable any + // metamorphosis. + kvserver.OverrideDefaultLeaseType(ctx, &settings.SV, roachpb.LeaseLeader) + // Using a manual clock here ensures that StoreLiveness support, once + // established, never expires. By extension, leadership should stay sticky. + manualClock := hlc.NewHybridManualClock() for i := range serverLevels { // Every node starts off using the v1 protocol but we will ratchet up the // levels on servers at different times as we go to test the transition. serverLevels[i].Store(kvflowcontrol.V2NotEnabledWhenLeader) argsPerServer[i] = base.TestServerArgs{ Settings: settings, - RaftConfig: base.RaftConfig{ - // Suppress timeout-based elections. This test doesn't want to deal - // with leadership changing hands unintentionally. - RaftElectionTimeoutTicks: 1000000, - }, Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + WallClock: manualClock, + }, Store: &kvserver.StoreTestingKnobs{ FlowControlTestingKnobs: &kvflowcontrol.TestingKnobs{ UseOnlyForScratchRanges: true, @@ -4569,6 +4585,13 @@ func TestFlowControlSendQueue(t *testing.T) { kvflowcontrol.ElasticTokensPerStream.Override(ctx, &settings.SV, 2<<20) kvflowcontrol.RegularTokensPerStream.Override(ctx, &settings.SV, 4<<20) + // This test doesn't want leadership changing hands, and leader leases (by + // virtue of raft fortification) help ensure this. Override to disable any + // metamorphosis. + kvserver.OverrideDefaultLeaseType(ctx, &settings.SV, roachpb.LeaseLeader) + // Using a manual clock here ensures that StoreLiveness support, once + // established, never expires. By extension, leadership should stay sticky. + manualClock := hlc.NewHybridManualClock() stickyArgsPerServer := make(map[int]base.TestServerArgs) for i := range disableWorkQueueGrantingServers { // Start with admission (logical token return) disabled across all nodes. @@ -4581,14 +4604,10 @@ func TestFlowControlSendQueue(t *testing.T) { StickyVFSID: strconv.FormatInt(int64(i), 10), }, }, - RaftConfig: base.RaftConfig{ - // Suppress timeout-based elections. This test doesn't want to deal - // with leadership changing hands unintentionally. - RaftElectionTimeoutTicks: 1000000, - }, Knobs: base.TestingKnobs{ Server: &server.TestingKnobs{ StickyVFSRegistry: fs.NewStickyRegistry(), + WallClock: manualClock, }, Store: &kvserver.StoreTestingKnobs{ FlowControlTestingKnobs: &kvflowcontrol.TestingKnobs{ @@ -4942,15 +4961,17 @@ func TestFlowControlRepeatedlySwitchMode(t *testing.T) { ctx := context.Background() settings := cluster.MakeTestingClusterSettings() + // This test doesn't want leadership changing hands, and leader leases (by + // virtue of raft fortification) help ensure this. Override to disable any + // metamorphosis. + kvserver.OverrideDefaultLeaseType(ctx, &settings.SV, roachpb.LeaseLeader) + // Using a manual clock here ensures that StoreLiveness support, once + // established, never expires. By extension, leadership should stay sticky. + manualClock := hlc.NewHybridManualClock() tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{ ReplicationMode: base.ReplicationManual, ServerArgs: base.TestServerArgs{ Settings: settings, - RaftConfig: base.RaftConfig{ - // Suppress timeout-based elections. This test doesn't want to - // deal with leadership changing hands. - RaftElectionTimeoutTicks: 1000000, - }, Knobs: base.TestingKnobs{ Store: &kvserver.StoreTestingKnobs{ FlowControlTestingKnobs: &kvflowcontrol.TestingKnobs{ @@ -4962,6 +4983,9 @@ func TestFlowControlRepeatedlySwitchMode(t *testing.T) { }, }, }, + Server: &server.TestingKnobs{ + WallClock: manualClock, + }, }, }, })