Skip to content

Commit

Permalink
Merge #133353
Browse files Browse the repository at this point in the history
133353: kvserver: metamorphically enable leader leases r=nvanbenschoten a=arulajmani

Rebased version of #131623 by `@nvanbenschoten.` 

Part of #123847.

See individual commits.

Release note: None

Co-authored-by: Arul Ajmani <[email protected]>
Co-authored-by: Nathan VanBenschoten <[email protected]>
  • Loading branch information
3 people committed Dec 2, 2024
2 parents c14af3c + 3904daa commit 4f654c7
Show file tree
Hide file tree
Showing 15 changed files with 77 additions and 13 deletions.
1 change: 1 addition & 0 deletions pkg/cli/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ go_library(
"//pkg/storage",
"//pkg/storage/enginepb",
"//pkg/storage/fs",
"//pkg/testutils",
"//pkg/testutils/bazelcodecover",
"//pkg/testutils/serverutils",
"//pkg/ts",
Expand Down
11 changes: 9 additions & 2 deletions pkg/cli/debug_recover_loss_of_quorum_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/server/serverpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage/fs"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/listenerutil"
Expand Down Expand Up @@ -182,13 +183,18 @@ func TestLossOfQuorumRecovery(t *testing.T) {
// would not be able to progress, but we will apply recovery procedure and
// mark on replicas on node 1 as designated survivors. After that, starting
// single node should succeed.
st := cluster.MakeTestingClusterSettings()
// We currently don't clear out the LeadEpoch field when recovering from a
// loss of quorum, so we can't run with leader leases on in this test.
kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV)
tcBefore := testcluster.NewTestCluster(t, 3, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{
// This logic is specific to the storage layer.
DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
Settings: st,
},
ServerArgsPerNode: map[int]base.TestServerArgs{
0: {StoreSpecs: []base.StoreSpec{{Path: dir + "/store-1"}}},
0: {Settings: st, StoreSpecs: []base.StoreSpec{{Path: dir + "/store-1"}}},
},
})
tcBefore.Start(t)
Expand Down Expand Up @@ -255,10 +261,11 @@ func TestLossOfQuorumRecovery(t *testing.T) {
ServerArgs: base.TestServerArgs{
// This logic is specific to the storage layer.
DefaultTestTenant: base.TestIsSpecificToStorageLayerAndNeedsASystemTenant,
Settings: st,
},
ReplicationMode: base.ReplicationManual,
ServerArgsPerNode: map[int]base.TestServerArgs{
0: {StoreSpecs: []base.StoreSpec{{Path: dir + "/store-1"}}},
0: {Settings: st, StoreSpecs: []base.StoreSpec{{Path: dir + "/store-1"}}},
},
})
// NB: If recovery is not performed, new cluster will just hang on startup.
Expand Down
10 changes: 10 additions & 0 deletions pkg/cli/testutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/security/username"
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
Expand Down Expand Up @@ -182,6 +183,15 @@ func newCLITestWithArgs(params TestCLIParams, argsFn func(args *base.TestServerA

log.Infof(context.Background(), "server started at %s", c.Server.AdvRPCAddr())
log.Infof(context.Background(), "SQL listener at %s", c.Server.AdvSQLAddr())

// When run under leader leases, requests will not heartbeat NodeLiveness on
// the lease acquisition codepath. This may then cause CLI commands
// (such as status or ls) which require a NodeLiveness record to fail. Explicitly
// heartbeat the NodeLiveness record to prevent tests from flaking.
err = testutils.SucceedsSoonError(c.Server.HeartbeatNodeLiveness)
if err != nil {
log.Fatalf(context.Background(), "Couldn't heartbeat node liveness: %s", err)
}
}

if params.TenantArgs != nil && params.SharedProcessTenantArgs != nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/kv/kvclient/kvcoord/dist_sender_ambiguous_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,7 @@ func TestTransactionUnexpectedlyCommitted(t *testing.T) {

ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV)

// Disable closed timestamps for control over when transaction gets bumped.
closedts.TargetDuration.Override(ctx, &st.SV, 1*time.Hour)
Expand Down
4 changes: 4 additions & 0 deletions pkg/kv/kvserver/client_lease_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -916,9 +916,13 @@ func TestLeaseholderRelocate(t *testing.T) {
locality("us"),
}

// TODO(arul): figure out why this test is flaky under leader leases.
st := cluster.MakeTestingClusterSettings()
kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV)
const numNodes = 4
for i := 0; i < numNodes; i++ {
serverArgs[i] = base.TestServerArgs{
Settings: st,
Locality: localities[i],
Knobs: base.TestingKnobs{
Server: &server.TestingKnobs{
Expand Down
7 changes: 5 additions & 2 deletions pkg/kv/kvserver/client_merge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -492,8 +492,8 @@ func mergeCheckingTimestampCaches(
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
// This test explicitly sets up a leader/leaseholder partition, which doesn't
// work with expiration leases (the lease expires).
kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, false) // override metamorphism
// work with expiration leases or leader leases (the lease expires).
kvserver.OverrideDefaultLeaseType(ctx, &st.SV, roachpb.LeaseEpoch)
tc := testcluster.StartTestCluster(t, 3,
base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
Expand Down Expand Up @@ -2995,9 +2995,12 @@ func TestStoreRangeMergeAbandonedFollowersAutomaticallyGarbageCollected(t *testi
defer log.Scope(t).Close(t)

ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV)
tc := testcluster.StartTestCluster(t, 3,
base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgs: base.TestServerArgs{},
})
defer tc.Stopper().Stop(ctx)
scratch := tc.ScratchRange(t)
Expand Down
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/client_raft_epoch_leases_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -752,7 +752,7 @@ func TestRaftPreVoteEpochLeases(t *testing.T) {
// transfers, to avoid range writes.
st := cluster.MakeTestingClusterSettings()
kvserver.TransferExpirationLeasesFirstEnabled.Override(ctx, &st.SV, false)
kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, false) // disable metamorphism
kvserver.OverrideDefaultLeaseType(ctx, &st.SV, roachpb.LeaseEpoch) // disable metamorphism

tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
Expand Down
4 changes: 4 additions & 0 deletions pkg/kv/kvserver/client_raft_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2665,6 +2665,7 @@ func TestWedgedReplicaDetection(t *testing.T) {
base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgs: base.TestServerArgs{
Settings: settings,
RaftConfig: raftConfig,
Knobs: base.TestingKnobs{
Server: &server.TestingKnobs{
Expand Down Expand Up @@ -5722,6 +5723,8 @@ func TestElectionAfterRestart(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV)

// We use a single node to avoid rare flakes due to dueling elections.
// The code is set up to support multiple nodes, though the test will
Expand All @@ -5743,6 +5746,7 @@ func TestElectionAfterRestart(t *testing.T) {
ReplicationMode: replMode,
ParallelStart: parallel,
ServerArgs: base.TestServerArgs{
Settings: st,
RaftConfig: base.RaftConfig{
RaftElectionTimeoutTicks: electionTimeoutTicks,
RaftTickInterval: raftTickInterval,
Expand Down
1 change: 1 addition & 0 deletions pkg/kv/kvserver/client_store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ func TestStoreLoadReplicaQuiescent(t *testing.T) {

ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV)
kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, expOnly)

tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{
Expand Down
5 changes: 5 additions & 0 deletions pkg/kv/kvserver/loqrecovery/collect_raft_log_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/loqrecovery/loqrecoverypb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
Expand Down Expand Up @@ -160,8 +161,11 @@ func checkRaftLog(
RaftLogTruncationThreshold: math.MaxInt64,
}

st := cluster.MakeTestingClusterSettings()
kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV)
tc := testcluster.NewTestCluster(t, 2, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{
Settings: st,
Knobs: base.TestingKnobs{
Store: &kvserver.StoreTestingKnobs{
DisableGCQueue: true,
Expand All @@ -183,6 +187,7 @@ func checkRaftLog(
StoreSpecs: []base.StoreSpec{{InMemory: true}},
RaftConfig: testRaftConfig,
Insecure: true,
Settings: st,
},
},
})
Expand Down
5 changes: 3 additions & 2 deletions pkg/kv/kvserver/replica_range_lease.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ func OverrideDefaultLeaseType(ctx context.Context, sv *settings.Values, typ roac
switch typ {
case roachpb.LeaseExpiration:
ExpirationLeasesOnly.Override(ctx, sv, true)
RaftLeaderFortificationFractionEnabled.Override(ctx, sv, 0.0)
case roachpb.LeaseEpoch:
ExpirationLeasesOnly.Override(ctx, sv, false)
RaftLeaderFortificationFractionEnabled.Override(ctx, sv, 0.0)
Expand All @@ -179,9 +180,9 @@ func OverrideDefaultLeaseType(ctx context.Context, sv *settings.Values, typ roac

// OverrideLeaderLeaseMetamorphism overrides the default lease type to be
// epoch based leases, regardless of whether leader leases were metamorphically
// enabled or not. Any
// enabled or not.
func OverrideLeaderLeaseMetamorphism(ctx context.Context, sv *settings.Values) {
OverrideDefaultLeaseType(ctx, sv, roachpb.LeaseEpoch)
RaftLeaderFortificationFractionEnabled.Override(ctx, sv, 0.0)
}

// leaseRequestHandle is a handle to an asynchronous lease request.
Expand Down
2 changes: 2 additions & 0 deletions pkg/kv/kvserver/replica_rangefeed_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1467,6 +1467,8 @@ func TestRangefeedCheckpointsRecoverFromLeaseExpiration(t *testing.T) {

st := cluster.MakeTestingClusterSettings()
kvserver.ExpirationLeasesOnly.Override(ctx, &st.SV, false) // override metamorphism
// TODO(arul): Dig into why this isn't passing.
kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV)

var storeLivenessHeartbeatsOff atomic.Value
cargs := aggressiveResolvedTimestampManuallyReplicatedClusterArgs
Expand Down
8 changes: 4 additions & 4 deletions pkg/kv/kvserver/replica_store_liveness.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ import (
"github.com/cockroachdb/cockroach/pkg/raft/raftstoreliveness"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/util/envutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/metamorphic"
)

// RaftLeaderFortificationFractionEnabled controls the fraction of ranges for
Expand All @@ -33,9 +33,9 @@ var RaftLeaderFortificationFractionEnabled = settings.RegisterFloatSetting(
"by extension, use Leader leases for all ranges which do not require "+
"expiration-based leases. Set to a value between 0.0 and 1.0 to gradually "+
"roll out Leader leases across the ranges in a cluster.",
// TODO(nvanbenschoten): make this a metamorphic constant once raft leader
// fortification and leader leases are sufficiently stable.
envutil.EnvOrDefaultFloat64("COCKROACH_LEADER_FORTIFICATION_FRACTION_ENABLED", 0.0),
metamorphic.ConstantWithTestChoice("kv.raft.leader_fortification.fraction_enabled",
0.0, /* defaultValue */
1.0 /* otherValues */),
settings.FloatInRange(0.0, 1.0),
settings.WithPublic,
)
Expand Down
18 changes: 17 additions & 1 deletion pkg/server/storage_api/health_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,16 @@ func TestHealthAPI(t *testing.T) {
return srvtestutils.GetAdminJSONProto(ts, "health", &resp)
})

// Make the SQL listener appear unavailable. Verify that health fails after that.
// Before we start the test, ensure that the health check succeeds. This is
// contingent on the Server heartbeating its liveness record, which can race
// with this check here.
testutils.SucceedsSoon(t, func() error {
var resp serverpb.HealthResponse
return srvtestutils.GetAdminJSONProto(ts, "health?ready=1", &resp)
})

// Make the SQL listener appear unavailable. Verify that health fails after
// that.
ts.SetReady(false)
var resp serverpb.HealthResponse
err := srvtestutils.GetAdminJSONProto(ts, "health?ready=1", &resp)
Expand All @@ -71,6 +80,13 @@ func TestHealthAPI(t *testing.T) {
t.Fatal(err)
}

// Before we start the test, ensure that the health check succeeds. This is
// contingent on the Server heartbeating its liveness record, and we want
// that to happen before we pause heartbeats.
testutils.SucceedsSoon(t, func() error {
return srvtestutils.GetAdminJSONProto(s, "health?ready=1", &resp)
})

// Expire this node's liveness record by pausing heartbeats and advancing the
// server's clock.
nl := s.NodeLiveness().(*liveness.NodeLiveness)
Expand Down
11 changes: 10 additions & 1 deletion pkg/sql/show_ranges_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ import (
"testing"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/sqltestutils"
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
Expand Down Expand Up @@ -244,10 +246,17 @@ func TestShowRangesUnavailableReplicas(t *testing.T) {

const numNodes = 3
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
kvserver.OverrideLeaderLeaseMetamorphism(ctx, &st.SV)
tc := testcluster.StartTestCluster(
// Manual replication will prevent the leaseholder for the unavailable range
// from moving a different node.
t, numNodes, base.TestClusterArgs{ReplicationMode: base.ReplicationManual},
t, numNodes, base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgs: base.TestServerArgs{
Settings: st,
},
},
)
defer tc.Stopper().Stop(ctx)

Expand Down

0 comments on commit 4f654c7

Please sign in to comment.