Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

kv: consolidate range lease validation #126182

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pkg/kv/kvserver/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,6 @@ go_library(
"//pkg/kv/kvserver/multiqueue",
"//pkg/kv/kvserver/raftentry",
"//pkg/kv/kvserver/raftlog",
"//pkg/kv/kvserver/raftutil",
"//pkg/kv/kvserver/rangefeed",
"//pkg/kv/kvserver/rditer",
"//pkg/kv/kvserver/readsummary",
Expand Down Expand Up @@ -418,6 +417,7 @@ go_test(
"//pkg/kv/kvserver/kvserverbase",
"//pkg/kv/kvserver/kvserverpb",
"//pkg/kv/kvserver/kvstorage",
"//pkg/kv/kvserver/leases",
"//pkg/kv/kvserver/liveness",
"//pkg/kv/kvserver/liveness/livenesspb",
"//pkg/kv/kvserver/load",
Expand Down
1 change: 1 addition & 0 deletions pkg/kv/kvserver/batcheval/cmd_lease_request.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ func RequestLease(

// If this check is removed at some point, the filtering of learners on the
// sending side would have to be removed as well.
// TODO(nvanbenschoten): move this into leases.Verify.
wasLastLeaseholder := prevLease.Replica.StoreID == newLease.Replica.StoreID
if err := roachpb.CheckCanReceiveLease(
newLease.Replica, cArgs.EvalCtx.Desc().Replicas(), wasLastLeaseholder,
Expand Down
1 change: 1 addition & 0 deletions pkg/kv/kvserver/batcheval/cmd_lease_transfer.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ func TransferLease(

// If this check is removed at some point, the filtering of learners on the
// sending side would have to be removed as well.
// TODO(nvanbenschoten): move this into leases.Verify.
if err := roachpb.CheckCanReceiveLease(
newLease.Replica, cArgs.EvalCtx.Desc().Replicas(), false, /* wasLastLeaseholder */
); err != nil {
Expand Down
3 changes: 2 additions & 1 deletion pkg/kv/kvserver/client_replica_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/lock"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/leases"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/protectedts/ptpb"
Expand Down Expand Up @@ -1850,7 +1851,7 @@ func TestLeaseExpirationBasedDrainTransferWithProscribed(t *testing.T) {
target := filterArgs.Req.Requests[0].GetTransferLease().Lease.Replica
if target == l.replica0Desc {
failedOnce.Do(func() { close(failedCh) })
return kvpb.NewError(kvserver.NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(
return kvpb.NewError(leases.NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(
target, raftutil.ReplicaStateProbe))
}
}
Expand Down
12 changes: 12 additions & 0 deletions pkg/kv/kvserver/leases/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ go_library(
name = "leases",
srcs = [
"build.go",
"errors.go",
"status.go",
"verify.go",
],
importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/leases",
visibility = ["//visibility:public"],
Expand All @@ -13,6 +15,8 @@ go_library(
"//pkg/kv/kvserver/kvserverpb",
"//pkg/kv/kvserver/liveness",
"//pkg/kv/kvserver/liveness/livenesspb",
"//pkg/kv/kvserver/raftutil",
"//pkg/raft",
"//pkg/roachpb",
"//pkg/util/hlc",
"//pkg/util/log",
Expand All @@ -26,14 +30,22 @@ go_test(
srcs = [
"build_test.go",
"status_test.go",
"verify_test.go",
],
embed = [":leases"],
deps = [
"//pkg/kv/kvpb",
"//pkg/kv/kvserver/kvserverpb",
"//pkg/kv/kvserver/liveness",
"//pkg/kv/kvserver/liveness/livenesspb",
"//pkg/kv/kvserver/raftutil",
"//pkg/raft",
"//pkg/raft/raftpb",
"//pkg/raft/tracker",
"//pkg/roachpb",
"//pkg/testutils/zerofields",
"//pkg/util/hlc",
"@com_github_cockroachdb_errors//:errors",
"@com_github_stretchr_testify//require",
],
)
101 changes: 91 additions & 10 deletions pkg/kv/kvserver/leases/build.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
package leases

import (
"context"
"time"

"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
"github.com/cockroachdb/cockroach/pkg/raft"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/errors"
Expand All @@ -25,11 +27,39 @@ import (
// Settings is the set of settings for the leasing subsystem, used when building
// a new lease object.
type Settings struct {
UseExpirationLeases bool
// UseExpirationLeases controls whether this range should be using an
// expiration-based lease.
UseExpirationLeases bool
// TransferExpirationLeases controls whether we transfer expiration-based
// leases that are later upgraded to epoch-based ones or whether we transfer
// epoch-based leases directly.
TransferExpirationLeases bool
ExpToEpochEquiv bool
MinExpirationSupported bool
RangeLeaseDuration time.Duration
// RejectLeaseOnLeaderUnknown controls whether a replica that does not know
// the current raft leader rejects a lease request.
RejectLeaseOnLeaderUnknown bool
// DisableAboveRaftLeaseTransferSafetyChecks, if set, disables the above-raft
// lease transfer safety checks (that verify that we don't transfer leases to
// followers that need a snapshot, etc). The proposal-time checks are not
// affected by this knob.
DisableAboveRaftLeaseTransferSafetyChecks bool
// AllowLeaseProposalWhenNotLeader, if set, allows lease request proposals
// even when the replica inserting that proposal is not the Raft leader. This
// can be used in tests to allow a replica to acquire a lease without first
// moving the Raft leadership to it (e.g. it allows tests to expire leases by
// stopping the old leaseholder's liveness heartbeats and then expect other
// replicas to take the lease without worrying about Raft).
AllowLeaseProposalWhenNotLeader bool
// ExpToEpochEquiv indicates whether an expiration-based lease can be
// considered equivalent to an epoch-based lease during a promotion from
// expiration-based to epoch-based. It is used for mixed-version
// compatibility.
ExpToEpochEquiv bool
// MinExpirationSupported indicates whether the cluster version supports the
// minimum expiration field in the lease object. It is used for mixed-version
// compatibility.
MinExpirationSupported bool
// RangeLeaseDuration specifies the range lease duration.
RangeLeaseDuration time.Duration
}

// NodeLiveness is a read-only interface to the node liveness subsystem.
Expand Down Expand Up @@ -60,9 +90,15 @@ type NodeLivenessManipulation struct {
type BuildInput struct {
// Information about the local replica.
LocalStoreID roachpb.StoreID
LocalReplicaID roachpb.ReplicaID
Desc *roachpb.RangeDescriptor
Now hlc.ClockTimestamp
MinLeaseProposedTS hlc.ClockTimestamp

// Information about raft.
RaftStatus *raft.Status
RaftFirstIndex kvpb.RaftIndex

// Information about the previous lease.
PrevLease roachpb.Lease
// PrevLeaseNodeLiveness is set iff PrevLease is an epoch-based lease.
Expand All @@ -72,6 +108,15 @@ type BuildInput struct {

// Information about the (requested) next lease.
NextLeaseHolder roachpb.ReplicaDescriptor

// When set to true, BypassSafetyChecks configures lease transfers to skip
// safety checks that ensure that the transfer target is known to be
// (according to the outgoing leaseholder) alive and sufficiently caught up on
// its log. This option should be used sparingly — typically only by outgoing
// leaseholders who both have some other reason to believe that the target is
// alive and caught up on its log (e.g. they just sent it a snapshot) and also
// can't tolerate rejected lease transfers.
BypassSafetyChecks bool
}

// PrevLocal returns whether the previous lease was held by the local store.
Expand Down Expand Up @@ -142,6 +187,9 @@ func (i BuildInput) validate() error {
return errors.AssertionFailedf("cannot acquire lease from another node "+
"before it has expired: %v", i.PrevLease)
}
if !i.Transfer() && i.BypassSafetyChecks {
return errors.AssertionFailedf("cannot bypass safety checks for lease acquisition/extension")
}
return nil
}

Expand All @@ -165,6 +213,24 @@ func (i BuildInput) validatePrevLeaseExpired() error {
return nil
}

// toVerifyInput converts the BuildInput to a VerifyInput, which is used to
// verify the safety of lease requests. This is a non-lossy conversion, so the
// resulting VerifyInput should be fully populated, which is verified in
// TestInputToVerifyInput.
func (i BuildInput) toVerifyInput() VerifyInput {
return VerifyInput{
LocalStoreID: i.LocalStoreID,
LocalReplicaID: i.LocalReplicaID,
Desc: i.Desc,
RaftStatus: i.RaftStatus,
RaftFirstIndex: i.RaftFirstIndex,
PrevLease: i.PrevLease,
PrevLeaseExpired: i.PrevLeaseExpired,
NextLeaseHolder: i.NextLeaseHolder,
BypassSafetyChecks: i.BypassSafetyChecks,
}
}

// Output is the set of outputs for the lease acquisition process.
type Output struct {
NextLease roachpb.Lease
Expand All @@ -180,16 +246,31 @@ func (o Output) validate(i BuildInput) error {
return nil
}

// Build constructs a new lease based on the input settings and parameters. The
// resulting output will contain the lease to be proposed and any necessary node
// liveness manipulations that must be performed before the lease can be
// requested.
func Build(st Settings, nl NodeLiveness, i BuildInput) (Output, error) {
// Validate the input.
// VerifyAndBuild checks the safety of a lease acquisition or transfer request.
// If the safety checks fail, it returns an error.
//
// If the safety checks pass, it constructs a new lease based on the input
// settings and parameters. The resulting output will contain the lease to be
// proposed and any necessary node liveness manipulations that must be performed
// before the lease can be requested.
func VerifyAndBuild(
ctx context.Context, st Settings, nl NodeLiveness, i BuildInput,
) (Output, error) {
if err := i.validate(); err != nil {
return Output{}, err
}
if i.Transfer() && !st.DisableAboveRaftLeaseTransferSafetyChecks {
// TODO(nvanbenschoten): support build-time verification for lease
// acquisition, not just lease transfers. This currently breaks various
// tests. See #118435.
if err := Verify(ctx, st, i.toVerifyInput()); err != nil {
return Output{}, err
}
}
return build(st, nl, i)
}

func build(st Settings, nl NodeLiveness, i BuildInput) (Output, error) {
// Construct the next lease.
nextLease := roachpb.Lease{
Replica: leaseReplica(i),
Expand Down
62 changes: 56 additions & 6 deletions pkg/kv/kvserver/leases/build_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,17 @@ import (

"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness/livenesspb"
"github.com/cockroachdb/cockroach/pkg/raft"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/testutils/zerofields"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/stretchr/testify/require"
)

var (
repl1 = roachpb.ReplicaDescriptor{NodeID: 1, StoreID: 1, ReplicaID: 1}
repl2 = roachpb.ReplicaDescriptor{NodeID: 2, StoreID: 2, ReplicaID: 2}
desc = roachpb.RangeDescriptor{InternalReplicas: []roachpb.ReplicaDescriptor{repl1, repl2}}
cts10 = hlc.ClockTimestamp{WallTime: 10}
cts20 = hlc.ClockTimestamp{WallTime: 20}
cts30 = hlc.ClockTimestamp{WallTime: 30}
Expand Down Expand Up @@ -111,6 +114,18 @@ func TestInputValidation(t *testing.T) {
},
expErr: "cannot acquire lease from another node before it has expired",
},
{
name: "bypass safety checks for acquisition",
input: BuildInput{
LocalStoreID: repl1.StoreID,
Now: cts20,
PrevLease: roachpb.Lease{Replica: repl2, Expiration: &ts10},
PrevLeaseExpired: true,
NextLeaseHolder: repl1,
BypassSafetyChecks: true,
},
expErr: "cannot bypass safety checks for lease acquisition/extension",
},
{
name: "valid acquisition",
input: BuildInput{
Expand Down Expand Up @@ -158,11 +173,12 @@ func TestInputValidation(t *testing.T) {

func defaultSettings() Settings {
return Settings{
UseExpirationLeases: false,
TransferExpirationLeases: true,
ExpToEpochEquiv: true,
MinExpirationSupported: true,
RangeLeaseDuration: 20,
UseExpirationLeases: false,
TransferExpirationLeases: true,
RejectLeaseOnLeaderUnknown: false,
ExpToEpochEquiv: true,
MinExpirationSupported: true,
RangeLeaseDuration: 20,
}
}

Expand Down Expand Up @@ -222,7 +238,7 @@ func TestBuild(t *testing.T) {
if nl == nil {
nl = defaultNodeLiveness()
}
out, err := Build(st, nl, tt.input)
out, err := build(st, nl, tt.input)
if tt.expErr == "" {
require.NoError(t, err)
require.Equal(t, tt.expOutput, out)
Expand Down Expand Up @@ -658,3 +674,37 @@ func TestBuild(t *testing.T) {
})
})
}

func TestInputToVerifyInput(t *testing.T) {
cts := hlc.ClockTimestamp{WallTime: 1, Logical: 1}
ts := cts.ToTimestamp()
noZeroBuildInput := BuildInput{
LocalStoreID: 1,
LocalReplicaID: 1,
Desc: &roachpb.RangeDescriptor{},
RaftStatus: &raft.Status{},
RaftFirstIndex: 1,
PrevLease: roachpb.Lease{
Start: cts,
Expiration: &ts,
Replica: roachpb.ReplicaDescriptor{
NodeID: 1, StoreID: 1, ReplicaID: 1, Type: 1,
},
DeprecatedStartStasis: &ts,
ProposedTS: cts,
Epoch: 1,
Sequence: 1,
AcquisitionType: 1,
MinExpiration: ts,
Term: 1,
},
PrevLeaseExpired: true,
NextLeaseHolder: roachpb.ReplicaDescriptor{
NodeID: 1, StoreID: 1, ReplicaID: 1, Type: 1,
},
BypassSafetyChecks: true,
}
verifyInput := noZeroBuildInput.toVerifyInput()
require.NoError(t, zerofields.NoZeroField(verifyInput),
"make sure you update BuildInput.toVerifyInput for the new field")
}
36 changes: 36 additions & 0 deletions pkg/kv/kvserver/leases/errors.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package leases

import (
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftutil"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/errors"
)

// ErrMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot indicates that the
// lease transfer failed because the current leaseholder could not prove that
// the lease transfer target did not need a Raft snapshot. In order to prove
// this, the current leaseholder must also be the Raft leader, which is
// periodically requested in maybeTransferRaftLeadershipToLeaseholderLocked.
var ErrMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot = errors.New(
"lease transfer rejected because the target may need a snapshot")

// NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError return an error
// indicating that a lease transfer failed because the current leaseholder could
// not prove that the lease transfer target did not need a Raft snapshot.
func NewLeaseTransferRejectedBecauseTargetMayNeedSnapshotError(
target roachpb.ReplicaDescriptor, snapStatus raftutil.ReplicaNeedsSnapshotStatus,
) error {
err := errors.Errorf("refusing to transfer lease to %d because target may need a Raft snapshot: %s",
target, snapStatus)
return errors.Mark(err, ErrMarkLeaseTransferRejectedBecauseTargetMayNeedSnapshot)
}
Loading
Loading