diff --git a/WORKSPACE b/WORKSPACE index 21af5efc9c91..1d740d4e9b33 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -158,15 +158,15 @@ load( go_download_sdk( name = "go_sdk", sdks = { - "darwin_amd64": ("go1.17.10.darwin-amd64.tar.gz", "84979d5985c70cee6f303050a7e811440aad7f304efdf28665b200f096b01945"), - "darwin_arm64": ("go1.17.10.darwin-arm64.tar.gz", "32098bea40117ea1ec23e7124cd188db6bdddd0ea41e2ec9bea3ba35a487e39c"), - "freebsd_amd64": ("go1.17.10.freebsd-amd64.tar.gz", "33794d96f58608fdc023c5114ae9baeeb4111a74720c8830ff25029debe508f0"), - "linux_amd64": ("go1.17.10.linux-amd64.tar.gz", "87fc728c9c731e2f74e4a999ef53cf07302d7ed3504b0839027bd9c10edaa3fd"), - "linux_arm64": ("go1.17.10.linux-arm64.tar.gz", "649141201efa7195403eb1301b95dc79c5b3e65968986a391da1370521701b0c"), - "windows_amd64": ("go1.17.10.windows-amd64.zip", "ba9198a29fa5c4f322212d21569e8507165c3b34e1ed1f1f9cf6dfb71ddcdeb2"), + "darwin_amd64": ("go1.17.11.darwin-amd64.tar.gz", "4f924c534230de8f0e1c7369f611c0310efd21fc2d9438b13bc2703af9dda25a"), + "darwin_arm64": ("go1.17.11.darwin-arm64.tar.gz", "b8e1ab009c2ff8dea462c7a1263d1f3f38e90ab5262e74c76d70e41a4db320be"), + "freebsd_amd64": ("go1.17.11.freebsd-amd64.tar.gz", "da78bcd5efa24cfa8ca3ccf0d222f7d66b755c4200d404869984ebdcfc7b6aa7"), + "linux_amd64": ("go1.17.11.linux-amd64.tar.gz", "d69a4fe2694f795d8e525c72b497ededc209cb7185f4c3b62d7a98dd6227b3fe"), + "linux_arm64": ("go1.17.11.linux-arm64.tar.gz", "adefa7412c6798f9cad02d1e8336fc2242f5bade30c5b32781759181e01961b7"), + "windows_amd64": ("go1.17.11.windows-amd64.zip", "88e60b92069d8e0932ca5d8bd8227d1693b9570fa2afbedadcc680749c428d54"), }, urls = ["https://storage.googleapis.com/public-bazel-artifacts/go/{}"], - version = "1.17.10", + version = "1.17.11", ) # To point to a local SDK path, use the following instead. We'll call the @@ -237,7 +237,7 @@ seed_yarn_cache(name = "yarn_cache") # Install external dependencies for NPM packages in pkg/ui/ as separate bazel # repositories, to avoid version conflicts between those packages. -# Unfortunately Bazel's rules_nodejs does not support yarn workspaces, so +# Unfortunately Bazel's rules_nodejs does not support yarn workspaces, so # packages have isolated dependencies and must be installed as isolated # Bazel repositories. yarn_install( diff --git a/build/bazelutil/distdir_files.bzl b/build/bazelutil/distdir_files.bzl index 910fdd78c058..650522162ea0 100644 --- a/build/bazelutil/distdir_files.bzl +++ b/build/bazelutil/distdir_files.bzl @@ -973,12 +973,12 @@ DISTDIR_FILES = { "https://storage.googleapis.com/public-bazel-artifacts/c-deps/20220520-181309/libproj_foreign.macos.20220520-181309.tar.gz": "aa618a6525c0df669ce1231c837dc46a1a789d54231ae3dcfcada599d0845b22", "https://storage.googleapis.com/public-bazel-artifacts/c-deps/20220520-181309/libproj_foreign.macosarm.20220520-181309.tar.gz": "e3206fa7c2544d1817103c3c61f132d342b5d4a07404d1de4582da968b622749", "https://storage.googleapis.com/public-bazel-artifacts/c-deps/20220520-181309/libproj_foreign.windows.20220520-181309.tar.gz": "4d935601d1b989ff020c66011355a0e2985abacf36f9d952f7f1ce34d54684ad", - "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.10.darwin-amd64.tar.gz": "84979d5985c70cee6f303050a7e811440aad7f304efdf28665b200f096b01945", - "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.10.darwin-arm64.tar.gz": "32098bea40117ea1ec23e7124cd188db6bdddd0ea41e2ec9bea3ba35a487e39c", - "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.10.freebsd-amd64.tar.gz": "33794d96f58608fdc023c5114ae9baeeb4111a74720c8830ff25029debe508f0", - "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.10.linux-amd64.tar.gz": "87fc728c9c731e2f74e4a999ef53cf07302d7ed3504b0839027bd9c10edaa3fd", - "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.10.linux-arm64.tar.gz": "649141201efa7195403eb1301b95dc79c5b3e65968986a391da1370521701b0c", - "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.10.windows-amd64.zip": "ba9198a29fa5c4f322212d21569e8507165c3b34e1ed1f1f9cf6dfb71ddcdeb2", + "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.11.darwin-amd64.tar.gz": "4f924c534230de8f0e1c7369f611c0310efd21fc2d9438b13bc2703af9dda25a", + "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.11.darwin-arm64.tar.gz": "b8e1ab009c2ff8dea462c7a1263d1f3f38e90ab5262e74c76d70e41a4db320be", + "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.11.freebsd-amd64.tar.gz": "da78bcd5efa24cfa8ca3ccf0d222f7d66b755c4200d404869984ebdcfc7b6aa7", + "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.11.linux-amd64.tar.gz": "d69a4fe2694f795d8e525c72b497ededc209cb7185f4c3b62d7a98dd6227b3fe", + "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.11.linux-arm64.tar.gz": "adefa7412c6798f9cad02d1e8336fc2242f5bade30c5b32781759181e01961b7", + "https://storage.googleapis.com/public-bazel-artifacts/go/go1.17.11.windows-amd64.zip": "88e60b92069d8e0932ca5d8bd8227d1693b9570fa2afbedadcc680749c428d54", "https://storage.googleapis.com/public-bazel-artifacts/gomod/github.com/bazelbuild/buildtools/v0.0.0-20200718160251-b1667ff58f71/buildtools-v0.0.0-20200718160251-b1667ff58f71.tar.gz": "a9ef5103739dfb5ed2a5b47ab1654842a89695812e4af09e57d7015a5caf97e0", "https://storage.googleapis.com/public-bazel-artifacts/java/railroad/rr-1.63-java8.zip": "d2791cd7a44ea5be862f33f5a9b3d40aaad9858455828ebade7007ad7113fb41", "https://storage.googleapis.com/public-bazel-artifacts/js/node/v16.13.0/node-v16.13.0-darwin-arm64.tar.gz": "46d83fc0bd971db5050ef1b15afc44a6665dee40bd6c1cbaec23e1b40fa49e6d", diff --git a/build/bootstrap/bootstrap-debian.sh b/build/bootstrap/bootstrap-debian.sh index af24c34871a0..7d611c7d9ddd 100755 --- a/build/bootstrap/bootstrap-debian.sh +++ b/build/bootstrap/bootstrap-debian.sh @@ -46,9 +46,9 @@ sudo tar -C /usr --strip-components=1 -zxf /tmp/cmake.tgz && rm /tmp/cmake.tgz # Install Go. trap 'rm -f /tmp/go.tgz' EXIT -curl -fsSL https://dl.google.com/go/go1.17.10.linux-amd64.tar.gz > /tmp/go.tgz +curl -fsSL https://dl.google.com/go/go1.17.11.linux-amd64.tar.gz > /tmp/go.tgz sha256sum -c - < /tmp/go.tgz +curl -fsSL https://dl.google.com/go/go1.17.11.linux-amd64.tar.gz > /tmp/go.tgz sha256sum -c - < /tmp/go_old.tgz sha256sum -c - < /tmp/go.tgz`, + ctx, t, c, node, "download go", `curl -fsSL https://dl.google.com/go/go1.17.11.linux-amd64.tar.gz > /tmp/go.tgz`, ); err != nil { t.Fatal(err) } if err := repeatRunE( ctx, t, c, node, "verify tarball", `sha256sum -c - <= raftStatus.Commit) { - return false - } - } - return true -} - -// replicaMayNeedSnapshot determines whether the replica referred to by -// `replicaID` may be in need of a raft snapshot. If this function is called -// with an empty or nil `raftStatus` (as will be the case when its called by a -// replica that is not the raft leader), we pessimistically assume that -// `replicaID` may need a snapshot. -func replicaMayNeedSnapshot(raftStatus *raft.Status, replica roachpb.ReplicaDescriptor) bool { - if raftStatus == nil || len(raftStatus.Progress) == 0 { - return true - } - if progress, ok := raftStatus.Progress[uint64(replica.ReplicaID)]; ok { - // We can only reasonably assume that the follower replica is not in need of - // a snapshot iff it is in `StateReplicate`. However, even this is racey - // because we can still possibly have an ill-timed log truncation between - // when we make this determination and when we act on it. - return progress.State != tracker.StateReplicate - } - return true -} - // excludeReplicasInNeedOfSnapshots filters out the `replicas` that may be in // need of a raft snapshot. VOTER_INCOMING replicas are not filtered out. // Other replicas may be filtered out if this function is called with the // `raftStatus` of a non-raft leader replica. func excludeReplicasInNeedOfSnapshots( - ctx context.Context, raftStatus *raft.Status, replicas []roachpb.ReplicaDescriptor, + ctx context.Context, st *raft.Status, firstIndex uint64, replicas []roachpb.ReplicaDescriptor, ) []roachpb.ReplicaDescriptor { filled := 0 for _, repl := range replicas { - if replicaMayNeedSnapshot(raftStatus, repl) { + if raftutil.ReplicaMayNeedSnapshot(st, firstIndex, repl.ReplicaID) != raftutil.NoSnapshotNeeded { log.VEventf( ctx, 5, diff --git a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go index 2018a73623c1..fe0106903437 100644 --- a/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go +++ b/pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go @@ -1664,6 +1664,7 @@ func (r *mockRepl) RaftStatus() *raft.Status { raftStatus := &raft.Status{ Progress: make(map[uint64]tracker.Progress), } + raftStatus.RaftState = raft.StateLeader for i := int32(1); i <= r.replicationFactor; i++ { state := tracker.StateReplicate if _, ok := r.replsInNeedOfSnapshot[roachpb.ReplicaID(i)]; ok { @@ -1674,6 +1675,10 @@ func (r *mockRepl) RaftStatus() *raft.Status { return raftStatus } +func (r *mockRepl) GetFirstIndex() uint64 { + return 0 +} + func (r *mockRepl) StoreID() roachpb.StoreID { return r.storeID } @@ -7138,6 +7143,7 @@ func TestFilterBehindReplicas(t *testing.T) { Progress: make(map[uint64]tracker.Progress), } status.Lead = c.leader + status.RaftState = raft.StateLeader status.Commit = c.commit var replicas []roachpb.ReplicaDescriptor for j, v := range c.progress { @@ -7210,6 +7216,7 @@ func TestFilterUnremovableReplicas(t *testing.T) { // Use an invalid replica ID for the leader. TestFilterBehindReplicas covers // valid replica IDs. status.Lead = 99 + status.RaftState = raft.StateLeader status.Commit = c.commit var replicas []roachpb.ReplicaDescriptor for j, v := range c.progress { @@ -7267,6 +7274,7 @@ func TestSimulateFilterUnremovableReplicas(t *testing.T) { // Use an invalid replica ID for the leader. TestFilterBehindReplicas covers // valid replica IDs. status.Lead = 99 + status.RaftState = raft.StateLeader status.Commit = c.commit var replicas []roachpb.ReplicaDescriptor for j, v := range c.progress { diff --git a/pkg/kv/kvserver/allocator_impl_test.go b/pkg/kv/kvserver/allocator_impl_test.go index 44a829765b6a..b13af5c72c19 100644 --- a/pkg/kv/kvserver/allocator_impl_test.go +++ b/pkg/kv/kvserver/allocator_impl_test.go @@ -170,6 +170,8 @@ func TestAllocatorRebalanceTarget(t *testing.T) { status := &raft.Status{ Progress: make(map[uint64]tracker.Progress), } + status.Lead = 1 + status.RaftState = raft.StateLeader status.Commit = 10 for _, replica := range replicas { status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ diff --git a/pkg/kv/kvserver/deprecated_store_rebalancer.go b/pkg/kv/kvserver/deprecated_store_rebalancer.go index e47dad57f8be..d65957d88899 100644 --- a/pkg/kv/kvserver/deprecated_store_rebalancer.go +++ b/pkg/kv/kvserver/deprecated_store_rebalancer.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/storepool" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftutil" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -108,7 +109,7 @@ func (sr *StoreRebalancer) deprecatedChooseLeaseToTransfer( if raftStatus == nil { raftStatus = sr.getRaftStatusFn(replWithStats.repl) } - if allocatorimpl.ReplicaIsBehind(raftStatus, candidate.ReplicaID) { + if raftutil.ReplicaIsBehind(raftStatus, candidate.ReplicaID) { log.VEventf(ctx, 3, "%v is behind or this store isn't the raft leader for r%d; raftStatus: %v", candidate, desc.RangeID, raftStatus) continue @@ -297,7 +298,7 @@ func (sr *StoreRebalancer) deprecatedChooseRangeToRebalance( if raftStatus == nil { raftStatus = sr.getRaftStatusFn(replWithStats.repl) } - if allocatorimpl.ReplicaIsBehind(raftStatus, replica.ReplicaID) { + if raftutil.ReplicaIsBehind(raftStatus, replica.ReplicaID) { continue } } diff --git a/pkg/kv/kvserver/deprecated_store_rebalancer_test.go b/pkg/kv/kvserver/deprecated_store_rebalancer_test.go index aff73ed7754e..901d10134b89 100644 --- a/pkg/kv/kvserver/deprecated_store_rebalancer_test.go +++ b/pkg/kv/kvserver/deprecated_store_rebalancer_test.go @@ -105,6 +105,7 @@ func TestDeprecatedChooseLeaseToTransfer(t *testing.T) { Progress: make(map[uint64]tracker.Progress), } status.Lead = uint64(r.ReplicaID()) + status.RaftState = raft.StateLeader status.Commit = 1 for _, replica := range r.Desc().InternalReplicas { status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ @@ -219,6 +220,7 @@ func TestDeprecatedChooseRangeToRebalanceBalanceScore(t *testing.T) { Progress: make(map[uint64]tracker.Progress), } status.Lead = uint64(r.ReplicaID()) + status.RaftState = raft.StateLeader status.Commit = 1 for _, replica := range r.Desc().InternalReplicas { status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ @@ -290,6 +292,7 @@ func TestDeprecatedChooseRangeToRebalance(t *testing.T) { Progress: make(map[uint64]tracker.Progress), } status.Lead = uint64(r.ReplicaID()) + status.RaftState = raft.StateLeader status.Commit = 1 for _, replica := range r.Desc().InternalReplicas { status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ @@ -652,6 +655,7 @@ func TestDeprecatedNoLeaseTransferToBehindReplicas(t *testing.T) { Progress: make(map[uint64]tracker.Progress), } status.Lead = uint64(r.ReplicaID()) + status.RaftState = raft.StateLeader status.Commit = 1 for _, replica := range r.Desc().InternalReplicas { match := uint64(1) diff --git a/pkg/kv/kvserver/raftutil/BUILD.bazel b/pkg/kv/kvserver/raftutil/BUILD.bazel new file mode 100644 index 000000000000..c3a2cdc3077d --- /dev/null +++ b/pkg/kv/kvserver/raftutil/BUILD.bazel @@ -0,0 +1,24 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") + +go_library( + name = "raftutil", + srcs = ["util.go"], + importpath = "github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftutil", + visibility = ["//visibility:public"], + deps = [ + "//pkg/roachpb", + "@io_etcd_go_etcd_raft_v3//:raft", + "@io_etcd_go_etcd_raft_v3//tracker", + ], +) + +go_test( + name = "raftutil_test", + srcs = ["util_test.go"], + embed = [":raftutil"], + deps = [ + "@com_github_stretchr_testify//require", + "@io_etcd_go_etcd_raft_v3//:raft", + "@io_etcd_go_etcd_raft_v3//tracker", + ], +) diff --git a/pkg/kv/kvserver/raftutil/util.go b/pkg/kv/kvserver/raftutil/util.go new file mode 100644 index 000000000000..029ae4d3e04e --- /dev/null +++ b/pkg/kv/kvserver/raftutil/util.go @@ -0,0 +1,185 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package raftutil + +import ( + "github.com/cockroachdb/cockroach/pkg/roachpb" + "go.etcd.io/etcd/raft/v3" + "go.etcd.io/etcd/raft/v3/tracker" +) + +// ReplicaIsBehind returns whether the given peer replica is considered behind +// according to the raft log. If this function is called with a raft.Status that +// indicates that our local replica is not the raft leader, we pessimistically +// assume that replicaID is behind on its log. +func ReplicaIsBehind(st *raft.Status, replicaID roachpb.ReplicaID) bool { + if st == nil { + // Testing only. + return true + } + if st.RaftState != raft.StateLeader { + // If we aren't the Raft leader, we aren't tracking the replica's progress, + // so we can't be sure it's not behind. + return true + } + progress, ok := st.Progress[uint64(replicaID)] + if !ok { + return true + } + if uint64(replicaID) == st.Lead { + // If the replica is the leader, it cannot be behind on the log. + return false + } + if progress.State == tracker.StateReplicate && progress.Match >= st.Commit { + // If the replica has a matching log entry at or above the current commit + // index, it is caught up on its log. + return false + } + return true +} + +// ReplicaNeedsSnapshotStatus enumerates the possible states that a peer replica +// may be in when the local replica makes a determination of whether the peer +// may be in need of a Raft snapshot. NoSnapshotNeeded indicates that a Raft +// snapshot is definitely not needed. All other values indicate that for one +// reason or another the local replica cannot say with certainty that the peer +// does not need a snapshot, either because it knows that the peer does need a +// snapshot or because it does not know. +type ReplicaNeedsSnapshotStatus int + +const ( + // NoSnapshotNeeded means that the local replica knows for sure that the peer + // replica does not need a Raft snapshot. This status is only possible if the + // local replica is the Raft leader, because only the Raft leader tracks the + // progress of other replicas. + // + // There are two ways that a peer replica that is at some point seen to be in + // this state can later end up in need of a Raft snapshot: + // 1. a log truncation cuts the peer replica off from the log, preventing it + // from catching up using log entries. The Raft log queue attempts to avoid + // cutting any follower off from connectivity with the leader's log, but + // there are cases where it does so. + // 2. raft leadership is transferred to a replica whose log does not extend + // back as far as the current raft leader's log. This is possible because + // different replicas can have Raft logs with different starting points + // ("first indexes"). See discussion about #35701 in #81561. + NoSnapshotNeeded ReplicaNeedsSnapshotStatus = iota + + // LocalReplicaNotLeader means that the local replica is not the Raft leader, + // so it does not keep track of enough progress information about peers to + // determine whether they are in need of a Raft snapshot or not. + LocalReplicaNotLeader + + // ReplicaUnknown means that the peer replica is not known by the Raft leader + // and is not part of the Raft group. + ReplicaUnknown + + // ReplicaStateProbe means that the local Raft leader is still probing the + // peer replica to determine the index of matching tail of its log. + ReplicaStateProbe + + // ReplicaStateSnapshot means that the local Raft leader has determined that + // the peer replica needs a Raft snapshot. + ReplicaStateSnapshot + + // ReplicaMatchBelowLeadersFirstIndex means that the local Raft leader has + // determined that the peer replica's latest matching log index is below the + // leader's log's current first index. This can happen if a peer replica is + // initially connected to the Raft leader's log but gets disconnected due to a + // log truncation. etcd/raft will notice this state after sending the next + // MsgApp and move the peer to StateSnapshot. + ReplicaMatchBelowLeadersFirstIndex + + // NoRaftStatusAvailable is only possible in tests. + NoRaftStatusAvailable +) + +func (s ReplicaNeedsSnapshotStatus) String() string { + switch s { + case NoSnapshotNeeded: + return "no snapshot needed" + case LocalReplicaNotLeader: + return "local replica not raft leader" + case ReplicaUnknown: + return "replica unknown" + case ReplicaStateProbe: + return "replica in StateProbe" + case ReplicaStateSnapshot: + return "replica in StateSnapshot" + case ReplicaMatchBelowLeadersFirstIndex: + return "replica's match index below leader's first index" + case NoRaftStatusAvailable: + return "no raft status available" + default: + return "unknown ReplicaNeedsSnapshotStatus" + } +} + +// ReplicaMayNeedSnapshot determines whether the given peer replica may be in +// need of a raft snapshot. If this function is called with a raft.Status that +// indicates that our local replica is not the raft leader, we pessimistically +// assume that replicaID may need a snapshot. +func ReplicaMayNeedSnapshot( + st *raft.Status, firstIndex uint64, replicaID roachpb.ReplicaID, +) ReplicaNeedsSnapshotStatus { + if st == nil { + // Testing only. + return NoRaftStatusAvailable + } + if st.RaftState != raft.StateLeader { + // If we aren't the Raft leader, we aren't tracking the replica's progress, + // so we can't be sure it does not need a snapshot. + return LocalReplicaNotLeader + } + progress, ok := st.Progress[uint64(replicaID)] + if !ok { + // We don't know about the specified replica. + return ReplicaUnknown + } + switch progress.State { + case tracker.StateReplicate: + // We can only reasonably assume that the follower replica is not in need of + // a snapshot if it is in StateReplicate. + case tracker.StateProbe: + // If the follower is in StateProbe then we are still in the process of + // determining where our logs match. + return ReplicaStateProbe + case tracker.StateSnapshot: + // If the follower is in StateSnapshot then it needs a snapshot. + return ReplicaStateSnapshot + default: + panic("unknown tracker.StateType") + } + if progress.Match+1 < firstIndex { + // Even if the follower is in StateReplicate, it could have been cut off + // from the log by a recent log truncation that hasn't been recognized yet + // by raft. Confirm that this is not the case. + return ReplicaMatchBelowLeadersFirstIndex + } + // Even if we get here, this can still be racy because: + // 1. we may think we are the Raft leader but may have been or will be voted + // out without realizing. If another peer takes over as the Raft leader, it + // may commit additional log entries and then cut the peer off from the log. + // 2. we can still have an ill-timed log truncation between when we make this + // determination and when we act on it. + // + // In order to eliminate the potential for a race when acting on this + // information, we must ensure: + // 1. that any action we take is conditional on still being the Raft leader. + // In practice, this means that we should check this condition immediately + // before proposing a Raft command, so we can be sure that the command is + // not redirected through another Raft leader. That way, if we were + // replaced as Raft leader, the proposal will fail. + // 2. that we do not perform a log truncation between now and when our action + // goes into effect. In practice, this means serializing with Raft log + // truncation operations using latching. + return NoSnapshotNeeded +} diff --git a/pkg/kv/kvserver/raftutil/util_test.go b/pkg/kv/kvserver/raftutil/util_test.go new file mode 100644 index 000000000000..529d018f1336 --- /dev/null +++ b/pkg/kv/kvserver/raftutil/util_test.go @@ -0,0 +1,207 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. + +package raftutil + +import ( + "testing" + + "github.com/stretchr/testify/require" + "go.etcd.io/etcd/raft/v3" + "go.etcd.io/etcd/raft/v3/tracker" +) + +func TestReplicaIsBehind(t *testing.T) { + const replicaID = 3 + makeStatus := func(f func(*raft.Status)) *raft.Status { + st := new(raft.Status) + st.Commit = 10 + st.Progress = make(map[uint64]tracker.Progress) + f(st) + return st + } + + tests := []struct { + name string + st *raft.Status + expect bool + }{ + { + name: "local follower", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateFollower + }), + expect: true, + }, + { + name: "local candidate", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateCandidate + }), + expect: true, + }, + { + name: "local leader, no progress for peer", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + }), + expect: true, + }, + { + name: "local leader, peer leader", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateReplicate} + st.Lead = replicaID + }), + expect: false, + }, + { + name: "local leader, peer state probe", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateProbe} + }), + expect: true, + }, + { + name: "local leader, peer state snapshot", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateSnapshot} + }), + expect: true, + }, + { + name: "local leader, peer state replicate, match < commit", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateReplicate, Match: 9} + }), + expect: true, + }, + { + name: "local leader, peer state replicate, match == commit", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateReplicate, Match: 10} + }), + expect: false, + }, + { + name: "local leader, peer state replicate, match > commit", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateReplicate, Match: 11} + }), + expect: false, + }, + { + name: "nil raft status", + st: nil, + expect: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.expect, ReplicaIsBehind(tt.st, replicaID)) + }) + } +} + +func TestReplicaMayNeedSnapshot(t *testing.T) { + const firstIndex = 10 + const replicaID = 3 + makeStatus := func(f func(*raft.Status)) *raft.Status { + st := new(raft.Status) + st.Commit = 10 + st.Progress = make(map[uint64]tracker.Progress) + f(st) + return st + } + + tests := []struct { + name string + st *raft.Status + expect ReplicaNeedsSnapshotStatus + }{ + { + name: "local follower", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateFollower + }), + expect: LocalReplicaNotLeader, + }, + { + name: "local candidate", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateCandidate + }), + expect: LocalReplicaNotLeader, + }, + { + name: "local leader, no progress for peer", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + }), + expect: ReplicaUnknown, + }, + { + name: "local leader, peer state probe", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateProbe} + }), + expect: ReplicaStateProbe, + }, + { + name: "local leader, peer state snapshot", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateSnapshot} + }), + expect: ReplicaStateSnapshot, + }, + { + name: "local leader, peer state replicate, match+1 < firstIndex", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateReplicate, Match: 8} + }), + expect: ReplicaMatchBelowLeadersFirstIndex, + }, + { + name: "local leader, peer state replicate, match+1 == firstIndex", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateReplicate, Match: 9} + }), + expect: NoSnapshotNeeded, + }, + { + name: "local leader, peer state replicate, match+1 == firstIndex", + st: makeStatus(func(st *raft.Status) { + st.RaftState = raft.StateLeader + st.Progress[replicaID] = tracker.Progress{State: tracker.StateReplicate, Match: 10} + }), + expect: NoSnapshotNeeded, + }, + { + name: "nil raft status", + st: nil, + expect: NoRaftStatusAvailable, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + require.Equal(t, tt.expect, ReplicaMayNeedSnapshot(tt.st, firstIndex, replicaID)) + }) + } +} diff --git a/pkg/kv/kvserver/store_rebalancer.go b/pkg/kv/kvserver/store_rebalancer.go index 5ccc1bdc51a8..6abb6022e2e2 100644 --- a/pkg/kv/kvserver/store_rebalancer.go +++ b/pkg/kv/kvserver/store_rebalancer.go @@ -20,6 +20,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/allocatorimpl" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/allocator/storepool" + "github.com/cockroachdb/cockroach/pkg/kv/kvserver/raftutil" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" @@ -648,7 +649,7 @@ func (sr *StoreRebalancer) chooseRangeToRebalance( if raftStatus == nil { raftStatus = sr.getRaftStatusFn(replWithStats.repl) } - if allocatorimpl.ReplicaIsBehind(raftStatus, replica.ReplicaID) { + if raftutil.ReplicaIsBehind(raftStatus, replica.ReplicaID) { continue } } diff --git a/pkg/kv/kvserver/store_rebalancer_test.go b/pkg/kv/kvserver/store_rebalancer_test.go index 937e8e44b24d..80f9e25af08f 100644 --- a/pkg/kv/kvserver/store_rebalancer_test.go +++ b/pkg/kv/kvserver/store_rebalancer_test.go @@ -455,6 +455,7 @@ func loadRanges(rr *replicaRankings, s *Store, ranges []testRange) { Expiration: &hlc.MaxTimestamp, Replica: repl.mu.state.Desc.InternalReplicas[0], } + repl.mu.state.TruncatedState = &roachpb.RaftTruncatedState{} for _, storeID := range r.nonVoters { repl.mu.state.Desc.InternalReplicas = append(repl.mu.state.Desc.InternalReplicas, roachpb.ReplicaDescriptor{ NodeID: roachpb.NodeID(storeID), @@ -517,6 +518,7 @@ func TestChooseLeaseToTransfer(t *testing.T) { Progress: make(map[uint64]tracker.Progress), } status.Lead = uint64(r.ReplicaID()) + status.RaftState = raft.StateLeader status.Commit = 1 for _, replica := range r.Desc().InternalReplicas { status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ @@ -786,6 +788,7 @@ func TestChooseRangeToRebalanceRandom(t *testing.T) { Progress: make(map[uint64]tracker.Progress), } status.Lead = uint64(r.ReplicaID()) + status.RaftState = raft.StateLeader status.Commit = 1 for _, replica := range r.Desc().InternalReplicas { status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ @@ -1064,6 +1067,7 @@ func TestChooseRangeToRebalanceAcrossHeterogeneousZones(t *testing.T) { Progress: make(map[uint64]tracker.Progress), } status.Lead = uint64(r.ReplicaID()) + status.RaftState = raft.StateLeader status.Commit = 1 for _, replica := range r.Desc().InternalReplicas { status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ @@ -1312,6 +1316,7 @@ func TestChooseRangeToRebalanceOffHotNodes(t *testing.T) { Progress: make(map[uint64]tracker.Progress), } status.Lead = uint64(r.ReplicaID()) + status.RaftState = raft.StateLeader status.Commit = 1 for _, replica := range r.Desc().InternalReplicas { status.Progress[uint64(replica.ReplicaID)] = tracker.Progress{ @@ -1394,6 +1399,7 @@ func TestNoLeaseTransferToBehindReplicas(t *testing.T) { Progress: make(map[uint64]tracker.Progress), } status.Lead = uint64(r.ReplicaID()) + status.RaftState = raft.StateLeader status.Commit = 1 for _, replica := range r.Desc().InternalReplicas { match := uint64(1) diff --git a/pkg/sql/pgwire/conn.go b/pkg/sql/pgwire/conn.go index 752c370d37a9..f92e125c17ab 100644 --- a/pkg/sql/pgwire/conn.go +++ b/pkg/sql/pgwire/conn.go @@ -1474,6 +1474,9 @@ func writeErrFields( msgBuilder.putErrFieldMsg(pgwirebase.ServerErrFieldSeverity) msgBuilder.writeTerminatedString(pgErr.Severity) + msgBuilder.putErrFieldMsg(pgwirebase.ServerErrFieldSeverityNonLocalized) + msgBuilder.writeTerminatedString(pgErr.Severity) + msgBuilder.putErrFieldMsg(pgwirebase.ServerErrFieldSQLState) msgBuilder.writeTerminatedString(pgErr.Code) diff --git a/pkg/sql/pgwire/pgwirebase/msg.go b/pkg/sql/pgwire/pgwirebase/msg.go index fa50ba48e4f9..b4ee25dab53b 100644 --- a/pkg/sql/pgwire/pgwirebase/msg.go +++ b/pkg/sql/pgwire/pgwirebase/msg.go @@ -61,15 +61,16 @@ type ServerErrFieldType byte // http://www.postgresql.org/docs/current/static/protocol-error-fields.html const ( - ServerErrFieldSeverity ServerErrFieldType = 'S' - ServerErrFieldSQLState ServerErrFieldType = 'C' - ServerErrFieldMsgPrimary ServerErrFieldType = 'M' - ServerErrFieldDetail ServerErrFieldType = 'D' - ServerErrFieldHint ServerErrFieldType = 'H' - ServerErrFieldSrcFile ServerErrFieldType = 'F' - ServerErrFieldSrcLine ServerErrFieldType = 'L' - ServerErrFieldSrcFunction ServerErrFieldType = 'R' - ServerErrFieldConstraintName ServerErrFieldType = 'n' + ServerErrFieldSeverity ServerErrFieldType = 'S' + ServerErrFieldSeverityNonLocalized ServerErrFieldType = 'V' + ServerErrFieldSQLState ServerErrFieldType = 'C' + ServerErrFieldMsgPrimary ServerErrFieldType = 'M' + ServerErrFieldDetail ServerErrFieldType = 'D' + ServerErrFieldHint ServerErrFieldType = 'H' + ServerErrFieldSrcFile ServerErrFieldType = 'F' + ServerErrFieldSrcLine ServerErrFieldType = 'L' + ServerErrFieldSrcFunction ServerErrFieldType = 'R' + ServerErrFieldConstraintName ServerErrFieldType = 'n' ) // PrepareType represents a subtype for prepare messages. diff --git a/pkg/sql/pgwire/pgwirebase/servererrfieldtype_string.go b/pkg/sql/pgwire/pgwirebase/servererrfieldtype_string.go index 7b4546d406ab..8aa43ba59cc4 100644 --- a/pkg/sql/pgwire/pgwirebase/servererrfieldtype_string.go +++ b/pkg/sql/pgwire/pgwirebase/servererrfieldtype_string.go @@ -9,6 +9,7 @@ func _() { // Re-run the stringer command to generate them again. var x [1]struct{} _ = x[ServerErrFieldSeverity-83] + _ = x[ServerErrFieldSeverityNonLocalized-86] _ = x[ServerErrFieldSQLState-67] _ = x[ServerErrFieldMsgPrimary-77] _ = x[ServerErrFieldDetail-68] @@ -25,7 +26,8 @@ const ( _ServerErrFieldType_name_2 = "ServerErrFieldHint" _ServerErrFieldType_name_3 = "ServerErrFieldSrcLineServerErrFieldMsgPrimary" _ServerErrFieldType_name_4 = "ServerErrFieldSrcFunctionServerErrFieldSeverity" - _ServerErrFieldType_name_5 = "ServerErrFieldConstraintName" + _ServerErrFieldType_name_5 = "ServerErrFieldSeverityNonLocalized" + _ServerErrFieldType_name_6 = "ServerErrFieldConstraintName" ) var ( @@ -49,8 +51,10 @@ func (i ServerErrFieldType) String() string { case 82 <= i && i <= 83: i -= 82 return _ServerErrFieldType_name_4[_ServerErrFieldType_index_4[i]:_ServerErrFieldType_index_4[i+1]] - case i == 110: + case i == 86: return _ServerErrFieldType_name_5 + case i == 110: + return _ServerErrFieldType_name_6 default: return "ServerErrFieldType(" + strconv.FormatInt(int64(i), 10) + ")" } diff --git a/pkg/sql/pgwire/testdata/pgtest/notice b/pkg/sql/pgwire/testdata/pgtest/notice index 9640250a9d9a..19cba11422be 100644 --- a/pkg/sql/pgwire/testdata/pgtest/notice +++ b/pkg/sql/pgwire/testdata/pgtest/notice @@ -55,7 +55,7 @@ Query {"String": "DROP INDEX t_x_idx"} until crdb_only CommandComplete ---- -{"Severity":"NOTICE","SeverityUnlocalized":"","Code":"00000","Message":"the data for dropped indexes is reclaimed asynchronously","Detail":"","Hint":"The reclamation delay can be customized in the zone configuration for the table.","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"drop_index.go","Line":572,"Routine":"dropIndexByName","UnknownFields":null} +{"Severity":"NOTICE","SeverityUnlocalized":"NOTICE","Code":"00000","Message":"the data for dropped indexes is reclaimed asynchronously","Detail":"","Hint":"The reclamation delay can be customized in the zone configuration for the table.","Position":0,"InternalPosition":0,"InternalQuery":"","Where":"","SchemaName":"","TableName":"","ColumnName":"","DataTypeName":"","ConstraintName":"","File":"drop_index.go","Line":572,"Routine":"dropIndexByName","UnknownFields":null} {"Type":"CommandComplete","CommandTag":"DROP INDEX"} until noncrdb_only