From cfd625ef1d4d9d9fcc8b1d28f5a43d57088233bc Mon Sep 17 00:00:00 2001
From: Tobias Grieger
Date: Tue, 14 Dec 2021 14:36:37 +0100
Subject: [PATCH] kvserver: improve Store comment

This incorporates post-merge feedback on #72745.

Release note: None
---
 pkg/kv/kvserver/addressing.go                |  18 ++-
 .../replica_application_state_machine.go     |  15 ++-
 pkg/kv/kvserver/store.go                     | 110 +++++++++++-------
 3 files changed, 97 insertions(+), 46 deletions(-)

diff --git a/pkg/kv/kvserver/addressing.go b/pkg/kv/kvserver/addressing.go
index 33ac948bdc49..7f8c892a2369 100644
--- a/pkg/kv/kvserver/addressing.go
+++ b/pkg/kv/kvserver/addressing.go
@@ -50,8 +50,22 @@ func mergeRangeAddressing(b *kv.Batch, left, merged *roachpb.RangeDescriptor) er
 	return rangeAddressing(b, merged, putMeta)
 }
 
-// updateRangeAddressing overwrites the meta1 and meta2 range addressing
-// records for the descriptor.
+// updateRangeAddressing overwrites the meta1 and meta2 range addressing records
+// for the descriptor. These are copies of the range descriptor but keyed by the
+// EndKey (for historical reasons relating to ReverseScan not having been
+// available; we might use the StartKey if starting from scratch today). While
+// they are usually in sync with the main copy under
+// RangeDescriptorKey(StartKey), after loss-of-quorum recovery a resurrected
+// range may be operating under a newer descriptor (containing fewer replicas).
+// This is why updateRangeAddressing uses Put over ConditionalPut, thus allowing
+// up-replication to "repair" the meta copies of the descriptor as a by-product.
+//
+// The fact that the meta2 ranges can split is the source of considerable
+// complication, such as storing the descriptor for the last meta2 range twice,
+// once at Meta1KeyMax and another time at RangeMetaKey(desc.EndKey); see
+// #18998.
+//
+// See also kv.RangeLookup for more information on the meta ranges.
 func updateRangeAddressing(b *kv.Batch, desc *roachpb.RangeDescriptor) error {
 	return rangeAddressing(b, desc, putMeta)
 }
diff --git a/pkg/kv/kvserver/replica_application_state_machine.go b/pkg/kv/kvserver/replica_application_state_machine.go
index a376607c570c..4a9fbcc52283 100644
--- a/pkg/kv/kvserver/replica_application_state_machine.go
+++ b/pkg/kv/kvserver/replica_application_state_machine.go
@@ -1313,7 +1313,7 @@ func (sm *replicaStateMachine) maybeApplyConfChange(ctx context.Context, cmd *re
 		// implies that any previous config changes are durably known to be
 		// committed. That is, a commit index is persisted (and synced) that
 		// encompasses any earlier config changes before a new config change is
-		// appended.
+		// appended[1].
 		//
 		// Together, these invariants ensure that a follower that is behind by
 		// multiple configuration changes will be using one of the two most recent
@@ -1334,6 +1334,19 @@ func (sm *replicaStateMachine) maybeApplyConfChange(ctx context.Context, cmd *re
 		// that it will apply all the way up to and including the second most
 		// recent configuration change, which is compatible with the most recent
 		// one.
+		//
+		// [1]: this rests on shaky and, in particular, untested foundations in
+		// etcd/raft and our syncing behavior. The argument goes as follows:
+		// because the leader will have at most one config change in flight at a
+		// given time, it will definitely wait until the previous config change is
+		// committed before accepting the next one. `etcd/raft` will always attach
+		// the optimal commit index to appends to followers, so each config change
+		// will mark the previous one as committed upon receipt; since we sync on
+		// append (as we have to), we make that HardState.Commit durable. Finally,
+		// when a follower is catching up on larger chunks of the historical log,
+		// it will receive batches of entries together with a committed index
+		// encompassing the entire batch, again making sure that these batches are
+		// durably committed upon receipt.
 		rn.ApplyConfChange(cmd.confChange.ConfChangeI)
 		return true, nil
 	})
diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go
index 55d7374d4084..b106db35bafa 100644
--- a/pkg/kv/kvserver/store.go
+++ b/pkg/kv/kvserver/store.go
@@ -459,7 +459,8 @@ these operations at some point will
 
 - update the RangeDescriptor (for example, to reflect a split, or a change
 to the Replicas comprising the members of the Range)
 
-- update the meta ranges (which form a search index used for request routing)
+- update the meta ranges (which form a search index used for request routing, see
+  kv.RangeLookup and updateRangeAddressing for details)
 
 - commit with a roachpb.InternalCommitTrigger.
@@ -479,7 +480,8 @@ ResolveIntent acting on the RangeDescriptor.
 
 INVARIANT: A Store never contains two Replicas from the same Range, nor do the
 key ranges for any two of its Replicas overlap. (Note that there is no
-requirement that these Replicas come from a consistent set of Ranges).
+requirement that these Replicas come from a consistent set of Ranges; the
+RangeDescriptor.Generation orders overlapping descriptors).
 
 To illustrate this last invariant, consider a split of a Replica [a-z) into two
 Replicas, [a-c) and [c,z). The Store will never contain both; it has to swap
@@ -516,39 +518,43 @@ straightforward but when Ranges are being reconfigured, an understanding the
 Replica Lifecycle becomes important and upholding the Store's invariants
 becomes more complex.
 
-A first phenomenon to understand is that of uninitialized Replicas. Such a
-Replica corresponds to a State Machine that has yet to apply any entries, and in
-particular has no notion of an active RangeDescriptor yet. This state exists for
-various reasons:
-
-- A newly added voter already needs to be able to cast a vote even before it has
-had time to be caught up by other nodes. In the extreme case, the very first
-Raft message received at a Store leading to the creation of an (uninitialized)
-Replica may be a request for a vote that is required for quorum. In practice we
-mostly sidestep this voting requirement by adding new members through an
-intermediate state (see roachpb.LEARNER) that allows them to catch up and become
-initialized before making them voters, but at the protocol level this still
-relies on the concept of a Replica without state that can act as a Raft peer.
-
-- For practical reasons, we don't preserve the entire committed replicated log
-for eternity. It will periodically get truncated, i.e. a prefix discarded, and
-this leads to followers occasionally being forced to catch up on the log via a
-Raft Snapshot, i.e. a copy of the replicated data in the State Machine as of
-some log position, from which the recipient Replica can then continue to receive
-and apply log entries.
-
-- In CockroachDB, a newly created Range (say at cluster bootstrap, or during a
-Range split) is never empty - it has to contain at least the RangeDescriptor.
-For the split case, indeed it contains around half of the data originally housed
-in the pre-split Range, and it is burdensome to distribute potentially hundreds
-of megabytes through the Raft log. So what we do in CockroachDB is to never use
-Raft entries zero through nine (see raftInitialLogIndex), which means that a
-newly added Replica will need a first Snapshot to obtain access to the log.
-
-Externally, uninitialized Replicas should be viewed as an implementation
-detail that is (or should be!) invisible to most access to a Store in the
-context of processing requests coming from the KV API, as uninitialized
-Replicas cannot serve such requests.
+A first phenomenon to understand is that of uninitialized Replicas, which are
+State Machines at applied index zero, i.e. have an empty state. In CockroachDB,
+an uninitialized Replica can only advance to a nonzero log position ("become
+initialized") via a Raft snapshot (this is because we initialize all Ranges in
+the system at log index raftInitialLogIndex, which allows us to write arbitrary
+amounts of data into the initial state without having to worry about the size
+of individual log entries; see WriteInitialReplicaState).
+
+An uninitialized Replica has no notion of its active RangeDescriptor yet, has
+no data, and should be thought of as a pure raft peer that can react to incoming
+messages (it can't send messages, as it is unaware of who the other members of
+the group are; its configuration is zero, consistent with the configuration
+represented by "no RangeDescriptor"); in particular it may be asked to cast a
+vote to determine Raft leadership. (This should not be required in CockroachDB
+today since we always add an uninitialized Replica as a roachpb.LEARNER first
+and only promote it after it has successfully received a Raft snapshot, but we
+don't rely on this fact.)
+
+Uninitialized Replicas should be viewed as an implementation detail that is (or
+should be!) invisible to most access to a Store in the context of processing
+requests coming from the KV API, as uninitialized Replicas cannot serve such
+requests. At the time of writing, uninitialized Replicas need to be handled
+in many code paths that should never encounter them in the first place. We
+will be addressing this with #72374.
+
+Raft snapshots can also be directed at initialized Replicas. For practical
+reasons, we cannot preserve the entire committed replicated log forever, and we
+periodically purge a prefix that is known to be durably applied on a quorum of
+peers. Under certain circumstances (see newTruncateDecision) this may cut a
+follower off from the log, and this follower will require a Raft snapshot before
+being able to resume replication. Another (rare) source of snapshots occurs
+around Range splits. During a split, the involved Replicas shrink and
+simultaneously instantiate the right-hand side Replica, but since Replicas carry
+out this operation at different times, a "faster" initialized right-hand side
+might contact a "slow" store on which the right-hand side has not yet been
+instantiated, leading to the creation of an uninitialized Replica that may
+request a snapshot. See maybeDelaySplitToAvoidSnapshot.
 
 The diagram is a lot to take in. The various transitions are discussed in
 prose below, and the source .dot file is in store_doc_replica_lifecycle.dot.
@@ -589,7 +595,6 @@ prose below, and the source .dot file is in store_doc_replica_lifecycle.dot.
 |                                                   | <-------------------------------+
 +---------------------------------------------------------------------------------------------+
 
-
 When a Store starts, it iterates through all RangeDescriptors it can find on
 its Engine. Finding a RangeDescriptor by definition implies that the Replica is
 initialized. Raft state (a raftpb.HardState) for uninitialized Replicas may
@@ -626,7 +631,7 @@ ReplicaID, the uninitialized Replica is removed. There is currently no
 general-purpose mechanism to determine whether an uninitialized Replica is
 outdated; an uninitialized Replica could in principle leak "forever" if the
 Range quickly changes its members such that the triggers mentioned here don't
-apply A full scan of meta2 would be required as there is no RangeID-keyed index
+apply. A full scan of meta2 would be required as there is no RangeID-keyed index
 on the RangeDescriptors (the meta ranges are keyed on the EndKey, which allows
 routing requests based on the key ranges they touch). The fact that
 uninitialized Replicas can be removed has to be taken into account by splits as
@@ -656,13 +661,32 @@ co-located on the same Stores as well as are all initialized.
 
 INVARIANT: An initialized Replica's RangeDescriptor always includes it as a
 member.
 
-INVARIANT: A Replica's ReplicaID is constant.
+INVARIANT: RangeDescriptor.StartKey is immutable (splits and merges mutate the
+EndKey only). In particular, an initialized Replica's StartKey is immutable.
 
-Together, these invariants significantly reduce complexity since we do not need
-to handle the excluded situations. Particularly a changing replicaID is
-bug-inducing since at the Replication layer a change in replica ID is a complete
-change of identity, and re-use of in-memory structures poses the threat of
-erroneously re-using cached information.
+INVARIANT: A Replica's ReplicaID is constant.
+NOTE: the way to read this is that a Replica object will not be re-used for
+multiple ReplicaIDs. Instead, the existing Replica will be destroyed and a new
+Replica instantiated.
+
+These invariants significantly reduce complexity since we do not need to handle
+the excluded situations. In particular, a changing ReplicaID is bug-inducing
+since at the Replication layer a change in replica ID is a complete change of
+identity, and re-use of in-memory structures poses the threat of erroneously
+re-using cached information.
+
+INVARIANT: for each key `k` in the replicated key space, the Generation of the
+RangeDescriptor containing `k` is strictly increasing over time (see
+RangeDescriptor.Generation).
+NOTE: we rely on this invariant for cache coherency in `kvcoord`; it currently
+plays no role in `kvserver`, though we could use it to improve the handling of
+snapshot overlaps; see Store.checkSnapshotOverlapLocked.
+NOTE: a key may be owned by different RangeIDs at different points in time due
+to Range splits and merges.
+
+INVARIANT: on each Store and for each RangeID, the ReplicaID is strictly
+increasing over time (see Replica.setTombstoneKey).
+NOTE: to the best of our knowledge, we don't rely on this invariant.
 */
 type Store struct {
 	Ident *roachpb.StoreIdent // pointer to catch access before Start() is called
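A few minimal Go sketches follow to make the points in the comments above concrete. The helper names (putMetaRecord, isInitialized, canCreateReplica, newerDescriptor) are hypothetical and only assume CockroachDB's pkg/keys, pkg/kv, and pkg/roachpb packages; they are illustrations of the documented behavior, not the kvserver code this patch touches. First, the meta-addressing scheme described for updateRangeAddressing: the meta copy of a descriptor lives at RangeMetaKey(desc.EndKey) and is written with a blind Put, so a newer descriptor can always overwrite a stale meta copy.

package metasketch

import (
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
)

// putMetaRecord is a hypothetical illustration of the addressing scheme: the
// meta copy of a RangeDescriptor is keyed by its EndKey (not StartKey), and a
// blind Put (rather than a ConditionalPut) lets up-replication "repair" a meta
// copy that went stale, e.g. after loss-of-quorum recovery.
func putMetaRecord(b *kv.Batch, desc *roachpb.RangeDescriptor) {
	metaKey := keys.RangeMetaKey(desc.EndKey) // meta records are keyed by EndKey
	b.Put(metaKey.AsRawKey(), desc)           // Put, not CPut: the newer descriptor wins
}

The design point is that correctness does not require the meta copy to be perfectly current at all times; it only has to converge, which a blind Put guarantees as newer descriptors get written.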
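The initialized/uninitialized distinction in the store.go comment reduces to whether the Replica has applied a state containing a RangeDescriptor. A rough sketch, with a hypothetical helper standing in for the real descriptor/Replica methods:

package metasketch

import "github.com/cockroachdb/cockroach/pkg/roachpb"

// isInitialized sketches the distinction: an uninitialized Replica knows only
// its RangeID and ReplicaID; once a Raft snapshot installs a state containing
// a RangeDescriptor, the Replica knows which keyspan it is responsible for and
// is considered initialized.
func isInitialized(desc *roachpb.RangeDescriptor) bool {
	return desc != nil && len(desc.EndKey) != 0
}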
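The "ReplicaID is strictly increasing per Store and RangeID" invariant is enforced by the tombstone mentioned next to Replica.setTombstoneKey. A toy sketch with hypothetical types (kvserver's real tombstone handling is more involved):

package metasketch

import "github.com/cockroachdb/cockroach/pkg/roachpb"

// tombstone stands in for the persisted per-Range tombstone: when a Replica is
// destroyed, the Store records the smallest ReplicaID that may still be
// created for that RangeID.
type tombstone struct {
	nextReplicaID roachpb.ReplicaID
}

// canCreateReplica sketches the check behind the invariant: a Replica object is
// never re-used across ReplicaIDs, and a message addressed to an older
// (destroyed) ReplicaID must not resurrect it.
func canCreateReplica(ts tombstone, id roachpb.ReplicaID) bool {
	return id >= ts.nextReplicaID
}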
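Finally, the Generation invariant is what lets a consumer holding two overlapping descriptors decide which one is newer, which is roughly what kvcoord's cache coherency relies on. A minimal sketch (hypothetical function name):

package metasketch

import "github.com/cockroachdb/cockroach/pkg/roachpb"

// newerDescriptor sketches the use of the invariant that, for any given key,
// RangeDescriptor.Generation is strictly increasing over time: of two
// overlapping descriptors, the one with the higher Generation reflects the
// more recent state of that part of the keyspace.
func newerDescriptor(a, b *roachpb.RangeDescriptor) *roachpb.RangeDescriptor {
	if a.Generation > b.Generation {
		return a
	}
	return b
}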