From 89162cb558006893a56599f4f287c20aec929e04 Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Mon, 15 Nov 2021 12:33:57 +0100 Subject: [PATCH] kvserver: document replica lifecycle Part of the research for #72374, which in turn was inspired by #38322. Release note: None --- .../replica_application_state_machine.go | 45 ++++ pkg/kv/kvserver/store.go | 253 +++++++++++++++++- .../kvserver/store_doc_replica_lifecycle.dot | 40 +++ 3 files changed, 336 insertions(+), 2 deletions(-) create mode 100644 pkg/kv/kvserver/store_doc_replica_lifecycle.dot diff --git a/pkg/kv/kvserver/replica_application_state_machine.go b/pkg/kv/kvserver/replica_application_state_machine.go index 225425da3945..a82fa62cbad8 100644 --- a/pkg/kv/kvserver/replica_application_state_machine.go +++ b/pkg/kv/kvserver/replica_application_state_machine.go @@ -469,6 +469,10 @@ func (b *replicaAppBatch) Stage(cmdI apply.Command) (apply.CheckedCommand, error // Acquire the split or merge lock, if necessary. If a split or merge // command was rejected with a below-Raft forced error then its replicated // result was just cleared and this will be a no-op. + // + // TODO(tbg): can't this happen in splitPreApply which is called from + // b.runPreApplyTriggersAfterStagingWriteBatch and similar for merges? That + // way, it would become less of a one-off. if splitMergeUnlock, err := b.r.maybeAcquireSplitMergeLock(ctx, cmd.raftCmd); err != nil { var err error if cmd.raftCmd.ReplicatedEvalResult.Split != nil { @@ -1259,6 +1263,47 @@ func (sm *replicaStateMachine) maybeApplyConfChange(ctx context.Context, cmd *re return nil } return sm.r.withRaftGroup(true, func(rn *raft.RawNode) (bool, error) { + // NB: `etcd/raft` configuration changes diverge from the official Raft way + // in that a configuration change becomes active when the corresponding log + // entry is applied (rather than appended). This ultimately enables the way + // we do things where the state machine's view of the range descriptor always + // dictates the active replication config but it is much trickier to prove + // correct. See: + // + // https://github.com/etcd-io/etcd/issues/7625#issuecomment-489232411 + // + // INVARIANT: a leader will not append a config change to its logs when it + // hasn't applied all previous config changes in its logs. + // + // INVARIANT: a node will not campaign until it has applied any + // configuration changes with indexes less than or equal to its committed + // index. + // + // INVARIANT: appending a config change to the log (at leader or follower) + // implies that any previous config changes are durably known to be + // committed. That is, a commit index is persisted (and synced) that + // encompasses any earlier config changes before a new config change is + // appended. + // + // Together, these invariants ensure that a follower that is behind by + // multiple configuration changes will be using one of the two most recent + // configuration changes "by the time it matters", which is what is + // required for correctness (configuration changes are sequenced so that + // neighboring configurations are mutually compatible, i.e. don't cause + // split brain). To see this, consider a follower that is behind by + // multiple configuration changes. This is fine unless this follower + // becomes the leader (as it would then make quorum determinations based + // on its active config). 
To become leader, it needs to campaign, and
+ // thanks to the second invariant, it will only do so once it has applied
+ // all the configuration changes in its committed log. If it is to win the
+ // election, it will also have all committed configuration changes in its
+ // log (though not necessarily knowing that they are all committed). But
+ // the third invariant implies that when the follower appended the most
+ // recent configuration change to its log, the one preceding it was
+ // durably marked as committed on the follower. In summary, we now know
+ // that it will apply all the way up to and including the second most
+ // recent configuration change, which is compatible with the most recent
+ // one.
 		rn.ApplyConfChange(cmd.confChange.ConfChangeI)
 		return true, nil
 	})
diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go
index e1a7dd64b968..2d6b17fe188f 100644
--- a/pkg/kv/kvserver/store.go
+++ b/pkg/kv/kvserver/store.go
@@ -411,8 +411,257 @@ func (rs *storeReplicaVisitor) EstimatedCount() int {
 	return len(rs.repls) - rs.visited
 }
 
-// A Store maintains a map of ranges by start key. A Store corresponds
-// to one physical device.
+/*
+A Store maintains a set of Replicas whose data is stored on a storage.Engine,
+usually corresponding to a dedicated storage medium. It also houses a
+collection of subsystems that, in broad terms, perform maintenance of each
+Replica on the Store when required. In particular, this includes various
+queues such as the split, merge, rebalance, GC, and replicaGC queues (to name
+just a few).
+
+INVARIANT: the set of all Ranges (as determined by, e.g. a transactionally
+consistent scan of the meta index ranges) always exactly covers the
+addressable keyspace roachpb.KeyMin (inclusive) to roachpb.KeyMax (exclusive).
+
+Ranges
+
+Each Replica is part of a Range, i.e. corresponds to what other systems would
+call a shard. A Range is a consensus group backed by Raft, i.e. each Replica
+is a state machine backed by a replicated log of commands. In CockroachDB,
+Ranges own a contiguous chunk of addressable keyspace, where the word
+"addressable" is fine print that interested readers can learn about in
+keys.Addr; in short, the Replica data is logically contiguous, but not
+contiguous when viewed as Engine kv pairs.
+
+Considerable complexity is incurred by the features of dynamic re-sharding
+and relocation, i.e. range splits, range merges, and range relocation (also
+called replication changes, or, in the case of lateral movement,
+rebalancing). Each of these interacts heavily with the Range as a consensus
+group (of which each Replica is a member). All of these intricacies are
+described at a high level in this comment.
+
+RangeDescriptor
+
+A roachpb.RangeDescriptor is the configuration of a Range. It is an
+MVCC-backed key-value pair (where the key is derived from the StartKey via
+keys.RangeDescriptorKey and, notably, resides on the Range itself) that is
+accessible to the transactional KV API much like any other key-value pair,
+but is for internal use only (and in particular plays no role at the SQL
+layer).
+
+Splits, Merges, and Rebalances are all carried out as distributed
+transactions. They are all complex in their own right but they share the
+basic approach of transactionally acting on the RangeDescriptor.
Each of +these operations at some point will + +- start a transaction + +- update the RangeDescriptor (for example, to reflect a split, or a change +to the Replicas comprising the members of the Range) + +- update the meta ranges (which form a search index used for request routing) + +- commit with a roachpb.InternalCommitTrigger. + +In particular, note that the RangeDescriptor is the first write issued in the +transaction, and so the transaction record will be created on the affected +range. This allows us to establish a helpful invariant: + +INVARIANT: an intent on keys.RangeDescriptorKey is resolved atomically with +the (application of the) roachpb.EndTxnRequest committing the transaction. + +A Replica's active configuration is dictated by its visible version of the +RangeDescriptor, and the above invariant simplifies this. Without the invariant, +the new RangeDescriptor would come into effect when the intent is resolved, +which is a) later (so requests might get routed to this Replica without it being +ready to handle them appropriately) and b) requires special-casing around +ResolveIntent acting on the RangeDescriptor. + +INVARIANT: A Store never contains two Replicas from the same Range, nor do the +key ranges for any two of its Replicas overlap. (Note that there is no +requirement that these Replicas come from a consistent set of Ranges). + +To illustrate this last invariant, consider a split of a Replica [a-z) into two +Replicas, [a-c) and [c,z). The Store will never contain both; it has to swap +directly from [a-z) to [a,c)+[c,z). For a similar example, consider accepting a +new Replica [b,d) when a Replica [a,c) is already present, which is similarly +not allowed. To understand how such situations could arise, consider that a +series of changes to the RangeDescriptor is observed asynchronously by +followers. This means that a follower may have Replica [a-z) while any number of +splits and replication changes are already known to the leaseholder, and the new +Ranges created in the process may be attempting to add a Replica to the slow +follower. + +With the invariant, we effectively allow looking up a unique (if it exists) +Replica for any given key, and ensure that no two Replicas on a Store operate on +shared keyspace (as seen by the storage.Engine). Refer to the Replica Lifecycle +diagram below for details on how this invariant is upheld. + +Replica Lifecycle + +A Replica should be thought of primarily as a State Machine applying commands +from a replicated log (the log being replicated across the members of the +Range). The Store's RaftTransport receives Raft messages from Replicas residing +on other Stores and routes them to the appropriate Replicas via +Store.HandleRaftRequest (which is part of the RaftMessageHandler interface), +ultimately resulting in a call to Replica.handleRaftReadyRaftMuLocked, which +houses the integration with the etcd/raft library (raft.RawNode). This may +generate Raft messages to be sent to other Stores; these are handed to +Replica.sendRaftMessages which ultimately hands them to the Store's +RaftTransport.SendAsync method. Raft uses message passing (not +request-response), and outgoing messages will use a gRPC stream that differs +from that used for incoming messages (which makes asymmetric partitions more +likely in case of stream-specific problems). 
+The steady state is relatively straightforward, but when Ranges are being
+reconfigured, an understanding of the Replica Lifecycle becomes important and
+upholding the Store's invariants becomes more complex.
+
+A first phenomenon to understand is that of uninitialized Replicas. Such a
+Replica corresponds to a State Machine that has yet to apply any entries, and
+in particular has no notion of an active RangeDescriptor yet. This state
+exists for various reasons:
+
+- A newly added voter already needs to be able to cast a vote even before it
+has had time to be caught up by other nodes. In the extreme case, the very
+first Raft message received at a Store leading to the creation of an
+(uninitialized) Replica may be a request for a vote that is required for
+quorum. In practice we mostly sidestep this voting requirement by adding new
+members through an intermediate state (see roachpb.LEARNER) that allows them
+to catch up and become initialized before making them voters, but at the
+protocol level this still relies on the concept of a Replica without state
+that can act as a Raft peer.
+
+- For practical reasons, we don't preserve the entire committed replicated
+log for eternity. It will periodically get truncated, i.e. a prefix
+discarded, and this leads to followers occasionally being forced to catch up
+via a Raft Snapshot, i.e. a copy of the replicated data in the State Machine
+as of some log position, from which the recipient Replica can then continue
+to receive and apply log entries.
+
+- In CockroachDB, a newly created Range (say at cluster bootstrap, or during
+a Range split) is never empty: it has to contain at least the
+RangeDescriptor. In the split case, it in fact contains roughly half of the
+data originally housed in the pre-split Range, and it is burdensome to
+distribute potentially hundreds of megabytes through the Raft log.
+CockroachDB therefore never uses Raft entries zero through nine (see
+raftInitialLogIndex), which means that a newly added Replica will need a
+first Snapshot to obtain access to the log.
+
+Externally, uninitialized Replicas should be viewed as an implementation
+detail that is (or should be!) invisible to most code that accesses a Store
+while processing requests coming from the KV API, as uninitialized Replicas
+cannot serve such requests.
+
+The diagram below is a lot to take in. The various transitions are discussed
+in prose below, and the source .dot file is in store_doc_replica_lifecycle.dot.
+ + +---------------------+ + +------------------ | Absent | ---------------------------------------------------------------------------------------------------+ + | +---------------------+ | + | | Subsume Crash applySnapshot | + | | Store.Start +---------------+ +---------+ +---------------+ | + | v v | v | v | | + | +-----------------------------------------------------------------------------------------------------------------------+ | + +---------+------------------ | | | + | | | Initialized | | + | | | | | + | +----+------------------ | | -+----+ + | | | +-----------------------------------------------------------------------------------------------------------------------+ | | + | | | | ^ ^ | | | | | + | | | Raft msg | Crash | applySnapshot | post-split | | | | | + | | | v | | | | | | | + | | | +---------------------------------------------------------+ pre-split | | | | | + | | +-----------------> | | <---------------------+--------------+--------------------+----+ | + | | | | | | | | + | | | Uninitialized | Raft msg | | | | + | | | | -----------------+ | | | | + | | | | | | | | | + | | | | <----------------+ | | | | + | | +---------------------------------------------------------+ | | apply removal | | + | | | | | | | | + | | | ReplicaTooOldError | higher ReplicaID | Replica GC | | | + | | v v v | | | + | | Merged (snapshot) +---------------------------------------------------------------------------------------------+ | | | + | +----------------------> | | <+ | | + | | | | | + | apply Merge | | ReplicaTooOld | | + +---------------------------> | Removed | <---------------------+ | + | | | + | | higher ReplicaID | + | | <-------------------------------+ + +---------------------------------------------------------------------------------------------+ + + +When a Store starts, it iterates through all RangeDescriptors it can find on its +Engine. Finding a RangeDescriptor by definition implies that the Replica is +initialized. Raft state (a raftpb.HardState) for uninitialized Replicas may +exist, however it would be ignored until a message arrives addressing that +Replica, or a split trigger applies that instantiates and then initializes it. + +Uninitialized Replicas principally arise when a replication change occurs and a +new member is added to a Range. This new member will be contacted by the Raft +leader, creating an uninitialized Replica on the recipient which will then +negotiate a snapshot to become initialized and to receive the replicated log. A +split can be understood as a special case of that, except that all members of +the Range are created at around the same time (though in practice with arbitrary +delays due to the usual distributed systems reasons), and the uninitialized +Replica creation is triggered by the split trigger executing on the left-hand +side of the split, or a Replica of the right-hand side (already initialized by +the split trigger on another Store) reaching out, whichever occurs first. + +An uninitialized Replica requires a snapshot to become initialized. The case in +which the Replica is the right-hand side of a split can be understood as the +application of a snapshot as well (though this is not reflected in code at the +time of writing) where the left-hand side applies the split by (logically) +moving any data past the split point to the right-hand side Replica, thus +initializing it. 
In principle, since writes to the state machine do not need to
+be made durable, it is conceivable that a split or snapshot could "unapply"
+due to an ill-timed crash (though snapshots currently use SST ingestion,
+which the storage engine performs durably). Similarly, entry application
+(which only occurs on initialized Replicas) is not synced, and so a suffix of
+applied entries may need to be re-applied following a crash.
+
+If an uninitialized Replica receives a Raft message from a peer informing it
+that it is no longer part of a more up-to-date Range configuration (via a
+ReplicaTooOldError) or that it has been removed and re-added under a higher
+ReplicaID, the uninitialized Replica is removed. There is currently no
+general-purpose mechanism to determine whether an uninitialized Replica is
+outdated; an uninitialized Replica could in principle leak "forever" if the
+Range quickly changes its members such that the triggers mentioned here don't
+apply. A full scan of meta2 would be required, as there is no RangeID-keyed
+index on the RangeDescriptors (the meta ranges are keyed on the EndKey, which
+allows routing requests based on the key ranges they touch). The fact that
+uninitialized Replicas can be removed has to be taken into account by splits
+as well; the split trigger may find that the right-hand side uninitialized
+Replica has already been removed, in which case the right half of the split
+has to be discarded (see acquireSplitLock and splitPostApply).
+
+Initialized Replicas represent the common case. They can apply snapshots
+(required if they get cut off from the raft log via log truncation), which
+will always move the applied log position forward; however, this is rare. A
+Replica typically spends most of its life applying log entries and/or serving
+reads. The mechanisms that can lead to removal of uninitialized Replicas
+apply to initialized Replicas in the exact same way, but there are additional
+ways for an initialized Replica to be removed. The most general such
+mechanism is the replica GC queue, which periodically checks the meta2 copy
+of the Range's descriptor (using a consistent read, i.e. reading the latest
+version). If this indicates that the Replica on the local Store should no
+longer exist, the Replica is removed. Additionally, the Replica may directly
+witness a change that indicates a need for a removal. For example, it may
+apply a change to the range descriptor that removes it, or it may be
+destroyed by the application of a merge on its left neighboring Replica,
+which may also occur through a snapshot. Merges are the single most complex
+reconfiguration operation and can only be touched upon here. At their core,
+they will at some point "freeze" the right-hand side Replicas (via
+roachpb.SubsumeRequest) to prevent additional read or write activity, and
+also ensure that the replica sets of the two Ranges to be merged are
+co-located on the same Stores and that all of these Replicas are initialized.
+
+INVARIANT: An initialized Replica's RangeDescriptor always includes it as a
+member.
+
+INVARIANT: A Replica's ReplicaID is constant.
+
+Together, these invariants significantly reduce complexity since we do not
+need to handle the excluded situations. In particular, a changing ReplicaID
+is bug-inducing, since at the replication layer a change in ReplicaID is a
+complete change of identity, and re-use of in-memory structures poses the
+threat of erroneously re-using cached information.
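+
+To make the membership check concrete (this is, in spirit, the comparison the
+replica GC queue performs against the meta2 copy of the descriptor), here is
+a minimal sketch. The types and names below are made up for illustration and
+do not match the real replicaGC queue code.
+
+    // Purely illustrative stand-ins for roachpb.ReplicaDescriptor and
+    // roachpb.RangeDescriptor.
+    type replicaDesc struct{ storeID int32 }
+    type rangeDesc struct{ replicas []replicaDesc }
+
+    // shouldGC reports whether the local Store's Replica should be removed,
+    // given the authoritative (consistently read) descriptor from meta2.
+    func shouldGC(authoritative rangeDesc, localStoreID int32) bool {
+        for _, rd := range authoritative.replicas {
+            if rd.storeID == localStoreID {
+                return false // still a member, keep the Replica
+            }
+        }
+        return true // no longer a member per meta2, remove the Replica
+    }
+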
+*/ type Store struct { Ident *roachpb.StoreIdent // pointer to catch access before Start() is called cfg StoreConfig diff --git a/pkg/kv/kvserver/store_doc_replica_lifecycle.dot b/pkg/kv/kvserver/store_doc_replica_lifecycle.dot new file mode 100644 index 000000000000..27644dff6069 --- /dev/null +++ b/pkg/kv/kvserver/store_doc_replica_lifecycle.dot @@ -0,0 +1,40 @@ +digraph finite_state_machine { + +/* +This is the source file for the diagram on the `type Store` +comment. + +Generated via: + +docker run -i tsub/graph-easy --from=dot --as=ascii < \ + store_doc_replica_lifecycle.dot +*/ + +Absent -> Uninitialized [label = "Raft msg"]; +Absent -> Initialized [label = "Store.Start"]; + +Uninitialized -> Uninitialized [label = "Raft msg"]; + +Uninitialized -> Initialized [label = "applySnapshot"]; +Initialized -> Initialized [label = "applySnapshot"]; + +Absent -> Uninitialized [label = "pre-split "]; +Uninitialized -> Initialized [label = "post-split"]; + +Initialized -> Initialized [label = "Subsume"]; + +Initialized -> Initialized [label = "Crash"]; +Initialized -> Uninitialized [label = "Crash"]; + +Initialized -> Removed [label = "Replica GC"]; +Initialized -> Removed [label = "apply removal"]; +Initialized -> Removed [label = "ReplicaTooOld"]; +Initialized -> Removed [label = "Merged (snapshot)"]; +Initialized -> Removed [label = "apply Merge"]; +Initialized -> Removed [label = "higher ReplicaID"]; + +Uninitialized -> Removed [label = "ReplicaTooOldError"]; +Uninitialized -> Removed [label = "higher ReplicaID"]; + +} +
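The diagram in store_doc_replica_lifecycle.dot can also be read as a flat
transition table. The following sketch is illustrative only: the state and
event names simply mirror the diagram's labels and do not correspond to real
kvserver types; it enumerates the same edges and may serve as a compact
reference while reading the prose above.

package main

import "fmt"

// Illustrative only: a plain-Go rendering of the edges in
// store_doc_replica_lifecycle.dot.
type transition struct {
	from, event, to string
}

var lifecycle = []transition{
	{"Absent", "Raft msg", "Uninitialized"},
	{"Absent", "Store.Start", "Initialized"},
	{"Absent", "pre-split", "Uninitialized"},
	{"Uninitialized", "Raft msg", "Uninitialized"},
	{"Uninitialized", "applySnapshot", "Initialized"},
	{"Uninitialized", "post-split", "Initialized"},
	{"Uninitialized", "ReplicaTooOldError", "Removed"},
	{"Uninitialized", "higher ReplicaID", "Removed"},
	{"Initialized", "applySnapshot", "Initialized"},
	{"Initialized", "Subsume", "Initialized"},
	{"Initialized", "Crash", "Initialized"},
	{"Initialized", "Crash", "Uninitialized"},
	{"Initialized", "Replica GC", "Removed"},
	{"Initialized", "apply removal", "Removed"},
	{"Initialized", "ReplicaTooOld", "Removed"},
	{"Initialized", "Merged (snapshot)", "Removed"},
	{"Initialized", "apply Merge", "Removed"},
	{"Initialized", "higher ReplicaID", "Removed"},
}

func main() {
	// Print each lifecycle edge in "from --event--> to" form.
	for _, t := range lifecycle {
		fmt.Printf("%s --%s--> %s\n", t.from, t.event, t.to)
	}
}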