server: always create a liveness record before starting up

Previously it used to be the case that it was possible for a node to be up and running, and for there to be no corresponding liveness record for it. This was a very transient situation as liveness records are created for a given node as soon as it out its first heartbeat. Still, given that this could take a few seconds, it lent to a lot of complexity in our handling of node liveness where we had to always anticipate the possibility of there being no corresponding liveness record for a given node (and thus creating it if necessary). Having a liveness record for each node always present is a crucial building block for long running migrations (#48843). There the intention is to have the orchestrator process look towards the list of liveness records for an authoritative view of cluster membership. Previously when it was possible for an active member of the cluster to not have a corresponding liveness record (no matter how unlikely or short-lived in practice), we could not generate such a view. --- This is an alternative implementation for #53805. Here we choose to manually write the liveness record for the bootstrapping node when writing initial cluster data. For all other nodes, we do it on the server-side of the join RPC. We're also careful to do it in the legacy codepath when joining a cluster through gossip. Release note: None
cockroachdb · Sep 10, 2020 · 64c3b57 · 64c3b57
1 parent 0c89e7b
commit 64c3b57
Show file tree

Hide file tree

Showing 10 changed files with 197 additions and 51 deletions.
diff --git a/pkg/cmd/roachtest/decommission.go b/pkg/cmd/roachtest/decommission.go
@@ -309,21 +309,6 @@ func runDecommissionRandomized(ctx context.Context, t *test, c *cluster) {
 		Multiplier:     2,
 	}
 
-	// This is a pretty gross hack to let the bootstrap info (cluster ID,
-	// liveness records) disseminate through the cluster. Since it's no longer
-	// happening through gossip, it takes a bit longer to happen. We should do
-	// two things to improve our story here:
-	//
-	// - We should opportunistically write to the liveness table when adding a
-	//   node through the Join RPC. This would also simplify the handling of
-	//   empty liveness records (they would no longer exist).
-	// - We should add roachtest helpers that wait until each node has received
-	//   cluster ID information, and use it in all the tests that need it (which
-	//   may very well be all the tests).
-	//
-	// TODO(irfansharif): Do the above.
-	time.Sleep(30 * time.Second)
-
 	// Partially decommission then recommission a random node, from another
 	// random node. Run a couple of status checks while doing so.
 	{

diff --git a/pkg/kv/kvserver/client_test.go b/pkg/kv/kvserver/client_test.go
@@ -1009,6 +1009,15 @@ func (m *multiTestContext) addStore(idx int) {
 	}{
 		ch: make(chan struct{}),
 	}
+	if idx != 0 {
+		// Given multiTestContext does not make use of the join RPC, we have to
+		// manually write out liveness records for each node to maintain the
+		// invariant that all nodes have liveness records present before they
+		// start heartbeating.
+		if err := m.nodeLivenesses[idx].CreateLivenessRecord(ctx, nodeID); err != nil {
+			m.t.Fatal(err)
+		}
+	}
 	m.nodeLivenesses[idx].StartHeartbeat(ctx, stopper, m.engines[idx:idx+1], func(ctx context.Context) {
 		now := clock.Now()
 		if err := store.WriteLastUpTimestamp(ctx, now); err != nil {

diff --git a/pkg/kv/kvserver/node_liveness.go b/pkg/kv/kvserver/node_liveness.go
@@ -450,6 +450,56 @@ type livenessUpdate struct {
 	oldRaw []byte
 }
 
+// CreateLivenessRecord creates a liveness record for the node specified by the
+// given node ID. This is typically used when adding a new node to a running
+// cluster, or when bootstrapping a cluster through a given node.
+//
+// This is a pared down version of StartHeartbeat; it exists only to durably
+// persist a liveness to record the node's existence. Nodes will heartbeat their
+// records after starting up, and incrementing to epoch=1 when doing so, at
+// which point we'll set an appropriate expiration timestamp, gossip the
+// liveness record, and update our in-memory representation of it.
+//
+// NB: An existing liveness record is not overwritten by this method, we return
+// an error instead.
+func (nl *NodeLiveness) CreateLivenessRecord(ctx context.Context, nodeID roachpb.NodeID) error {
+	// We start off at epoch=0, entrusting the initial heartbeat to increment it
+	// to epoch=1 to signal the very first time the node is up and running.
+	liveness := kvserverpb.Liveness{NodeID: nodeID, Epoch: 0}
+
+	// We skip adding an expiration, we only really care about the liveness
+	// record existing within KV.
+
+	v := new(roachpb.Value)
+	if err := nl.db.Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
+		b := txn.NewBatch()
+		key := keys.NodeLivenessKey(nodeID)
+		if err := v.SetProto(&liveness); err != nil {
+			log.Fatalf(ctx, "failed to marshall proto: %s", err)
+		}
+		// Given we're looking to create a new liveness record here, we don't
+		// expect to find anything.
+		b.CPut(key, v, nil)
+
+		// We don't bother adding a gossip trigger, that'll happen with the
+		// first heartbeat. We still keep it as a 1PC commit to avoid leaving
+		// write intents.
+		b.AddRawRequest(&roachpb.EndTxnRequest{
+			Commit:     true,
+			Require1PC: true,
+		})
+		return txn.Run(ctx, b)
+	}); err != nil {
+		return err
+	}
+
+	// We'll learn about this liveness record through gossip eventually, so we
+	// don't bother updating our in-memory view of node liveness.
+
+	log.Infof(ctx, "created liveness record for n%d", nodeID)
+	return nil
+}
+
 func (nl *NodeLiveness) setMembershipStatusInternal(
 	ctx context.Context,
 	nodeID roachpb.NodeID,
@@ -461,16 +511,10 @@ func (nl *NodeLiveness) setMembershipStatusInternal(
 	if oldLivenessRec.Liveness == (kvserverpb.Liveness{}) {
 		// Liveness record didn't previously exist, so we create one.
 		//
-		// TODO(irfansharif): This code feels a bit unwieldy because it's
-		// possible for a liveness record to not exist previously. It is just
-		// generally difficult to write it at startup. When a node joins the
-		// cluster, this completes before it has had a chance to write its
-		// liveness record. If it gets decommissioned immediately, there won't
-		// be one yet. The Connect RPC can solve this though, I think? We can
-		// bootstrap clusters with a liveness record for n1. Any other node at
-		// some point has to join the cluster for the first time via the Connect
-		// RPC, which as part of its job can make sure the liveness record
-		// exists before responding to the new node.
+		// TODO(irfansharif): The above is now no longer possible. We always
+		// create one (see CreateLivenessRecord, WriteInitialClusterData) when
+		// adding a node to the cluster. We should clean up all this logic that
+		// tries to work around the liveness record possibly not existing.
 		newLiveness = kvserverpb.Liveness{
 			NodeID: nodeID,
 			Epoch:  1,
@@ -587,11 +631,11 @@ func (nl *NodeLiveness) StartHeartbeat(
 				func(ctx context.Context) error {
 					// Retry heartbeat in the event the conditional put fails.
 					for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
-						liveness, err := nl.Self()
+						oldLiveness, err := nl.Self()
 						if err != nil && !errors.Is(err, ErrNoLivenessRecord) {
 							log.Errorf(ctx, "unexpected error getting liveness: %+v", err)
 						}
-						if err := nl.heartbeatInternal(ctx, liveness, incrementEpoch); err != nil {
+						if err := nl.heartbeatInternal(ctx, oldLiveness, incrementEpoch); err != nil {
 							if errors.Is(err, ErrEpochIncremented) {
 								log.Infof(ctx, "%s; retrying", err)
 								continue
@@ -737,7 +781,7 @@ func (nl *NodeLiveness) heartbeatInternal(
 
 	// If we are not intending to increment the node's liveness epoch, detect
 	// whether this heartbeat is needed anymore. It is possible that we queued
-	// for long enough on the sempahore such that other heartbeat attempts ahead
+	// for long enough on the semaphore such that other heartbeat attempts ahead
 	// of us already incremented the expiration past what we wanted. Note that
 	// if we allowed the heartbeat to proceed in this case, we know that it
 	// would hit a ConditionFailedError and return a errNodeAlreadyLive down
@@ -749,20 +793,39 @@ func (nl *NodeLiveness) heartbeatInternal(
 		}
 	}
 
-	// Let's compute what our new liveness record should be.
-	var newLiveness kvserverpb.Liveness
 	if oldLiveness == (kvserverpb.Liveness{}) {
-		// Liveness record didn't previously exist, so we create one.
-		newLiveness = kvserverpb.Liveness{
-			NodeID: nodeID,
-			Epoch:  1,
+		// We don't yet know about our own liveness record (which does exist, we
+		// maintain the invariant that there's always a liveness record for
+		// every given node). Let's retrieve it from KV before proceeding.
+		//
+		// If we didn't previously know about our liveness record, it indicates
+		// that we're heartbeating for the very first time.
+		kv, err := nl.db.Get(ctx, keys.NodeLivenessKey(nodeID))
+		if err != nil {
+			return errors.Wrap(err, "unable to get liveness")
 		}
-	} else {
-		newLiveness = oldLiveness
-		if incrementEpoch {
-			newLiveness.Epoch++
-			newLiveness.Draining = false // Clear draining field.
+		if kv.Value == nil {
+			return ErrNoLivenessRecord
+		}
+		if err := kv.Value.GetProto(&oldLiveness); err != nil {
+			return errors.Wrap(err, "invalid liveness record")
 		}
+
+		oldLivenessRec := LivenessRecord{
+			Liveness: oldLiveness,
+			raw:      kv.Value.TagAndDataBytes(),
+		}
+
+		// Offer it to make sure that when we actually try to update the
+		// liveness, the previous view is correct.
+		nl.maybeUpdate(oldLivenessRec)
+	}
+
+	// Let's compute what our new liveness record should be.
+	newLiveness := oldLiveness
+	if incrementEpoch {
+		newLiveness.Epoch++
+		newLiveness.Draining = false // Clear draining field.
 	}
 
 	// Grab a new clock reading to compute the new expiration time,

diff --git a/pkg/kv/kvserver/node_liveness_test.go b/pkg/kv/kvserver/node_liveness_test.go
@@ -159,6 +159,49 @@ func TestNodeLivenessInitialIncrement(t *testing.T) {
 	verifyEpochIncremented(t, mtc, 0)
 }
 
+// TestNodeLivenessAppearsAtStart tests that liveness records are written right
+// when nodes are added to the cluster (during bootstrap, and when connecting to
+// a bootstrapped node). The test verifies that the liveness records found are
+// what we expect them to be.
+func TestNodeLivenessAppearsAtStart(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	defer log.Scope(t).Close(t)
+
+	ctx := context.Background()
+	tc := testcluster.StartTestCluster(t, 3, base.TestClusterArgs{})
+	defer tc.Stopper().Stop(ctx)
+
+	// At this point StartTestCluster has waited for all nodes to become live.
+
+	// Verify liveness records exist for all nodes.
+	for i := 0; i < tc.NumServers(); i++ {
+		nodeID := tc.Server(i).NodeID()
+		nl := tc.Server(i).NodeLiveness().(*kvserver.NodeLiveness)
+
+		if live, err := nl.IsLive(nodeID); err != nil {
+			t.Fatal(err)
+		} else if !live {
+			t.Fatalf("node %d not live", nodeID)
+		}
+
+		livenessRec, err := nl.GetLiveness(nodeID)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if livenessRec.NodeID != nodeID {
+			t.Fatalf("expected node ID %d, got %d", nodeID, livenessRec.NodeID)
+		}
+		// We expect epoch=1 as nodes first create a liveness record at epoch=0,
+		// and then increment it during their first heartbeat.
+		if livenessRec.Epoch != 1 {
+			t.Fatalf("expected epoch=1, got epoch=%d", livenessRec.Epoch)
+		}
+		if !livenessRec.Membership.Active() {
+			t.Fatalf("expected membership=active, got membership=%s", livenessRec.Membership)
+		}
+	}
+}
+
 func verifyEpochIncremented(t *testing.T, mtc *multiTestContext, nodeIdx int) {
 	testutils.SucceedsSoon(t, func() error {
 		liveness, err := mtc.nodeLivenesses[nodeIdx].GetLiveness(mtc.gossips[nodeIdx].NodeID.Get())

diff --git a/pkg/kv/kvserver/store_bootstrap.go b/pkg/kv/kvserver/store_bootstrap.go
@@ -14,6 +14,7 @@ import (
 	"context"
 
 	"github.com/cockroachdb/cockroach/pkg/keys"
+	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
 	"github.com/cockroachdb/cockroach/pkg/roachpb"
@@ -96,16 +97,34 @@ func WriteInitialClusterData(
 		roachpb.KeyValue{Key: keys.BootstrapVersionKey, Value: bootstrapVal})
 
 	// Initialize various sequence generators.
-	var nodeIDVal, storeIDVal, rangeIDVal roachpb.Value
-	nodeIDVal.SetInt(1) // This node has id 1.
+	var nodeIDVal, storeIDVal, rangeIDVal, livenessVal roachpb.Value
+
+	const firstNodeID = 1 // This node has id 1.
+	nodeIDVal.SetInt(firstNodeID)
 	// The caller will initialize the stores with ids 1..numStores.
 	storeIDVal.SetInt(int64(numStores))
 	// The last range has id = len(splits) + 1
 	rangeIDVal.SetInt(int64(len(splits) + 1))
+
+	// We're the the first node in the cluster, let's seed our liveness record.
+	// It's crucial that we do to maintain the invariant that there's always a
+	// liveness record for a given node. We'll do something similar through the
+	// join RPC when adding new nodes to an already bootstrapped cluster [1].
+	//
+	// We start off at epoch=0; when nodes heartbeat their liveness records for
+	// the first time it'll get incremented to epoch=1 [2].
+	//
+	// [1]: See `CreateLivenessRecord` and usages for where that happens.
+	// [2]: See `StartHeartbeat` for where that happens.
+	livenessRecord := kvserverpb.Liveness{NodeID: 1, Epoch: 0}
+	if err := livenessVal.SetProto(&livenessRecord); err != nil {
+		return err
+	}
 	initialValues = append(initialValues,
 		roachpb.KeyValue{Key: keys.NodeIDGenerator, Value: nodeIDVal},
 		roachpb.KeyValue{Key: keys.StoreIDGenerator, Value: storeIDVal},
-		roachpb.KeyValue{Key: keys.RangeIDGenerator, Value: rangeIDVal})
+		roachpb.KeyValue{Key: keys.RangeIDGenerator, Value: rangeIDVal},
+		roachpb.KeyValue{Key: keys.NodeLivenessKey(firstNodeID), Value: livenessVal})
 
 	// firstRangeMS is going to accumulate the stats for the first range, as we
 	// write the meta records for all the other ranges.

diff --git a/pkg/server/node.go b/pkg/server/node.go
@@ -377,6 +377,12 @@ func (n *Node) start(
 		log.Infof(ctxWithSpan, "new node allocated ID %d", newID)
 		span.Finish()
 		nodeID = newID
+
+		// We're joining via gossip, so we don't have a liveness record for
+		// ourselves yet. Let's create one while here.
+		if err := n.storeCfg.NodeLiveness.CreateLivenessRecord(ctx, nodeID); err != nil {
+			return err
+		}
 	}
 
 	// Inform the RPC context of the node ID.
@@ -1128,10 +1134,6 @@ func (n *Node) GossipSubscription(
 // Join implements the roachpb.InternalServer service. This is the
 // "connectivity" API; individual CRDB servers are passed in a --join list and
 // the join targets are addressed through this API.
-//
-// TODO(irfansharif): Perhaps we could opportunistically create a liveness
-// record here so as to no longer have to worry about the liveness record not
-// existing for a given node.
 func (n *Node) Join(
 	ctx context.Context, req *roachpb.JoinNodeRequest,
 ) (*roachpb.JoinNodeResponse, error) {
@@ -1153,6 +1155,18 @@ func (n *Node) Join(
 		return nil, err
 	}
 
+	// We create a liveness record here for the joining node while here. We do
+	// so to maintain the invariant that there's always a liveness record for a
+	// given node. See `WriteInitialClusterData` for the other codepath where we
+	// manually create a liveness record to maintain this same invariant.
+	//
+	// NB: This invariant will be required for when we introduce long running
+	// migrations. See https://github.com/cockroachdb/cockroach/pull/48843 for
+	// details.
+	if err := n.storeCfg.NodeLiveness.CreateLivenessRecord(ctx, nodeID); err != nil {
+		return nil, err
+	}
+
 	log.Infof(ctx, "allocated IDs: n%d, s%d", nodeID, storeID)
 
 	return &roachpb.JoinNodeResponse{

diff --git a/pkg/server/node_test.go b/pkg/server/node_test.go
@@ -77,13 +77,15 @@ func TestBootstrapCluster(t *testing.T) {
 	for _, kv := range res.KVs {
 		foundKeys = append(foundKeys, kv.Key)
 	}
+	const firstNodeID = 1
 	var expectedKeys = keySlice{
 		testutils.MakeKey(roachpb.Key("\x02"), roachpb.KeyMax),
 		testutils.MakeKey(roachpb.Key("\x03"), roachpb.KeyMax),
 		roachpb.Key("\x04bootstrap-version"),
 		roachpb.Key("\x04node-idgen"),
 		roachpb.Key("\x04range-idgen"),
 		roachpb.Key("\x04store-idgen"),
+		keys.NodeLivenessKey(firstNodeID),
 	}
 	for _, splitKey := range config.StaticSplits() {
 		meta2Key := keys.RangeMetaKey(splitKey)

diff --git a/pkg/server/server.go b/pkg/server/server.go
@@ -1400,11 +1400,9 @@ func (s *Server) Start(ctx context.Context) error {
 	// one, make sure it's the clusterID we already know (and are guaranteed to
 	// know) at this point. If it's not the same, explode.
 	//
-	// TODO(tbg): remove this when we have changed ServeAndWait() to join an
-	// existing cluster via a one-off RPC, at which point we can create gossip
-	// (and thus the RPC layer) only after the clusterID is already known. We
-	// can then rely on the RPC layer's protection against cross-cluster
-	// communication.
+	// TODO(irfansharif): The above is no longer applicable; in 21.1 we can
+	// always assume that the RPC layer will always get set up after having
+	// found out what the cluster ID is. The checks below can be removed then.
 	{
 		// We populated this above, so it should still be set. This is just to
 		// demonstrate that we're not doing anything functional here (and to

diff --git a/pkg/server/testserver.go b/pkg/server/testserver.go
@@ -342,6 +342,15 @@ func (ts *TestServer) MigrationManager() interface{} {
 	return nil
 }
 
+// NodeLiveness exposes the NodeLiveness instance used by the TestServer as an
+// interface{}.
+func (ts *TestServer) NodeLiveness() interface{} {
+	if ts != nil {
+		return ts.nodeLiveness
+	}
+	return nil
+}
+
 // RPCContext returns the rpc context used by the TestServer.
 func (ts *TestServer) RPCContext() *rpc.Context {
 	if ts != nil {

diff --git a/pkg/testutils/serverutils/test_server_shim.go b/pkg/testutils/serverutils/test_server_shim.go
@@ -110,6 +110,10 @@ type TestServerInterface interface {
 	// MigrationManager returns the *jobs.Registry as an interface{}.
 	MigrationManager() interface{}
 
+	// NodeLiveness exposes the NodeLiveness instance used by the TestServer as an
+	// interface{}.
+	NodeLiveness() interface{}
+
 	// SetDistSQLSpanResolver changes the SpanResolver used for DistSQL inside the
 	// server's executor. The argument must be a physicalplan.SpanResolver
 	// instance.