From 3c6e0c5c104849a26e0d290277fc0c43d5cc4f68 Mon Sep 17 00:00:00 2001
From: irfan sharif
Date: Tue, 3 Nov 2020 18:25:22 -0500
Subject: [PATCH] [dnm,wip] server: reproduce double-allocation of store IDs

We seem to be doing a few unsound things with how we allocate store IDs
(courtesy of yours truly). The changes made to a recently added test
stressing our multi-store behaviour demonstrate the bug.

    --- FAIL: TestAddNewStoresToExistingNodes (38.44s)
        multi_store_test.go:155: expected the 6th store to have storeID s6, found s5

Release note: None
---
 pkg/server/BUILD.bazel         |  1 +
 pkg/server/multi_store_test.go | 43 ++++++++++++++++++++++++---------
 pkg/server/node.go             | 44 +++++++++++++++++++++++++++++-----
 3 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/pkg/server/BUILD.bazel b/pkg/server/BUILD.bazel
index be51b504b44d..ce40bafcce27 100644
--- a/pkg/server/BUILD.bazel
+++ b/pkg/server/BUILD.bazel
@@ -307,6 +307,7 @@ go_test(
         "//pkg/util/timeutil",
         "//pkg/util/uuid",
        "//vendor/github.com/cockroachdb/errors",
+        "//vendor/github.com/dustin/go-humanize",
         "//vendor/github.com/gogo/protobuf/jsonpb",
         "//vendor/github.com/gogo/protobuf/proto",
         "//vendor/github.com/grpc-ecosystem/grpc-gateway/runtime",
diff --git a/pkg/server/multi_store_test.go b/pkg/server/multi_store_test.go
index 6fbaa4a82276..cbaecbde56d4 100644
--- a/pkg/server/multi_store_test.go
+++ b/pkg/server/multi_store_test.go
@@ -12,16 +12,19 @@ package server_test
 
 import (
 	"context"
+	"sort"
 	"testing"
 
 	"github.com/cockroachdb/cockroach/pkg/base"
 	"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
+	"github.com/cockroachdb/cockroach/pkg/roachpb"
 	"github.com/cockroachdb/cockroach/pkg/testutils"
 	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
 	"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
 	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
 	"github.com/cockroachdb/cockroach/pkg/util/log"
 	"github.com/cockroachdb/errors"
+	"github.com/dustin/go-humanize"
 	"github.com/stretchr/testify/require"
 )
 
@@ -71,7 +74,7 @@ func TestAddNewStoresToExistingNodes(t *testing.T) {
 	clusterID := tc.Server(0).ClusterID()
 	tc.Stopper().Stop(ctx)
 
-	// Add an additional store to each node.
+	// Add two additional stores to each node.
 	n1s2, n1cleanup2 := testutils.TempDir(t)
 	defer n1cleanup2()
 	n2s2, n2cleanup2 := testutils.TempDir(t)
@@ -79,6 +82,13 @@ func TestAddNewStoresToExistingNodes(t *testing.T) {
 	n3s2, n3cleanup2 := testutils.TempDir(t)
 	defer n3cleanup2()
 
+	n1s3, n1cleanup3 := testutils.TempDir(t)
+	defer n1cleanup3()
+	n2s3, n2cleanup3 := testutils.TempDir(t)
+	defer n2cleanup3()
+	n3s3, n3cleanup3 := testutils.TempDir(t)
+	defer n3cleanup3()
+
 	tcArgs = base.TestClusterArgs{
 		// We need ParallelStart since this is an existing cluster. If
 		// we started sequentially, then the first node would hang forever
@@ -89,17 +99,17 @@ func TestAddNewStoresToExistingNodes(t *testing.T) {
 		ServerArgsPerNode: map[int]base.TestServerArgs{
 			0: {
 				StoreSpecs: []base.StoreSpec{
-					{Path: n1s1}, {Path: n1s2},
+					{Path: n1s1}, {Path: n1s2}, {Path: n1s3},
 				},
 			},
 			1: {
 				StoreSpecs: []base.StoreSpec{
-					{Path: n2s1}, {Path: n2s2},
+					{Path: n2s1}, {Path: n2s2}, {Path: n2s3},
 				},
 			},
 			2: {
 				StoreSpecs: []base.StoreSpec{
-					{Path: n3s1}, {Path: n3s2},
+					{Path: n3s1}, {Path: n3s2}, {Path: n3s3},
 				},
 			},
 		},
@@ -115,23 +125,34 @@ func TestAddNewStoresToExistingNodes(t *testing.T) {
 		require.Equal(t, clusterID, srv.ClusterID())
 	}
 
-	// Ensure all nodes have 2 stores available.
+	// Ensure all nodes have all stores available, and each store has a unique
+	// store ID.
 	testutils.SucceedsSoon(t, func() error {
+		var storeIDs []roachpb.StoreID
 		for _, server := range tc.Servers {
 			var storeCount = 0
-
-			err := server.GetStores().(*kvserver.Stores).VisitStores(
+			if err := server.GetStores().(*kvserver.Stores).VisitStores(
 				func(s *kvserver.Store) error {
 					storeCount++
+					storeIDs = append(storeIDs, s.StoreID())
 					return nil
 				},
-			)
-			if err != nil {
+			); err != nil {
 				return errors.Errorf("failed to visit all nodes, got %v", err)
 			}
 
-			if storeCount != 2 {
-				return errors.Errorf("expected two stores to be available on node %v, got %v stores instead", server.NodeID(), storeCount)
+			if storeCount != 3 {
+				return errors.Errorf("expected 3 stores to be available on n%s, got %d stores instead", server.NodeID(), storeCount)
+			}
+		}
+
+		sort.Slice(storeIDs, func(i, j int) bool {
+			return storeIDs[i] < storeIDs[j]
+		})
+		for i := range storeIDs {
+			expStoreID := roachpb.StoreID(i + 1)
+			if storeIDs[i] != expStoreID {
+				t.Fatalf("expected the %s store to have storeID s%s, found s%s", humanize.Ordinal(i+1), expStoreID, storeIDs[i])
 			}
 		}
 
diff --git a/pkg/server/node.go b/pkg/server/node.go
index 16c59b7c2cf3..af8ae7900dc4 100644
--- a/pkg/server/node.go
+++ b/pkg/server/node.go
@@ -423,11 +423,7 @@ func (n *Node) start(
 		if err := s.Start(ctx, n.stopper); err != nil {
 			return errors.Errorf("failed to start store: %s", err)
 		}
-		capacity, err := s.Capacity(ctx, false /* useCached */)
-		if err != nil {
-			return errors.Errorf("could not query store capacity: %s", err)
-		}
-		log.Infof(ctx, "initialized store %s: %+v", s, capacity)
+		log.Infof(ctx, "initialized store s%s", s.StoreID())
 		n.addStore(ctx, s)
 	}
 
@@ -595,6 +591,11 @@ func (n *Node) bootstrapStores(
 	// each and invoking storage.Bootstrap() to persist it and the cluster
 	// version and to create stores. The -1 comes from the fact that our
 	// first store ID has already been pre-allocated for us.
+	//
+	// TODO(irfansharif): Is this sound? If we're restarting an already
+	// bootstrapped node (but now with additional stores), we haven't
+	// pre-allocated a store ID for any of the additional stores. Also see
+	// the TODO below; the usage of firstStoreID is a bit confused.
 	storeIDAlloc := int64(len(emptyEngines)) - 1
 	if firstStoreID == 0 {
 		// We lied, we don't have a firstStoreID; we'll need to allocate for
@@ -606,11 +607,41 @@ func (n *Node) bootstrapStores(
 	}
 	startID, err := allocateStoreIDs(ctx, n.Descriptor.NodeID, storeIDAlloc, n.storeCfg.DB)
 	if firstStoreID == 0 {
+		// TODO(irfansharif): Our usage of firstStoreID is pretty confused,
+		// but it just so happens to "work". firstStoreID, as threaded in
+		// from the caller, is non-zero in two cases:
+		// - When we're starting up a node that was just bootstrapped
+		//   (firstStoreID == 1).
+		// - When we're joining an existing cluster (firstStoreID provided
+		//   to us via the join RPC).
+		//
+		// When we're restarting an already bootstrapped node, firstStoreID
+		// is zero. If we happen to be restarting with additional stores,
+		// we're starting the ID counter at what KV just told us about,
+		// which is probably what we want.
+		//
+		// Generally we're trying to do a few things, and our code structure
+		// doesn't make that obvious:
+		// - We're bootstrapping our first store after being handed a store
+		//   ID via the join RPC.
+		// - We're not bootstrapping our first store here when we're the
+		//   node being bootstrapped; that already happens in
+		//   `bootstrapCluster`.
+		// - We're bootstrapping stores other than our first, after joining
+		//   an existing cluster via the join RPC.
+		// - We're bootstrapping additional stores after having our server
+		//   bootstrapped by the operator for the very first time.
+		// - We're bootstrapping additional stores after being restarted
+		//   with new stores (we were previously an already bootstrapped
+		//   node).
 		firstStoreID = startID
 	}
 	if err != nil {
 		return errors.Errorf("error allocating store ids: %s", err)
 	}
+
+	log.Infof(ctx, "xxx: store-id-alloc=%d, first-store-id=%d, start-id=%d", storeIDAlloc, firstStoreID, startID)
+
 	sIdent := roachpb.StoreIdent{
 		ClusterID: n.clusterID.Get(),
 		NodeID:    n.Descriptor.NodeID,
@@ -625,8 +656,9 @@ func (n *Node) bootstrapStores(
 		if err := s.Start(ctx, stopper); err != nil {
 			return err
 		}
+		log.Infof(ctx, "initialized store s%s", s.StoreID())
+
 		n.addStore(ctx, s)
-		log.Infof(ctx, "bootstrapped store %s", s)
 		// Done regularly in Node.startGossiping, but this cuts down the time
 		// until this store is used for range allocations.
 		if err := s.GossipStore(ctx, false /* useCached */); err != nil {