[dnm,wip] server: reproduce double-allocation of store IDs
We seem to be doing a few unsound things with how we allocate store IDs
(courtesy of yours truly). The changes made to a recently added test
stressing our multi-store behaviour demonstrate the bug.

    --- FAIL: TestAddNewStoresToExistingNodes (38.44s)
        multi_store_test.go:155: expected the 6th store to have storeID s6, found s5
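
For intuition, here's a toy sketch of one mechanism consistent with the
failure above (hypothetical and simplified; the real allocator goes through
KV): if each restarting node reserves one ID fewer than its number of new
stores, as if its first ID had been pre-allocated, but then assigns one ID
per new store anyway, adjacent nodes' blocks overlap.

    package main

    import (
        "fmt"
        "sort"
    )

    // counter stands in for the KV-backed store ID generator: reserving n
    // IDs bumps it by n and returns the first ID of the reserved block.
    type counter struct{ last int }

    func (c *counter) reserve(n int) (first int) {
        c.last += n
        return c.last - n + 1
    }

    func main() {
        c := &counter{last: 3} // s1-s3 already exist, one per node
        ids := []int{1, 2, 3}

        // Three already-bootstrapped nodes restart, each adding two new
        // stores. Hypothetical buggy accounting: under-reserve by one.
        for node := 0; node < 3; node++ {
            newStores := 2
            start := c.reserve(newStores - 1) // reserves 1 ID, not 2
            for i := 0; i < newStores; i++ {
                ids = append(ids, start+i)
            }
        }

        sort.Ints(ids)
        fmt.Println(ids) // [1 2 3 4 5 5 6 6 7]: the 6th store is s5, not s6
    }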

Release note: None
irfansharif committed Nov 4, 2020
1 parent e533f19 commit 3c6e0c5
Showing 3 changed files with 71 additions and 17 deletions.
1 change: 1 addition & 0 deletions pkg/server/BUILD.bazel
@@ -307,6 +307,7 @@ go_test(
"//pkg/util/timeutil",
"//pkg/util/uuid",
"//vendor/github.com/cockroachdb/errors",
"//vendor/github.com/dustin/go-humanize",
"//vendor/github.com/gogo/protobuf/jsonpb",
"//vendor/github.com/gogo/protobuf/proto",
"//vendor/github.com/grpc-ecosystem/grpc-gateway/runtime",
43 changes: 32 additions & 11 deletions pkg/server/multi_store_test.go
@@ -12,16 +12,19 @@ package server_test

import (
"context"
"sort"
"testing"

"github.com/cockroachdb/cockroach/pkg/base"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
"github.com/dustin/go-humanize"
"github.com/stretchr/testify/require"
)

@@ -71,14 +74,21 @@ func TestAddNewStoresToExistingNodes(t *testing.T) {
clusterID := tc.Server(0).ClusterID()
tc.Stopper().Stop(ctx)

// Add an additional store to each node.
// Add two additional stores to each node.
n1s2, n1cleanup2 := testutils.TempDir(t)
defer n1cleanup2()
n2s2, n2cleanup2 := testutils.TempDir(t)
defer n2cleanup2()
n3s2, n3cleanup2 := testutils.TempDir(t)
defer n3cleanup2()

n1s3, n1cleanup3 := testutils.TempDir(t)
defer n1cleanup3()
n2s3, n2cleanup3 := testutils.TempDir(t)
defer n2cleanup3()
n3s3, n3cleanup3 := testutils.TempDir(t)
defer n3cleanup3()

tcArgs = base.TestClusterArgs{
// We need ParallelStart since this is an existing cluster. If
// we started sequentially, then the first node would hang forever
@@ -89,17 +99,17 @@
ServerArgsPerNode: map[int]base.TestServerArgs{
0: {
StoreSpecs: []base.StoreSpec{
{Path: n1s1}, {Path: n1s2},
{Path: n1s1}, {Path: n1s2}, {Path: n1s3},
},
},
1: {
StoreSpecs: []base.StoreSpec{
{Path: n2s1}, {Path: n2s2},
{Path: n2s1}, {Path: n2s2}, {Path: n2s3},
},
},
2: {
StoreSpecs: []base.StoreSpec{
{Path: n3s1}, {Path: n3s2},
{Path: n3s1}, {Path: n3s2}, {Path: n3s3},
},
},
},
@@ -115,23 +125,34 @@
require.Equal(t, clusterID, srv.ClusterID())
}

// Ensure all nodes have 2 stores available.
// Ensure all nodes have all stores available, and each store has a unique
// store ID.
testutils.SucceedsSoon(t, func() error {
var storeIDs []roachpb.StoreID
for _, server := range tc.Servers {
var storeCount = 0

err := server.GetStores().(*kvserver.Stores).VisitStores(
if err := server.GetStores().(*kvserver.Stores).VisitStores(
func(s *kvserver.Store) error {
storeCount++
storeIDs = append(storeIDs, s.StoreID())
return nil
},
)
if err != nil {
); err != nil {
return errors.Errorf("failed to visit all nodes, got %v", err)
}

if storeCount != 2 {
return errors.Errorf("expected two stores to be available on node %v, got %v stores instead", server.NodeID(), storeCount)
if storeCount != 3 {
return errors.Errorf("expected 3 stores to be available on n%s, got %d stores instead", server.NodeID(), storeCount)
}
}

sort.Slice(storeIDs, func(i, j int) bool {
return storeIDs[i] < storeIDs[j]
})
for i := range storeIDs {
expStoreID := roachpb.StoreID(i + 1)
if storeIDs[i] != expStoreID {
t.Fatalf("expected the %s store to have storeID s%s, found s%s", humanize.Ordinal(i+1), expStoreID, storeIDs[i])
}
}

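The assertion the test adds boils down to: on a healthy cluster the sorted
store IDs must be exactly 1..N, so any duplicate surfaces as a slot holding
a smaller ID than its 1-based index. A standalone sketch of that check, run
against IDs a failing run might observe (assumed values, including the
duplicate s5):

    package main

    import (
        "fmt"
        "sort"

        "github.com/dustin/go-humanize"
    )

    func main() {
        // Store IDs from a hypothetical failing run (note the duplicate s5).
        storeIDs := []int{1, 2, 3, 4, 5, 5, 6, 6, 7}
        sort.Ints(storeIDs)
        for i, id := range storeIDs {
            if exp := i + 1; id != exp {
                fmt.Printf("expected the %s store to have storeID s%d, found s%d\n",
                    humanize.Ordinal(exp), exp, id)
                return
            }
        }
        fmt.Println("store IDs are unique and dense")
    }
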
44 changes: 38 additions & 6 deletions pkg/server/node.go
@@ -423,11 +423,7 @@ func (n *Node) start(
if err := s.Start(ctx, n.stopper); err != nil {
return errors.Errorf("failed to start store: %s", err)
}
capacity, err := s.Capacity(ctx, false /* useCached */)
if err != nil {
return errors.Errorf("could not query store capacity: %s", err)
}
log.Infof(ctx, "initialized store %s: %+v", s, capacity)
log.Infof(ctx, "initialized store s%s", s.StoreID())

n.addStore(ctx, s)
}
@@ -595,6 +591,11 @@ func (n *Node) bootstrapStores(
// each and invoking storage.Bootstrap() to persist it and the cluster
// version and to create stores. The -1 comes from the fact that our
// first store ID has already been pre-allocated for us.
//
// TODO(irfansharif): Is this sound? If we're restarting an already
// bootstrapped node (but now with additional stores), we haven't
// pre-allocated a store ID for any of the additional stores. Also see
// TODO below, the usage of firstStoreID is a bit confused.
storeIDAlloc := int64(len(emptyEngines)) - 1
if firstStoreID == 0 {
// We lied, we don't have a firstStoreID; we'll need to allocate for
@@ -606,11 +607,41 @@ }
}
startID, err := allocateStoreIDs(ctx, n.Descriptor.NodeID, storeIDAlloc, n.storeCfg.DB)
if firstStoreID == 0 {
// TODO(irfansharif): Our usage of firstStoreID is pretty confused,
// but it just so happens to "work". firstStoreID, as threaded in
// from the caller, is non-zero in two cases:
// - When we're starting up a node that was just bootstrapped
// (firstStoreID == 1).
// - When we're joining an existing cluster (firstStoreID provided
// to us via the join RPC).
//
// When we're restarting an already bootstrapped node, firstStoreID
// is zero. If we happen to be restarting with additional stores,
// we're starting the ID counter at what KV just told us about,
// which is probably what we want.
//
// Generally we're trying to do a few things, and our code structure
// doesn't make that obvious:
// - We're bootstrapping our first store after being handed a store
// ID via the join RPC
// - We're not bootstrapping our first store here when we're the
// node being bootstrapped; that already happens in
// `bootstrapCluster`.
// - We're bootstrapping stores other than our first, after joining
// an existing cluster via the join RPC.
// - We're bootstrapping additional stores after having our server
// bootstrapped by the operator for the very first time.
// - We're bootstrapping additional stores after being restarted
// with new stores (we were previously an already bootstrapped
// node).
firstStoreID = startID
}
if err != nil {
return errors.Errorf("error allocating store ids: %s", err)
}

log.Infof(ctx, "xxx: store-id-alloc=%d, first-store-id=%d, start-id=%d", storeIDAlloc, firstStoreID, startID)

sIdent := roachpb.StoreIdent{
ClusterID: n.clusterID.Get(),
NodeID: n.Descriptor.NodeID,
@@ -625,8 +656,9 @@
if err := s.Start(ctx, stopper); err != nil {
return err
}
log.Infof(ctx, "initialized store s%s", s.StoreID())

n.addStore(ctx, s)
log.Infof(ctx, "bootstrapped store %s", s)
// Done regularly in Node.startGossiping, but this cuts down the time
// until this store is used for range allocations.
if err := s.GossipStore(ctx, false /* useCached */); err != nil {
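The TODOs above flag a second hazard with firstStoreID: assigning IDs
contiguously upward from a pre-allocated first ID is only sound if the
block reserved later happens to land immediately after it. A hypothetical
sketch (the real code threads these values differently) of interleaved
reservations breaking that assumption:

    package main

    import "fmt"

    // counter again stands in for the KV-backed store ID generator.
    type counter struct{ last int }

    func (c *counter) reserve(n int) int {
        c.last += n
        return c.last - n + 1
    }

    func main() {
        c := &counter{}

        // Two nodes join; each is handed a pre-allocated first store ID
        // via the join RPC before either bootstraps its remaining stores.
        aFirst := c.reserve(1) // node A: s1
        bFirst := c.reserve(1) // node B: s2

        // Node A bootstraps two more stores: it reserves a fresh block
        // (s3-s4) but assigns contiguously from aFirst, handing out s2
        // even though s2 already belongs to node B.
        _ = c.reserve(2)
        fmt.Println("A assigns:", aFirst, aFirst+1, aFirst+2) // 1 2 3
        fmt.Println("B was assigned:", bFirst)                // 2: collision
    }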
