[dnm,wip] server: reproduce double-allocation of store IDs #56271

Closed
1 change: 1 addition & 0 deletions pkg/server/BUILD.bazel

@@ -307,6 +307,7 @@ go_test(
         "//pkg/util/timeutil",
         "//pkg/util/uuid",
         "//vendor/github.com/cockroachdb/errors",
+        "//vendor/github.com/dustin/go-humanize",
         "//vendor/github.com/gogo/protobuf/jsonpb",
         "//vendor/github.com/gogo/protobuf/proto",
         "//vendor/github.com/grpc-ecosystem/grpc-gateway/runtime",
43 changes: 32 additions & 11 deletions pkg/server/multi_store_test.go

@@ -12,16 +12,19 @@ package server_test

 import (
     "context"
+    "sort"
     "testing"

     "github.com/cockroachdb/cockroach/pkg/base"
     "github.com/cockroachdb/cockroach/pkg/kv/kvserver"
+    "github.com/cockroachdb/cockroach/pkg/roachpb"
     "github.com/cockroachdb/cockroach/pkg/testutils"
     "github.com/cockroachdb/cockroach/pkg/testutils/skip"
     "github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
     "github.com/cockroachdb/cockroach/pkg/util/leaktest"
     "github.com/cockroachdb/cockroach/pkg/util/log"
     "github.com/cockroachdb/errors"
+    "github.com/dustin/go-humanize"
     "github.com/stretchr/testify/require"
 )

@@ -71,14 +74,21 @@ func TestAddNewStoresToExistingNodes(t *testing.T) {
     clusterID := tc.Server(0).ClusterID()
     tc.Stopper().Stop(ctx)

-    // Add an additional store to each node.
+    // Add two additional stores to each node.
     n1s2, n1cleanup2 := testutils.TempDir(t)
     defer n1cleanup2()
     n2s2, n2cleanup2 := testutils.TempDir(t)
     defer n2cleanup2()
     n3s2, n3cleanup2 := testutils.TempDir(t)
     defer n3cleanup2()

+    n1s3, n1cleanup3 := testutils.TempDir(t)
+    defer n1cleanup3()
+    n2s3, n2cleanup3 := testutils.TempDir(t)
+    defer n2cleanup3()
+    n3s3, n3cleanup3 := testutils.TempDir(t)
+    defer n3cleanup3()
+
     tcArgs = base.TestClusterArgs{
         // We need ParallelStart since this is an existing cluster. If
         // we started sequentially, then the first node would hang forever
@@ -89,17 +99,17 @@ func TestAddNewStoresToExistingNodes(t *testing.T) {
         ServerArgsPerNode: map[int]base.TestServerArgs{
             0: {
                 StoreSpecs: []base.StoreSpec{
-                    {Path: n1s1}, {Path: n1s2},
+                    {Path: n1s1}, {Path: n1s2}, {Path: n1s3},
                 },
             },
             1: {
                 StoreSpecs: []base.StoreSpec{
-                    {Path: n2s1}, {Path: n2s2},
+                    {Path: n2s1}, {Path: n2s2}, {Path: n2s3},
                 },
             },
             2: {
                 StoreSpecs: []base.StoreSpec{
-                    {Path: n3s1}, {Path: n3s2},
+                    {Path: n3s1}, {Path: n3s2}, {Path: n3s3},
                 },
             },
         },
@@ -115,23 +125,34 @@ func TestAddNewStoresToExistingNodes(t *testing.T) {
         require.Equal(t, clusterID, srv.ClusterID())
     }

-    // Ensure all nodes have 2 stores available.
+    // Ensure all nodes have all stores available, and each store has a unique
+    // store ID.
     testutils.SucceedsSoon(t, func() error {
+        var storeIDs []roachpb.StoreID
         for _, server := range tc.Servers {
             var storeCount = 0

-            err := server.GetStores().(*kvserver.Stores).VisitStores(
+            if err := server.GetStores().(*kvserver.Stores).VisitStores(
                 func(s *kvserver.Store) error {
                     storeCount++
+                    storeIDs = append(storeIDs, s.StoreID())
                     return nil
                 },
-            )
-            if err != nil {
+            ); err != nil {
                 return errors.Errorf("failed to visit all nodes, got %v", err)
             }

-            if storeCount != 2 {
-                return errors.Errorf("expected two stores to be available on node %v, got %v stores instead", server.NodeID(), storeCount)
+            if storeCount != 3 {
+                return errors.Errorf("expected 3 stores to be available on n%s, got %d stores instead", server.NodeID(), storeCount)
             }
         }

+        sort.Slice(storeIDs, func(i, j int) bool {
+            return storeIDs[i] < storeIDs[j]
+        })
+        for i := range storeIDs {
+            expStoreID := roachpb.StoreID(i + 1)
+            if storeIDs[i] != expStoreID {
+                t.Fatalf("expected the %s store to have storeID s%s, found s%s", humanize.Ordinal(i+1), expStoreID, storeIDs[i])
+            }
+        }

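For reference, the new assertion distills to the following standalone check (a minimal sketch: plain ints stand in for roachpb.StoreID, and checkStoreIDs is an illustrative helper, not part of the test). With nine stores across the three nodes, the sorted IDs must be exactly s1 through s9, so a double-allocated ID necessarily shows up as a duplicate plus a gap:

```go
package main

import (
	"fmt"
	"sort"
)

// checkStoreIDs verifies that the collected store IDs are exactly 1..N.
// A double-allocated ID produces both a duplicate and a gap, so comparing
// the sorted slice against 1..N catches it.
func checkStoreIDs(storeIDs []int) error {
	sort.Ints(storeIDs)
	for i, id := range storeIDs {
		if exp := i + 1; id != exp {
			return fmt.Errorf("expected store ID s%d in position %d, found s%d", exp, i, id)
		}
	}
	return nil
}

func main() {
	// s4 was handed out twice, so s5 is missing.
	fmt.Println(checkStoreIDs([]int{1, 2, 3, 4, 4, 6, 7, 8, 9}))
	// Output: expected store ID s5 in position 4, found s4
}
```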
44 changes: 38 additions & 6 deletions pkg/server/node.go

@@ -423,11 +423,7 @@ func (n *Node) start(
         if err := s.Start(ctx, n.stopper); err != nil {
             return errors.Errorf("failed to start store: %s", err)
         }
-        capacity, err := s.Capacity(ctx, false /* useCached */)
-        if err != nil {
-            return errors.Errorf("could not query store capacity: %s", err)
-        }
-        log.Infof(ctx, "initialized store %s: %+v", s, capacity)
+        log.Infof(ctx, "initialized store s%s", s.StoreID())

         n.addStore(ctx, s)
     }
@@ -595,6 +591,11 @@ func (n *Node) bootstrapStores(
     // each and invoking storage.Bootstrap() to persist it and the cluster
     // version and to create stores. The -1 comes from the fact that our
     // first store ID has already been pre-allocated for us.
+    //
+    // TODO(irfansharif): Is this sound? If we're restarting an already
+    // bootstrapped node (but now with additional stores), we haven't
+    // pre-allocated a store ID for any of the additional stores. Also see
+    // the TODO below; the usage of firstStoreID is a bit confused.
     storeIDAlloc := int64(len(emptyEngines)) - 1
     if firstStoreID == 0 {
         // We lied, we don't have a firstStoreID; we'll need to allocate for
@@ -606,11 +607,41 @@ func (n *Node) bootstrapStores(
     }
     startID, err := allocateStoreIDs(ctx, n.Descriptor.NodeID, storeIDAlloc, n.storeCfg.DB)
     if firstStoreID == 0 {
+        // TODO(irfansharif): Our usage of firstStoreID is pretty confused,
+        // but it just so happens to "work". firstStoreID, as threaded in
+        // from the caller, is non-zero in two cases:
+        // - When we're starting up a node that was just bootstrapped
+        //   (firstStoreID == 1).
+        // - When we're joining an existing cluster (firstStoreID provided
+        //   to us via the join RPC).
+        //
+        // When we're restarting an already bootstrapped node, firstStoreID
+        // is zero. If we happen to be restarting with additional stores,
+        // we're starting the ID counter at what KV just told us about,
+        // which is probably what we want.
+        //
+        // Generally we're trying to do a few things, and our code structure
+        // doesn't make that obvious:
+        // - We're bootstrapping our first store after being handed a store
+        //   ID via the join RPC.
+        // - We're not bootstrapping our first store here when we're the
+        //   node being bootstrapped; that already happens in
+        //   `bootstrapCluster`.
+        // - We're bootstrapping stores other than our first, after joining
+        //   an existing cluster via the join RPC.
+        // - We're bootstrapping additional stores after having our server
+        //   bootstrapped by the operator for the very first time.
+        // - We're bootstrapping additional stores after being restarted
+        //   with new stores (we were previously an already bootstrapped
+        //   node).
         firstStoreID = startID
     }
     if err != nil {
         return errors.Errorf("error allocating store ids: %s", err)
     }
+
+    log.Infof(ctx, "xxx: store-id-alloc=%d, first-store-id=%d, start-id=%d", storeIDAlloc, firstStoreID, startID)
+
     sIdent := roachpb.StoreIdent{
         ClusterID: n.clusterID.Get(),
         NodeID:    n.Descriptor.NodeID,
@@ -625,8 +656,9 @@ func (n *Node) bootstrapStores(
         if err := s.Start(ctx, stopper); err != nil {
             return err
         }
+        log.Infof(ctx, "initialized store s%s", s.StoreID())
+
         n.addStore(ctx, s)
-        log.Infof(ctx, "bootstrapped store %s", s)
         // Done regularly in Node.startGossiping, but this cuts down the time
         // until this store is used for range allocations.
         if err := s.GossipStore(ctx, false /* useCached */); err != nil {
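To make the counting convention (and the worry in the TODOs) concrete, here is a toy model of the arithmetic. This is an assumption-heavy sketch, not the real implementation: allocateStoreIDs is modeled as a plain in-memory counter rather than the KV increment the production code performs, and `allocate` is a made-up helper. If, as the sIdent loop suggests, empty engines are then stamped with consecutive IDs starting at firstStoreID, the scheme only lines up with the freshly reserved range when the two allocations happen to be adjacent — the "just so happens to work" the TODO calls out:

```go
package main

import "fmt"

// counter stands in for the cluster-wide store ID generator that the real
// allocateStoreIDs bumps through KV.
var counter int

// allocate reserves n consecutive store IDs and returns the first one.
func allocate(n int) int {
	counter += n
	return counter - n + 1
}

func main() {
	// A node's first store ID is pre-allocated elsewhere (bootstrap or the
	// join RPC) -- hence the -1 in bootstrapStores.
	firstStoreID := allocate(1) // 1

	// The same node starts with three empty engines: it already holds one
	// ID, so it reserves IDs only for the remaining two.
	startID := allocate(3 - 1) // 2
	fmt.Println(firstStoreID, startID)

	// A restarted, already-bootstrapped node with two brand-new engines has
	// firstStoreID == 0: it reserves IDs for all of them and adopts startID
	// as its first new store's ID.
	startID = allocate(2) // 4
	fmt.Println(startID)
}
```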