kvserver: test full validation of constraints on replica replacement
This change adds tests using `store.AllocatorCheckRange(..)` to
investigate the behavior of attempting to decommission a node when
doing so would cause constraints to be broken. This test is needed
because it was recently uncovered that a partially constrained range
(i.e. a range configured with `num_replicas = 3,
constraint = '{<some constraint>: 1}'`) may not have the constraint
fully respected if the only node that satisfies said constraint is
being decommissioned.

Part of cockroachdb#94809

Depends on cockroachdb#94024.

Release note: None
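
For reference, a minimal sketch of how such a partially constrained
configuration is expressed as a `roachpb.SpanConfig`, mirroring the test
cases added below (the `rack=4` key and value are illustrative):

	// A range with three replicas, exactly one of which must live on a
	// store whose locality includes rack=4; the other two replicas are
	// unconstrained. If the only node satisfying rack=4 is being
	// decommissioned, the allocator should not silently drop the
	// constraint when choosing a replacement.
	spanConfig := roachpb.SpanConfig{
		NumReplicas: 3,
		Constraints: []roachpb.ConstraintsConjunction{
			{
				NumReplicas: 1,
				Constraints: []roachpb.Constraint{
					{Type: roachpb.Constraint_REQUIRED, Key: "rack", Value: "4"},
				},
			},
		},
	}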
AlexTalks committed Feb 8, 2023
1 parent 87d1fed commit e2d5ed5
Showing 4 changed files with 252 additions and 24 deletions.
127 changes: 127 additions & 0 deletions pkg/kv/kvserver/allocator_impl_test.go
@@ -102,6 +102,133 @@ var threeStores = []*roachpb.StoreDescriptor{
},
}

var fourSingleStoreRacks = []*roachpb.StoreDescriptor{
{
StoreID: 1,
Node: roachpb.NodeDescriptor{
NodeID: 1,
Locality: roachpb.Locality{
Tiers: []roachpb.Tier{
{
Key: "cloud",
Value: "local",
},
{
Key: "region",
Value: "local",
},
{
Key: "zone",
Value: "local",
},
{
Key: "rack",
Value: "1",
},
},
},
},
Capacity: roachpb.StoreCapacity{
Capacity: 200,
Available: 100,
LogicalBytes: 100,
},
},
{
StoreID: 2,
Node: roachpb.NodeDescriptor{
NodeID: 2,
Locality: roachpb.Locality{
Tiers: []roachpb.Tier{
{
Key: "cloud",
Value: "local",
},
{
Key: "region",
Value: "local",
},
{
Key: "zone",
Value: "local",
},
{
Key: "rack",
Value: "2",
},
},
},
},
Capacity: roachpb.StoreCapacity{
Capacity: 200,
Available: 100,
LogicalBytes: 100,
},
},
{
StoreID: 3,
Node: roachpb.NodeDescriptor{
NodeID: 3,
Locality: roachpb.Locality{
Tiers: []roachpb.Tier{
{
Key: "cloud",
Value: "local",
},
{
Key: "region",
Value: "local",
},
{
Key: "zone",
Value: "local",
},
{
Key: "rack",
Value: "3",
},
},
},
},
Capacity: roachpb.StoreCapacity{
Capacity: 200,
Available: 100,
LogicalBytes: 100,
},
},
{
StoreID: 4,
Node: roachpb.NodeDescriptor{
NodeID: 4,
Locality: roachpb.Locality{
Tiers: []roachpb.Tier{
{
Key: "cloud",
Value: "local",
},
{
Key: "region",
Value: "local",
},
{
Key: "zone",
Value: "local",
},
{
Key: "rack",
Value: "4",
},
},
},
},
Capacity: roachpb.StoreCapacity{
Capacity: 200,
Available: 100,
LogicalBytes: 100,
},
},
}

// TestAllocatorRebalanceTarget verifies that we don't rebalance to a target
// that we would immediately remove.
func TestAllocatorRebalanceTarget(t *testing.T) {
3 changes: 3 additions & 0 deletions pkg/kv/kvserver/store.go
@@ -2015,6 +2015,9 @@ func (s *Store) GetConfReader(ctx context.Context) (spanconfig.StoreReader, erro
if s.cfg.TestingKnobs.MakeSystemConfigSpanUnavailableToQueues {
return nil, errSysCfgUnavailable
}
if s.cfg.TestingKnobs.ConfReaderInterceptor != nil {
return s.cfg.TestingKnobs.ConfReaderInterceptor(), nil
}

if s.cfg.SpanConfigsDisabled ||
!spanconfigstore.EnabledSetting.Get(&s.ClusterSettings().SV) ||
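
The new knob lets a test substitute an arbitrary span config reader for
the store's real one. A minimal sketch of wiring it up, as done in
store_test.go below (`clock` and `mockSr` stand in for the test's clock
and any `spanconfig.StoreReader` implementation):

	// Install a testing knob so GetConfReader returns the mock reader
	// instead of the store's real span config store.
	cfg := TestStoreConfig(clock)
	cfg.TestingKnobs.ConfReaderInterceptor = func() spanconfig.StoreReader {
		return mockSr
	}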
144 changes: 120 additions & 24 deletions pkg/kv/kvserver/store_test.go
@@ -49,6 +49,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/rpc"
"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/spanconfig"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/bootstrap"
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
@@ -3400,6 +3401,28 @@ func TestSnapshotRateLimit(t *testing.T) {
}
}

type mockSpanConfigReader struct {
real spanconfig.StoreReader
overrides map[string]roachpb.SpanConfig
}

func (m *mockSpanConfigReader) NeedsSplit(ctx context.Context, start, end roachpb.RKey) bool {
panic("unimplemented")
}

func (m *mockSpanConfigReader) ComputeSplitKey(ctx context.Context, start, end roachpb.RKey) roachpb.RKey {
panic("unimplemented")
}

func (m *mockSpanConfigReader) GetSpanConfigForKey(ctx context.Context, key roachpb.RKey) (roachpb.SpanConfig, error) {
if conf, ok := m.overrides[string(key)]; ok {
return conf, nil
}
// Fall back to the real span config reader for keys without an override.
return m.real.GetSpanConfigForKey(ctx, key)
}

var _ spanconfig.StoreReader = &mockSpanConfigReader{}
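
A quick sketch of how the mock resolves a lookup: keys present in
`overrides` are served from the map, and everything else falls through
to the wrapped reader (`realReader` here is any concrete
`spanconfig.StoreReader`; the "a" key matches the override installed in
the test below):

	// Override the config for key "a"; all other keys hit the real reader.
	mockSr := &mockSpanConfigReader{
		real:      realReader,
		overrides: map[string]roachpb.SpanConfig{"a": {NumReplicas: 5}},
	}
	conf, err := mockSr.GetSpanConfigForKey(ctx, roachpb.RKey("a"))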

// TestAllocatorCheckRangeUnconfigured tests evaluating the allocation decisions
// for a range with a single replica using the default system configuration and
// no other available allocation targets.
@@ -3456,9 +3479,8 @@ func TestAllocatorCheckRange(t *testing.T) {
name string
stores []*roachpb.StoreDescriptor
existingReplicas []roachpb.ReplicaDescriptor
zoneConfig *zonepb.ZoneConfig
spanConfig *roachpb.SpanConfig
livenessOverrides map[roachpb.NodeID]livenesspb.NodeLivenessStatus
baselineExpNoop bool
expectedAction allocatorimpl.AllocatorAction
expectValidTarget bool
expectedLogMessage string
@@ -3549,11 +3571,12 @@
{NodeID: 4, StoreID: 4, ReplicaID: 4},
{NodeID: 5, StoreID: 5, ReplicaID: 5},
},
zoneConfig: zonepb.DefaultSystemZoneConfigRef(),
spanConfig: &roachpb.SpanConfig{
NumReplicas: 5,
},
livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{
3: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
},
baselineExpNoop: true,
expectedAction: allocatorimpl.AllocatorRemoveDecommissioningVoter,
expectErr: false,
expectValidTarget: false,
@@ -3574,7 +3597,6 @@
4: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
5: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
},
baselineExpNoop: true,
expectedAction: allocatorimpl.AllocatorReplaceDecommissioningVoter,
expectValidTarget: false,
expectAllocatorErr: true,
@@ -3635,6 +3657,88 @@
expectErr: false,
expectValidTarget: true,
},
{
name: "decommissioning without satisfying partially constrained locality",
stores: fourSingleStoreRacks,
existingReplicas: []roachpb.ReplicaDescriptor{
{NodeID: 1, StoreID: 1, ReplicaID: 1},
{NodeID: 2, StoreID: 2, ReplicaID: 2},
{NodeID: 4, StoreID: 4, ReplicaID: 3},
},
livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{
4: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
},
spanConfig: &roachpb.SpanConfig{
NumReplicas: 3,
Constraints: []roachpb.ConstraintsConjunction{
{
NumReplicas: 1,
Constraints: []roachpb.Constraint{
{
Type: roachpb.Constraint_REQUIRED,
Key: "rack",
Value: "4",
},
},
},
},
},
expectedAction: allocatorimpl.AllocatorReplaceDecommissioningVoter,
// We should get an error because the replacement would break the
// constraint, but currently we don't; this is a bug.
// TODO(sarkesian): Change below to true once #94809 is fixed.
expectErr: false,
},
{
name: "decommissioning without satisfying fully constrained locality",
stores: fourSingleStoreRacks,
existingReplicas: []roachpb.ReplicaDescriptor{
{NodeID: 1, StoreID: 1, ReplicaID: 1},
{NodeID: 2, StoreID: 2, ReplicaID: 2},
{NodeID: 4, StoreID: 4, ReplicaID: 3},
},
livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{
4: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
},
spanConfig: &roachpb.SpanConfig{
NumReplicas: 3,
Constraints: []roachpb.ConstraintsConjunction{
{
NumReplicas: 1,
Constraints: []roachpb.Constraint{
{
Type: roachpb.Constraint_REQUIRED,
Key: "rack",
Value: "1",
},
},
},
{
NumReplicas: 1,
Constraints: []roachpb.Constraint{
{
Type: roachpb.Constraint_REQUIRED,
Key: "rack",
Value: "2",
},
},
},
{
NumReplicas: 1,
Constraints: []roachpb.Constraint{
{
Type: roachpb.Constraint_REQUIRED,
Key: "rack",
Value: "4",
},
},
},
},
},
expectedAction: allocatorimpl.AllocatorReplaceDecommissioningVoter,
expectAllocatorErr: true,
expectedErrStr: "replicas must match constraints",
},
} {
t.Run(tc.name, func(t *testing.T) {
// Setup store pool based on store descriptors and configure test store.
@@ -3655,11 +3759,17 @@
cfg := TestStoreConfig(clock)
cfg.Gossip = g
cfg.StorePool = sp
if tc.zoneConfig != nil {
// TODO(sarkesian): This override is not great. It would be much
// preferable to provide a SpanConfig if possible. See comment in
// createTestStoreWithoutStart.
cfg.SystemConfigProvider.GetSystemConfig().DefaultZoneConfig = tc.zoneConfig
if tc.spanConfig != nil {
mockSr := &mockSpanConfigReader{
real: cfg.SystemConfigProvider.GetSystemConfig(),
overrides: map[string]roachpb.SpanConfig{
"a": *tc.spanConfig,
},
}

cfg.TestingKnobs.ConfReaderInterceptor = func() spanconfig.StoreReader {
return mockSr
}
}

s := createTestStoreWithoutStart(ctx, t, stopper, testStoreOpts{createSystemRanges: true}, &cfg)
@@ -3687,20 +3797,6 @@
storePoolOverride = storepool.NewOverrideStorePool(sp, livenessOverride, nodeCountOverride)
}

// Check if our baseline action without overrides is a noop; i.e., the
// range is fully replicated as configured and needs no actions.
if tc.baselineExpNoop {
action, _, _, err := s.AllocatorCheckRange(ctx, desc,
false /* collectTraces */, nil, /* overrideStorePool */
)
require.NoError(t, err, "expected baseline check without error")
require.Containsf(t, []allocatorimpl.AllocatorAction{
allocatorimpl.AllocatorNoop,
allocatorimpl.AllocatorConsiderRebalance,
}, action, "expected baseline noop, got %s", action)
//require.Equalf(t, allocatorimpl.AllocatorNoop, action, "expected baseline noop, got %s", action)
}

// Execute actual allocator range repair check.
action, target, recording, err := s.AllocatorCheckRange(ctx, desc,
true /* collectTraces */, storePoolOverride,
2 changes: 2 additions & 0 deletions pkg/kv/kvserver/testing_knobs.go
@@ -412,6 +412,8 @@ type StoreTestingKnobs struct {
// TODO(irfansharif): Get rid of this knob, maybe by first moving
// DisableSpanConfigs into a testing knob instead of a server arg.
UseSystemConfigSpanForQueues bool
// ConfReaderInterceptor intercepts calls to get a span config reader.
ConfReaderInterceptor func() spanconfig.StoreReader
// IgnoreStrictGCEnforcement is used by tests to opt out of strict GC
// enforcement.
IgnoreStrictGCEnforcement bool
