From e2d5ed50403434c1865a0543deda18be64f3f4e8 Mon Sep 17 00:00:00 2001
From: Alex Sarkesian
Date: Thu, 5 Jan 2023 22:10:34 -0500
Subject: [PATCH] kvserver: test full validation of constraints on replica
 replacement

This change adds tests using `store.AllocatorCheckRange(..)` to
investigate the behavior of attempting to decommission a node when
doing so would cause constraints to be broken. These tests are needed
because it was recently uncovered that a partially constrained range
(i.e. a range configured with `num_replicas = 3, constraints = '{: 1}'`)
may not have the constraint fully respected if the only node that
satisfies said constraint is being decommissioned.

Part of #94809
Depends on #94024.

Release note: None
---
 pkg/kv/kvserver/allocator_impl_test.go | 127 ++++++++++++++++++++++
 pkg/kv/kvserver/store.go               |   3 +
 pkg/kv/kvserver/store_test.go          | 144 ++++++++++++++++++++-----
 pkg/kv/kvserver/testing_knobs.go       |   2 +
 4 files changed, 252 insertions(+), 24 deletions(-)

diff --git a/pkg/kv/kvserver/allocator_impl_test.go b/pkg/kv/kvserver/allocator_impl_test.go
index 5490ed3d5c2f..00dd644058a6 100644
--- a/pkg/kv/kvserver/allocator_impl_test.go
+++ b/pkg/kv/kvserver/allocator_impl_test.go
@@ -102,6 +102,133 @@ var threeStores = []*roachpb.StoreDescriptor{
 	},
 }
 
+var fourSingleStoreRacks = []*roachpb.StoreDescriptor{
+	{
+		StoreID: 1,
+		Node: roachpb.NodeDescriptor{
+			NodeID: 1,
+			Locality: roachpb.Locality{
+				Tiers: []roachpb.Tier{
+					{
+						Key:   "cloud",
+						Value: "local",
+					},
+					{
+						Key:   "region",
+						Value: "local",
+					},
+					{
+						Key:   "zone",
+						Value: "local",
+					},
+					{
+						Key:   "rack",
+						Value: "1",
+					},
+				},
+			},
+		},
+		Capacity: roachpb.StoreCapacity{
+			Capacity:     200,
+			Available:    100,
+			LogicalBytes: 100,
+		},
+	},
+	{
+		StoreID: 2,
+		Node: roachpb.NodeDescriptor{
+			NodeID: 2,
+			Locality: roachpb.Locality{
+				Tiers: []roachpb.Tier{
+					{
+						Key:   "cloud",
+						Value: "local",
+					},
+					{
+						Key:   "region",
+						Value: "local",
+					},
+					{
+						Key:   "zone",
+						Value: "local",
+					},
+					{
+						Key:   "rack",
+						Value: "2",
+					},
+				},
+			},
+		},
+		Capacity: roachpb.StoreCapacity{
+			Capacity:     200,
+			Available:    100,
+			LogicalBytes: 100,
+		},
+	},
+	{
+		StoreID: 3,
+		Node: roachpb.NodeDescriptor{
+			NodeID: 3,
+			Locality: roachpb.Locality{
+				Tiers: []roachpb.Tier{
+					{
+						Key:   "cloud",
+						Value: "local",
+					},
+					{
+						Key:   "region",
+						Value: "local",
+					},
+					{
+						Key:   "zone",
+						Value: "local",
+					},
+					{
+						Key:   "rack",
+						Value: "3",
+					},
+				},
+			},
+		},
+		Capacity: roachpb.StoreCapacity{
+			Capacity:     200,
+			Available:    100,
+			LogicalBytes: 100,
+		},
+	},
+	{
+		StoreID: 4,
+		Node: roachpb.NodeDescriptor{
+			NodeID: 4,
+			Locality: roachpb.Locality{
+				Tiers: []roachpb.Tier{
+					{
+						Key:   "cloud",
+						Value: "local",
+					},
+					{
+						Key:   "region",
+						Value: "local",
+					},
+					{
+						Key:   "zone",
+						Value: "local",
+					},
+					{
+						Key:   "rack",
+						Value: "4",
+					},
+				},
+			},
+		},
+		Capacity: roachpb.StoreCapacity{
+			Capacity:     200,
+			Available:    100,
+			LogicalBytes: 100,
+		},
+	},
+}
+
 // TestAllocatorRebalanceTarget could help us to verify whether we'll rebalance
 // to a target that we'll immediately remove.
 func TestAllocatorRebalanceTarget(t *testing.T) {
diff --git a/pkg/kv/kvserver/store.go b/pkg/kv/kvserver/store.go
index 72d8b09ebf6b..2a062bd64437 100644
--- a/pkg/kv/kvserver/store.go
+++ b/pkg/kv/kvserver/store.go
@@ -2015,6 +2015,9 @@ func (s *Store) GetConfReader(ctx context.Context) (spanconfig.StoreReader, erro
 	if s.cfg.TestingKnobs.MakeSystemConfigSpanUnavailableToQueues {
 		return nil, errSysCfgUnavailable
 	}
+	if s.cfg.TestingKnobs.ConfReaderInterceptor != nil {
+		return s.cfg.TestingKnobs.ConfReaderInterceptor(), nil
+	}
 
 	if s.cfg.SpanConfigsDisabled ||
 		!spanconfigstore.EnabledSetting.Get(&s.ClusterSettings().SV) ||
diff --git a/pkg/kv/kvserver/store_test.go b/pkg/kv/kvserver/store_test.go
index e85e6f805a85..bb24f1056d24 100644
--- a/pkg/kv/kvserver/store_test.go
+++ b/pkg/kv/kvserver/store_test.go
@@ -49,6 +49,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/rpc"
 	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
 	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
+	"github.com/cockroachdb/cockroach/pkg/spanconfig"
 	"github.com/cockroachdb/cockroach/pkg/sql/catalog/bootstrap"
 	"github.com/cockroachdb/cockroach/pkg/storage"
 	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
@@ -3400,6 +3401,28 @@ func TestSnapshotRateLimit(t *testing.T) {
 	}
 }
 
+type mockSpanConfigReader struct {
+	real      spanconfig.StoreReader
+	overrides map[string]roachpb.SpanConfig
+}
+
+func (m *mockSpanConfigReader) NeedsSplit(ctx context.Context, start, end roachpb.RKey) bool {
+	panic("unimplemented")
+}
+
+func (m *mockSpanConfigReader) ComputeSplitKey(ctx context.Context, start, end roachpb.RKey) roachpb.RKey {
+	panic("unimplemented")
+}
+
+func (m *mockSpanConfigReader) GetSpanConfigForKey(ctx context.Context, key roachpb.RKey) (roachpb.SpanConfig, error) {
+	if conf, ok := m.overrides[string(key)]; ok {
+		return conf, nil
+	}
+	return m.real.GetSpanConfigForKey(ctx, key)
+}
+
+var _ spanconfig.StoreReader = &mockSpanConfigReader{}
+
 // TestAllocatorCheckRangeUnconfigured tests evaluating the allocation decisions
 // for a range with a single replica using the default system configuration and
 // no other available allocation targets.
@@ -3456,9 +3479,8 @@ func TestAllocatorCheckRange(t *testing.T) {
 		name              string
 		stores            []*roachpb.StoreDescriptor
 		existingReplicas  []roachpb.ReplicaDescriptor
-		zoneConfig        *zonepb.ZoneConfig
+		spanConfig        *roachpb.SpanConfig
 		livenessOverrides map[roachpb.NodeID]livenesspb.NodeLivenessStatus
-		baselineExpNoop   bool
 		expectedAction    allocatorimpl.AllocatorAction
 		expectValidTarget bool
 		expectedLogMessage string
@@ -3549,11 +3571,12 @@ func TestAllocatorCheckRange(t *testing.T) {
 				{NodeID: 4, StoreID: 4, ReplicaID: 4},
 				{NodeID: 5, StoreID: 5, ReplicaID: 5},
 			},
-			zoneConfig: zonepb.DefaultSystemZoneConfigRef(),
+			spanConfig: &roachpb.SpanConfig{
+				NumReplicas: 5,
+			},
 			livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{
 				3: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
 			},
-			baselineExpNoop: true,
 			expectedAction:    allocatorimpl.AllocatorRemoveDecommissioningVoter,
 			expectErr:         false,
 			expectValidTarget: false,
@@ -3574,7 +3597,6 @@ func TestAllocatorCheckRange(t *testing.T) {
 				4: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
 				5: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
 			},
-			baselineExpNoop: true,
 			expectedAction:     allocatorimpl.AllocatorReplaceDecommissioningVoter,
 			expectValidTarget:  false,
 			expectAllocatorErr: true,
@@ -3635,6 +3657,88 @@ func TestAllocatorCheckRange(t *testing.T) {
 			expectErr:         false,
 			expectValidTarget: true,
 		},
+		{
+			name:   "decommissioning without satisfying partially constrained locality",
+			stores: fourSingleStoreRacks,
+			existingReplicas: []roachpb.ReplicaDescriptor{
+				{NodeID: 1, StoreID: 1, ReplicaID: 1},
+				{NodeID: 2, StoreID: 2, ReplicaID: 2},
+				{NodeID: 4, StoreID: 4, ReplicaID: 3},
+			},
+			livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{
+				4: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
+			},
+			spanConfig: &roachpb.SpanConfig{
+				NumReplicas: 3,
+				Constraints: []roachpb.ConstraintsConjunction{
+					{
+						NumReplicas: 1,
+						Constraints: []roachpb.Constraint{
+							{
+								Type:  roachpb.Constraint_REQUIRED,
+								Key:   "rack",
+								Value: "4",
+							},
+						},
+					},
+				},
+			},
+			expectedAction: allocatorimpl.AllocatorReplaceDecommissioningVoter,
+			// We should get an error attempting to break constraints, but this is
+			// currently a bug.
+			// TODO(sarkesian): Change below to true once #94809 is fixed.
+			expectErr: false,
+		},
+		{
+			name:   "decommissioning without satisfying fully constrained locality",
+			stores: fourSingleStoreRacks,
+			existingReplicas: []roachpb.ReplicaDescriptor{
+				{NodeID: 1, StoreID: 1, ReplicaID: 1},
+				{NodeID: 2, StoreID: 2, ReplicaID: 2},
+				{NodeID: 4, StoreID: 4, ReplicaID: 3},
+			},
+			livenessOverrides: map[roachpb.NodeID]livenesspb.NodeLivenessStatus{
+				4: livenesspb.NodeLivenessStatus_DECOMMISSIONING,
+			},
+			spanConfig: &roachpb.SpanConfig{
+				NumReplicas: 3,
+				Constraints: []roachpb.ConstraintsConjunction{
+					{
+						NumReplicas: 1,
+						Constraints: []roachpb.Constraint{
+							{
+								Type:  roachpb.Constraint_REQUIRED,
+								Key:   "rack",
+								Value: "1",
+							},
+						},
+					},
+					{
+						NumReplicas: 1,
+						Constraints: []roachpb.Constraint{
+							{
+								Type:  roachpb.Constraint_REQUIRED,
+								Key:   "rack",
+								Value: "2",
+							},
+						},
+					},
+					{
+						NumReplicas: 1,
+						Constraints: []roachpb.Constraint{
+							{
+								Type:  roachpb.Constraint_REQUIRED,
+								Key:   "rack",
+								Value: "4",
+							},
+						},
+					},
+				},
+			},
+			expectedAction:     allocatorimpl.AllocatorReplaceDecommissioningVoter,
+			expectAllocatorErr: true,
+			expectedErrStr:     "replicas must match constraints",
+		},
 	} {
 		t.Run(tc.name, func(t *testing.T) {
 			// Setup store pool based on store descriptors and configure test store.
@@ -3655,11 +3759,17 @@ func TestAllocatorCheckRange(t *testing.T) {
 			cfg := TestStoreConfig(clock)
 			cfg.Gossip = g
 			cfg.StorePool = sp
-			if tc.zoneConfig != nil {
-				// TODO(sarkesian): This override is not great. It would be much
-				// preferable to provide a SpanConfig if possible. See comment in
-				// createTestStoreWithoutStart.
-				cfg.SystemConfigProvider.GetSystemConfig().DefaultZoneConfig = tc.zoneConfig
+			if tc.spanConfig != nil {
+				mockSr := &mockSpanConfigReader{
+					real: cfg.SystemConfigProvider.GetSystemConfig(),
+					overrides: map[string]roachpb.SpanConfig{
+						"a": *tc.spanConfig,
+					},
+				}
+
+				cfg.TestingKnobs.ConfReaderInterceptor = func() spanconfig.StoreReader {
+					return mockSr
+				}
 			}
 			s := createTestStoreWithoutStart(ctx, t, stopper, testStoreOpts{createSystemRanges: true}, &cfg)
 
@@ -3687,20 +3797,6 @@ func TestAllocatorCheckRange(t *testing.T) {
 				storePoolOverride = storepool.NewOverrideStorePool(sp, livenessOverride, nodeCountOverride)
 			}
 
-			// Check if our baseline action without overrides is a noop; i.e., the
-			// range is fully replicated as configured and needs no actions.
-			if tc.baselineExpNoop {
-				action, _, _, err := s.AllocatorCheckRange(ctx, desc,
-					false /* collectTraces */, nil, /* overrideStorePool */
-				)
-				require.NoError(t, err, "expected baseline check without error")
-				require.Containsf(t, []allocatorimpl.AllocatorAction{
-					allocatorimpl.AllocatorNoop,
-					allocatorimpl.AllocatorConsiderRebalance,
-				}, action, "expected baseline noop, got %s", action)
-				//require.Equalf(t, allocatorimpl.AllocatorNoop, action, "expected baseline noop, got %s", action)
-			}
-
 			// Execute actual allocator range repair check.
 			action, target, recording, err := s.AllocatorCheckRange(ctx, desc,
 				true /* collectTraces */, storePoolOverride,
diff --git a/pkg/kv/kvserver/testing_knobs.go b/pkg/kv/kvserver/testing_knobs.go
index e5ebda32f678..fb033789b283 100644
--- a/pkg/kv/kvserver/testing_knobs.go
+++ b/pkg/kv/kvserver/testing_knobs.go
@@ -412,6 +412,8 @@ type StoreTestingKnobs struct {
 	// TODO(irfansharif): Get rid of this knob, maybe by first moving
 	// DisableSpanConfigs into a testing knob instead of a server arg.
 	UseSystemConfigSpanForQueues bool
+	// ConfReaderInterceptor intercepts calls to get a span config reader.
+	ConfReaderInterceptor func() spanconfig.StoreReader
 	// IgnoreStrictGCEnforcement is used by tests to opt out of strict GC
 	// enforcement.
 	IgnoreStrictGCEnforcement bool
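
Editor's note: for readers following the mechanics, below is a minimal sketch (not part of the patch) of how the new `ConfReaderInterceptor` knob and the `mockSpanConfigReader` from `store_test.go` compose. `GetConfReader` consults the interceptor before its usual span config sources, so a test can serve an overridden `SpanConfig` for a chosen key while delegating every other lookup to the real reader. The constraint values mirror the "rack 4" test case above; the surrounding helpers (`TestStoreConfig`, `clock`) are assumed from the existing test file.

	// Sketch only: a store built from this config sees the overridden
	// SpanConfig for key "a" and real configs for all other keys.
	cfg := TestStoreConfig(clock)
	mockSr := &mockSpanConfigReader{
		// Fall back to the system config for any key without an override.
		real: cfg.SystemConfigProvider.GetSystemConfig(),
		overrides: map[string]roachpb.SpanConfig{
			// Three replicas, one of which must live on a rack=4 store.
			"a": {
				NumReplicas: 3,
				Constraints: []roachpb.ConstraintsConjunction{{
					NumReplicas: 1,
					Constraints: []roachpb.Constraint{
						{Type: roachpb.Constraint_REQUIRED, Key: "rack", Value: "4"},
					},
				}},
			},
		},
	}
	// GetConfReader now returns the mock ahead of the span config store.
	cfg.TestingKnobs.ConfReaderInterceptor = func() spanconfig.StoreReader {
		return mockSr
	}

With this wiring, `store.AllocatorCheckRange(..)` evaluates allocator decisions against the injected config, which is what lets the new test cases exercise constraint conformance during decommission without standing up a full span config infrastructure.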