Skip to content

Commit

Permalink
server, statusccl: de-flake span stats fan-out tests
Browse files Browse the repository at this point in the history
This commit fixes flaky behavior while running
`TestSpanStatsFanOut` and `TestTenantSpanStats` under stress.

Both tests have been updated to ensure the following behavior:
- The tests make sure range splits occur before proceeding.
- The tests will retry their assertions to give the new key-value pairs
time to replicate.

Additionally, `NewTestTenantHelper` was updated to accept a parameter
for the number of nodes in the test host cluster. `TestTenantSpanStats`
now uses a 3-node cluster to test a real fan-out.

Resolves cockroachdb#99770
Resolves cockroachdb#99559
Epic:none
Release note: None
  • Loading branch information
zachlite committed Jul 10, 2023
1 parent e2af0b8 commit 68753c7
Show file tree
Hide file tree
Showing 5 changed files with 120 additions and 93 deletions.
7 changes: 6 additions & 1 deletion pkg/ccl/serverccl/adminccl/tenant_admin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,12 @@ func TestTenantAdminAPI(t *testing.T) {
StoreDisableCoalesceAdjacent: true,
}

testHelper := serverccl.NewTestTenantHelper(t, 3 /* tenantClusterSize */, knobs)
testHelper := serverccl.NewTestTenantHelper(
t,
3, /* tenantClusterSize */
1, /* numNodes */
knobs,
)
defer testHelper.Cleanup(ctx, t)

t.Run("tenant_jobs", func(t *testing.T) {
Expand Down
7 changes: 6 additions & 1 deletion pkg/ccl/serverccl/api_v2_tenant_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,12 @@ func TestExecSQL(t *testing.T) {

ctx := context.Background()

testHelper := NewTestTenantHelper(t, 3 /* tenantClusterSize */, base.TestingKnobs{})
testHelper := NewTestTenantHelper(
t,
3, /* tenantClusterSize */
1, /* numNodes */
base.TestingKnobs{},
)
defer testHelper.Cleanup(ctx, t)

tenantCluster := testHelper.TestCluster()
Expand Down
106 changes: 60 additions & 46 deletions pkg/ccl/serverccl/statusccl/tenant_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,12 @@ func TestTenantStatusAPI(t *testing.T) {
StoreDisableCoalesceAdjacent: true,
}

testHelper := serverccl.NewTestTenantHelper(t, 3 /* tenantClusterSize */, knobs)
testHelper := serverccl.NewTestTenantHelper(
t,
3, /* tenantClusterSize */
1, /* numNodes */
knobs,
)
defer testHelper.Cleanup(ctx, t)

// Speed up propagation of tenant capability changes.
Expand Down Expand Up @@ -133,18 +138,26 @@ func TestTenantStatusAPI(t *testing.T) {
testTenantHotRanges(ctx, t, testHelper)
})

t.Run("tenant_span_stats", func(t *testing.T) {
skip.UnderStressWithIssue(t, 99559)
skip.UnderDeadlockWithIssue(t, 99770)
testTenantSpanStats(ctx, t, testHelper)
})

t.Run("tenant_nodes_capability", func(t *testing.T) {
testTenantNodesCapability(ctx, t, testHelper)
})
}

func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.TenantTestHelper) {
func TestTenantSpanStats(t *testing.T) {
defer leaktest.AfterTest(t)()
s := log.ScopeWithoutShowLogs(t)
defer s.Close(t)
defer s.SetupSingleFileLogging()()
ctx := context.Background()
const numNodes = 3
helper := serverccl.NewTestTenantHelper(
t,
3, /* tenantClusterSize */
numNodes,
base.TestingKnobs{},
)
defer helper.Cleanup(ctx, t)

tenantA := helper.TestCluster().Tenant(0)
tenantB := helper.ControlCluster().Tenant(0)

Expand Down Expand Up @@ -212,18 +225,22 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
return bytes.Join(keys, nil)
}

controlStats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan},
})
require.NoError(t, err)

// Create a new range in this tenant.
_, _, err = helper.HostCluster().Server(0).SplitRange(makeKey(tPrefix, roachpb.Key("c")))
newRangeKey := makeKey(tPrefix, roachpb.Key("c"))
_, newDesc, err := helper.HostCluster().Server(0).SplitRange(newRangeKey)
require.NoError(t, err)

// Wait for the split to finish and propagate.
// Wait until the range split occurs.
testutils.SucceedsSoon(t, func() error {
desc, err := helper.HostCluster().LookupRange(newRangeKey)
require.NoError(t, err)
if !desc.StartKey.Equal(newDesc.StartKey) {
return errors.New("range has not split")
}
return nil
})

// Wait for the new range to replicate.
err = helper.HostCluster().WaitForFullReplication()
require.NoError(t, err)

Expand All @@ -237,19 +254,6 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
}
}

stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan},
})

require.NoError(t, err)

controlSpanStats := controlStats.SpanToStats[aSpan.String()]
testSpanStats := stats.SpanToStats[aSpan.String()]
require.Equal(t, controlSpanStats.RangeCount+1, testSpanStats.RangeCount)
require.Equal(t, controlSpanStats.TotalStats.LiveCount+int64(len(incKeys)), testSpanStats.TotalStats.LiveCount)

// Make a multi-span call
type spanCase struct {
span roachpb.Span
Expand All @@ -264,7 +268,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[1])),
},
expectedRangeCount: 1,
expectedLiveCount: 1,
expectedLiveCount: 1 * numNodes,
},
{
// "d", "f" - single range, multiple keys
Expand All @@ -273,7 +277,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[5])),
},
expectedRangeCount: 1,
expectedLiveCount: 2,
expectedLiveCount: 2 * numNodes,
},
{
// "bb", "e" - multiple ranges, multiple keys
Expand All @@ -282,7 +286,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[4])),
},
expectedRangeCount: 2,
expectedLiveCount: 2,
expectedLiveCount: 2 * numNodes,
},

{
Expand All @@ -292,7 +296,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[3])),
},
expectedRangeCount: 2,
expectedLiveCount: 3,
expectedLiveCount: 3 * numNodes,
},
}

Expand All @@ -301,19 +305,29 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
spans = append(spans, sc.span)
}

stats, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: spans,
})
testutils.SucceedsSoon(t, func() error {
stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: spans,
})

require.NoError(t, err)

for _, sc := range spanCases {
spanStats := stats.SpanToStats[sc.span.String()]
if sc.expectedRangeCount != spanStats.RangeCount {
return errors.Newf("mismatch on expected range count for span case with span %v", sc.span.String())
}

if sc.expectedLiveCount != spanStats.TotalStats.LiveCount {
return errors.Newf("mismatch on expected live count for span case with span %v", sc.span.String())
}
}

return nil
})

require.NoError(t, err)
// Check each span has their expected values.
for _, sc := range spanCases {
spanStats := stats.SpanToStats[sc.span.String()]
require.Equal(t, spanStats.RangeCount, sc.expectedRangeCount, fmt.Sprintf("mismatch on expected range count for span case with span %v", sc.span.String()))
require.Equal(t, spanStats.TotalStats.LiveCount, sc.expectedLiveCount, fmt.Sprintf("mismatch on expected live count for span case with span %v", sc.span.String()))
}
})

}
Expand Down
12 changes: 8 additions & 4 deletions pkg/ccl/serverccl/tenant_test_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ var _ TenantTestHelper = &tenantTestHelper{}

// NewTestTenantHelper constructs a TenantTestHelper instance.
func NewTestTenantHelper(
t *testing.T, tenantClusterSize int, knobs base.TestingKnobs,
t *testing.T, tenantClusterSize int, numNodes int, knobs base.TestingKnobs,
) TenantTestHelper {
t.Helper()

Expand All @@ -151,9 +151,13 @@ func NewTestTenantHelper(
params.Knobs = knobs
// We're running tenant tests, no need for a default tenant.
params.DefaultTestTenant = base.TestTenantDisabled
testCluster := serverutils.StartNewTestCluster(t, 1 /* numNodes */, base.TestClusterArgs{
ServerArgs: params,
})
testCluster := serverutils.StartNewTestCluster(
t,
numNodes,
base.TestClusterArgs{
ServerArgs: params,
},
)
server := testCluster.Server(0)

return &tenantTestHelper{
Expand Down
81 changes: 40 additions & 41 deletions pkg/server/span_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,11 @@ func TestSpanStatsMetaScan(t *testing.T) {
}
}

func TestLocalSpanStats(t *testing.T) {
func TestSpanStatsFanOut(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
skip.UnderStress(t)
ctx := context.Background()
numNodes := 3
const numNodes = 3
tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{})
defer tc.Stopper().Stop(ctx)

Expand Down Expand Up @@ -149,47 +148,47 @@ func TestLocalSpanStats(t *testing.T) {
{spans[4], 2, int64(numNodes * 3)},
{spans[5], 1, int64(numNodes * 2)},
}
// Multi-span request
multiResult, err := s.StatusServer().(serverpb.StatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0",
Spans: spans,
},
)
require.NoError(t, err)

// Verify stats across different spans.
for _, tcase := range testCases {
rSpan, err := keys.SpanAddr(tcase.span)
require.NoError(t, err)
testutils.SucceedsSoon(t, func() error {

// Assert expected values from multi-span request
spanStats := multiResult.SpanToStats[tcase.span.String()]
require.Equal(
t,
tcase.expectedRanges,
spanStats.RangeCount,
fmt.Sprintf(
"Multi-span: expected %d ranges in span [%s - %s], found %d",
tcase.expectedRanges,
rSpan.Key.String(),
rSpan.EndKey.String(),
spanStats.RangeCount,
),
)
require.Equal(
t,
tcase.expectedKeys,
spanStats.TotalStats.LiveCount,
fmt.Sprintf(
"Multi-span: expected %d keys in span [%s - %s], found %d",
tcase.expectedKeys,
rSpan.Key.String(),
rSpan.EndKey.String(),
spanStats.TotalStats.LiveCount,
),
// Multi-span request
multiResult, err := s.StatusServer().(serverpb.StatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0",
Spans: spans,
},
)
}
require.NoError(t, err)

// Verify stats across different spans.
for _, tcase := range testCases {
rSpan, err := keys.SpanAddr(tcase.span)
require.NoError(t, err)

// Assert expected values from multi-span request
spanStats := multiResult.SpanToStats[tcase.span.String()]
if tcase.expectedRanges != spanStats.RangeCount {
return errors.Newf("Multi-span: expected %d ranges in span [%s - %s], found %d",
tcase.expectedRanges,
rSpan.Key.String(),
rSpan.EndKey.String(),
spanStats.RangeCount,
)
}
if tcase.expectedKeys != spanStats.TotalStats.LiveCount {
return errors.Newf(
"Multi-span: expected %d keys in span [%s - %s], found %d",
tcase.expectedKeys,
rSpan.Key.String(),
rSpan.EndKey.String(),
spanStats.TotalStats.LiveCount,
)
}
}

return nil
})

}

// BenchmarkSpanStats measures the cost of collecting span statistics.
Expand Down

0 comments on commit 68753c7

Please sign in to comment.