From 68753c7ac3b0319611e2e1bf68d38e2a7b8b8b14 Mon Sep 17 00:00:00 2001 From: zachlite Date: Mon, 10 Jul 2023 11:58:22 -0400 Subject: [PATCH] server, statusccl: de-flake span stats fan-out tests This commit fixes flaky behavior while running `TestSpanStatsFanOut` and `TestTenantSpanStats` under stress. Both tests have been updated to ensure the following behavior: - The tests make sure range splits occur before proceeding. - The tests will retry their assertions to give the new key-value pairs time to replicate. Additionally, `NewTestTenantHelper` was updated to accept a parameter for the number of nodes in the test host cluster. `TestTenantSpanStats` now uses a 3-node cluster to test a real fan-out. Resolves #99770 Resolves #99559 Epic:none Release note: None --- .../serverccl/adminccl/tenant_admin_test.go | 7 +- pkg/ccl/serverccl/api_v2_tenant_test.go | 7 +- .../serverccl/statusccl/tenant_status_test.go | 106 ++++++++++-------- pkg/ccl/serverccl/tenant_test_utils.go | 12 +- pkg/server/span_stats_test.go | 81 +++++++------ 5 files changed, 120 insertions(+), 93 deletions(-) diff --git a/pkg/ccl/serverccl/adminccl/tenant_admin_test.go b/pkg/ccl/serverccl/adminccl/tenant_admin_test.go index 32c419c09242..1c71274d1861 100644 --- a/pkg/ccl/serverccl/adminccl/tenant_admin_test.go +++ b/pkg/ccl/serverccl/adminccl/tenant_admin_test.go @@ -42,7 +42,12 @@ func TestTenantAdminAPI(t *testing.T) { StoreDisableCoalesceAdjacent: true, } - testHelper := serverccl.NewTestTenantHelper(t, 3 /* tenantClusterSize */, knobs) + testHelper := serverccl.NewTestTenantHelper( + t, + 3, /* tenantClusterSize */ + 1, /* numNodes */ + knobs, + ) defer testHelper.Cleanup(ctx, t) t.Run("tenant_jobs", func(t *testing.T) { diff --git a/pkg/ccl/serverccl/api_v2_tenant_test.go b/pkg/ccl/serverccl/api_v2_tenant_test.go index 4b83dd45fd34..76157006877b 100644 --- a/pkg/ccl/serverccl/api_v2_tenant_test.go +++ b/pkg/ccl/serverccl/api_v2_tenant_test.go @@ -32,7 +32,12 @@ func TestExecSQL(t *testing.T) { ctx := context.Background() - testHelper := NewTestTenantHelper(t, 3 /* tenantClusterSize */, base.TestingKnobs{}) + testHelper := NewTestTenantHelper( + t, + 3, /* tenantClusterSize */ + 1, /* numNodes */ + base.TestingKnobs{}, + ) defer testHelper.Cleanup(ctx, t) tenantCluster := testHelper.TestCluster() diff --git a/pkg/ccl/serverccl/statusccl/tenant_status_test.go b/pkg/ccl/serverccl/statusccl/tenant_status_test.go index 19154fe8c1dd..c4e7128250e8 100644 --- a/pkg/ccl/serverccl/statusccl/tenant_status_test.go +++ b/pkg/ccl/serverccl/statusccl/tenant_status_test.go @@ -66,7 +66,12 @@ func TestTenantStatusAPI(t *testing.T) { StoreDisableCoalesceAdjacent: true, } - testHelper := serverccl.NewTestTenantHelper(t, 3 /* tenantClusterSize */, knobs) + testHelper := serverccl.NewTestTenantHelper( + t, + 3, /* tenantClusterSize */ + 1, /* numNodes */ + knobs, + ) defer testHelper.Cleanup(ctx, t) // Speed up propagation of tenant capability changes. @@ -133,18 +138,26 @@ func TestTenantStatusAPI(t *testing.T) { testTenantHotRanges(ctx, t, testHelper) }) - t.Run("tenant_span_stats", func(t *testing.T) { - skip.UnderStressWithIssue(t, 99559) - skip.UnderDeadlockWithIssue(t, 99770) - testTenantSpanStats(ctx, t, testHelper) - }) - t.Run("tenant_nodes_capability", func(t *testing.T) { testTenantNodesCapability(ctx, t, testHelper) }) } -func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.TenantTestHelper) { +func TestTenantSpanStats(t *testing.T) { + defer leaktest.AfterTest(t)() + s := log.ScopeWithoutShowLogs(t) + defer s.Close(t) + defer s.SetupSingleFileLogging()() + ctx := context.Background() + const numNodes = 3 + helper := serverccl.NewTestTenantHelper( + t, + 3, /* tenantClusterSize */ + numNodes, + base.TestingKnobs{}, + ) + defer helper.Cleanup(ctx, t) + tenantA := helper.TestCluster().Tenant(0) tenantB := helper.ControlCluster().Tenant(0) @@ -212,18 +225,22 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten return bytes.Join(keys, nil) } - controlStats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx, - &roachpb.SpanStatsRequest{ - NodeID: "0", // 0 indicates we want stats from all nodes. - Spans: []roachpb.Span{aSpan}, - }) - require.NoError(t, err) - // Create a new range in this tenant. - _, _, err = helper.HostCluster().Server(0).SplitRange(makeKey(tPrefix, roachpb.Key("c"))) + newRangeKey := makeKey(tPrefix, roachpb.Key("c")) + _, newDesc, err := helper.HostCluster().Server(0).SplitRange(newRangeKey) require.NoError(t, err) - // Wait for the split to finish and propagate. + // Wait until the range split occurs. + testutils.SucceedsSoon(t, func() error { + desc, err := helper.HostCluster().LookupRange(newRangeKey) + require.NoError(t, err) + if !desc.StartKey.Equal(newDesc.StartKey) { + return errors.New("range has not split") + } + return nil + }) + + // Wait for the new range to replicate. err = helper.HostCluster().WaitForFullReplication() require.NoError(t, err) @@ -237,19 +254,6 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten } } - stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx, - &roachpb.SpanStatsRequest{ - NodeID: "0", // 0 indicates we want stats from all nodes. - Spans: []roachpb.Span{aSpan}, - }) - - require.NoError(t, err) - - controlSpanStats := controlStats.SpanToStats[aSpan.String()] - testSpanStats := stats.SpanToStats[aSpan.String()] - require.Equal(t, controlSpanStats.RangeCount+1, testSpanStats.RangeCount) - require.Equal(t, controlSpanStats.TotalStats.LiveCount+int64(len(incKeys)), testSpanStats.TotalStats.LiveCount) - // Make a multi-span call type spanCase struct { span roachpb.Span @@ -264,7 +268,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[1])), }, expectedRangeCount: 1, - expectedLiveCount: 1, + expectedLiveCount: 1 * numNodes, }, { // "d", "f" - single range, multiple keys @@ -273,7 +277,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[5])), }, expectedRangeCount: 1, - expectedLiveCount: 2, + expectedLiveCount: 2 * numNodes, }, { // "bb", "e" - multiple ranges, multiple keys @@ -282,7 +286,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[4])), }, expectedRangeCount: 2, - expectedLiveCount: 2, + expectedLiveCount: 2 * numNodes, }, { @@ -292,7 +296,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[3])), }, expectedRangeCount: 2, - expectedLiveCount: 3, + expectedLiveCount: 3 * numNodes, }, } @@ -301,19 +305,29 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten spans = append(spans, sc.span) } - stats, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx, - &roachpb.SpanStatsRequest{ - NodeID: "0", // 0 indicates we want stats from all nodes. - Spans: spans, - }) + testutils.SucceedsSoon(t, func() error { + stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx, + &roachpb.SpanStatsRequest{ + NodeID: "0", // 0 indicates we want stats from all nodes. + Spans: spans, + }) + + require.NoError(t, err) + + for _, sc := range spanCases { + spanStats := stats.SpanToStats[sc.span.String()] + if sc.expectedRangeCount != spanStats.RangeCount { + return errors.Newf("mismatch on expected range count for span case with span %v", sc.span.String()) + } + + if sc.expectedLiveCount != spanStats.TotalStats.LiveCount { + return errors.Newf("mismatch on expected live count for span case with span %v", sc.span.String()) + } + } + + return nil + }) - require.NoError(t, err) - // Check each span has their expected values. - for _, sc := range spanCases { - spanStats := stats.SpanToStats[sc.span.String()] - require.Equal(t, spanStats.RangeCount, sc.expectedRangeCount, fmt.Sprintf("mismatch on expected range count for span case with span %v", sc.span.String())) - require.Equal(t, spanStats.TotalStats.LiveCount, sc.expectedLiveCount, fmt.Sprintf("mismatch on expected live count for span case with span %v", sc.span.String())) - } }) } diff --git a/pkg/ccl/serverccl/tenant_test_utils.go b/pkg/ccl/serverccl/tenant_test_utils.go index 41aa5ecd296a..172a175efb43 100644 --- a/pkg/ccl/serverccl/tenant_test_utils.go +++ b/pkg/ccl/serverccl/tenant_test_utils.go @@ -141,7 +141,7 @@ var _ TenantTestHelper = &tenantTestHelper{} // NewTestTenantHelper constructs a TenantTestHelper instance. func NewTestTenantHelper( - t *testing.T, tenantClusterSize int, knobs base.TestingKnobs, + t *testing.T, tenantClusterSize int, numNodes int, knobs base.TestingKnobs, ) TenantTestHelper { t.Helper() @@ -151,9 +151,13 @@ func NewTestTenantHelper( params.Knobs = knobs // We're running tenant tests, no need for a default tenant. params.DefaultTestTenant = base.TestTenantDisabled - testCluster := serverutils.StartNewTestCluster(t, 1 /* numNodes */, base.TestClusterArgs{ - ServerArgs: params, - }) + testCluster := serverutils.StartNewTestCluster( + t, + numNodes, + base.TestClusterArgs{ + ServerArgs: params, + }, + ) server := testCluster.Server(0) return &tenantTestHelper{ diff --git a/pkg/server/span_stats_test.go b/pkg/server/span_stats_test.go index b66eec552655..c43104bb9d72 100644 --- a/pkg/server/span_stats_test.go +++ b/pkg/server/span_stats_test.go @@ -70,12 +70,11 @@ func TestSpanStatsMetaScan(t *testing.T) { } } -func TestLocalSpanStats(t *testing.T) { +func TestSpanStatsFanOut(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - skip.UnderStress(t) ctx := context.Background() - numNodes := 3 + const numNodes = 3 tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{}) defer tc.Stopper().Stop(ctx) @@ -149,47 +148,47 @@ func TestLocalSpanStats(t *testing.T) { {spans[4], 2, int64(numNodes * 3)}, {spans[5], 1, int64(numNodes * 2)}, } - // Multi-span request - multiResult, err := s.StatusServer().(serverpb.StatusServer).SpanStats(ctx, - &roachpb.SpanStatsRequest{ - NodeID: "0", - Spans: spans, - }, - ) - require.NoError(t, err) - // Verify stats across different spans. - for _, tcase := range testCases { - rSpan, err := keys.SpanAddr(tcase.span) - require.NoError(t, err) + testutils.SucceedsSoon(t, func() error { - // Assert expected values from multi-span request - spanStats := multiResult.SpanToStats[tcase.span.String()] - require.Equal( - t, - tcase.expectedRanges, - spanStats.RangeCount, - fmt.Sprintf( - "Multi-span: expected %d ranges in span [%s - %s], found %d", - tcase.expectedRanges, - rSpan.Key.String(), - rSpan.EndKey.String(), - spanStats.RangeCount, - ), - ) - require.Equal( - t, - tcase.expectedKeys, - spanStats.TotalStats.LiveCount, - fmt.Sprintf( - "Multi-span: expected %d keys in span [%s - %s], found %d", - tcase.expectedKeys, - rSpan.Key.String(), - rSpan.EndKey.String(), - spanStats.TotalStats.LiveCount, - ), + // Multi-span request + multiResult, err := s.StatusServer().(serverpb.StatusServer).SpanStats(ctx, + &roachpb.SpanStatsRequest{ + NodeID: "0", + Spans: spans, + }, ) - } + require.NoError(t, err) + + // Verify stats across different spans. + for _, tcase := range testCases { + rSpan, err := keys.SpanAddr(tcase.span) + require.NoError(t, err) + + // Assert expected values from multi-span request + spanStats := multiResult.SpanToStats[tcase.span.String()] + if tcase.expectedRanges != spanStats.RangeCount { + return errors.Newf("Multi-span: expected %d ranges in span [%s - %s], found %d", + tcase.expectedRanges, + rSpan.Key.String(), + rSpan.EndKey.String(), + spanStats.RangeCount, + ) + } + if tcase.expectedKeys != spanStats.TotalStats.LiveCount { + return errors.Newf( + "Multi-span: expected %d keys in span [%s - %s], found %d", + tcase.expectedKeys, + rSpan.Key.String(), + rSpan.EndKey.String(), + spanStats.TotalStats.LiveCount, + ) + } + } + + return nil + }) + } // BenchmarkSpanStats measures the cost of collecting span statistics.