From f1bc66e4cb2165935550cc0b026fc8ab169cafa9 Mon Sep 17 00:00:00 2001 From: zachlite Date: Mon, 10 Jul 2023 11:58:22 -0400 Subject: [PATCH] server, statusccl: de-flake span stats fan-out tests This commit fixes flaky behavior while running `TestSpanStatsFanOut` and `TestTenantSpanStats` under stress. Both tests have been updated to ensure the following behavior: - The tests make sure range splits occur before proceeding. - The tests will retry their assertions to give the new key-value pairs time to replicate. Additionally, `NewTestTenantHelper` was updated to accept a parameter for the number of nodes in the test host cluster. `TestTenantSpanStats` now uses a 3-node cluster to test a real fan-out. Resolves #99770 Resolves #99559 Epic:none Release note: None --- .../serverccl/statusccl/tenant_status_test.go | 71 ++++++++-------- pkg/server/span_stats_test.go | 81 +++++++++---------- 2 files changed, 75 insertions(+), 77 deletions(-) diff --git a/pkg/ccl/serverccl/statusccl/tenant_status_test.go b/pkg/ccl/serverccl/statusccl/tenant_status_test.go index 19154fe8c1dd..1e01da43e48d 100644 --- a/pkg/ccl/serverccl/statusccl/tenant_status_test.go +++ b/pkg/ccl/serverccl/statusccl/tenant_status_test.go @@ -134,8 +134,6 @@ func TestTenantStatusAPI(t *testing.T) { }) t.Run("tenant_span_stats", func(t *testing.T) { - skip.UnderStressWithIssue(t, 99559) - skip.UnderDeadlockWithIssue(t, 99770) testTenantSpanStats(ctx, t, testHelper) }) @@ -212,18 +210,22 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten return bytes.Join(keys, nil) } - controlStats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx, - &roachpb.SpanStatsRequest{ - NodeID: "0", // 0 indicates we want stats from all nodes. - Spans: []roachpb.Span{aSpan}, - }) - require.NoError(t, err) - // Create a new range in this tenant. - _, _, err = helper.HostCluster().Server(0).SplitRange(makeKey(tPrefix, roachpb.Key("c"))) + newRangeKey := makeKey(tPrefix, roachpb.Key("c")) + _, newDesc, err := helper.HostCluster().Server(0).SplitRange(newRangeKey) require.NoError(t, err) - // Wait for the split to finish and propagate. + // Wait until the range split occurs. + testutils.SucceedsSoon(t, func() error { + desc, err := helper.HostCluster().LookupRange(newRangeKey) + require.NoError(t, err) + if !desc.StartKey.Equal(newDesc.StartKey) { + return errors.New("range has not split") + } + return nil + }) + + // Wait for the new range to replicate. err = helper.HostCluster().WaitForFullReplication() require.NoError(t, err) @@ -237,19 +239,6 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten } } - stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx, - &roachpb.SpanStatsRequest{ - NodeID: "0", // 0 indicates we want stats from all nodes. - Spans: []roachpb.Span{aSpan}, - }) - - require.NoError(t, err) - - controlSpanStats := controlStats.SpanToStats[aSpan.String()] - testSpanStats := stats.SpanToStats[aSpan.String()] - require.Equal(t, controlSpanStats.RangeCount+1, testSpanStats.RangeCount) - require.Equal(t, controlSpanStats.TotalStats.LiveCount+int64(len(incKeys)), testSpanStats.TotalStats.LiveCount) - // Make a multi-span call type spanCase struct { span roachpb.Span @@ -301,19 +290,29 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten spans = append(spans, sc.span) } - stats, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx, - &roachpb.SpanStatsRequest{ - NodeID: "0", // 0 indicates we want stats from all nodes. - Spans: spans, - }) + testutils.SucceedsSoon(t, func() error { + stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx, + &roachpb.SpanStatsRequest{ + NodeID: "0", // 0 indicates we want stats from all nodes. + Spans: spans, + }) + + require.NoError(t, err) + + for _, sc := range spanCases { + spanStats := stats.SpanToStats[sc.span.String()] + if sc.expectedRangeCount != spanStats.RangeCount { + return errors.Newf("mismatch on expected range count for span case with span %v", sc.span.String()) + } + + if sc.expectedLiveCount != spanStats.TotalStats.LiveCount { + return errors.Newf("mismatch on expected live count for span case with span %v", sc.span.String()) + } + } + + return nil + }) - require.NoError(t, err) - // Check each span has their expected values. - for _, sc := range spanCases { - spanStats := stats.SpanToStats[sc.span.String()] - require.Equal(t, spanStats.RangeCount, sc.expectedRangeCount, fmt.Sprintf("mismatch on expected range count for span case with span %v", sc.span.String())) - require.Equal(t, spanStats.TotalStats.LiveCount, sc.expectedLiveCount, fmt.Sprintf("mismatch on expected live count for span case with span %v", sc.span.String())) - } }) } diff --git a/pkg/server/span_stats_test.go b/pkg/server/span_stats_test.go index b66eec552655..c43104bb9d72 100644 --- a/pkg/server/span_stats_test.go +++ b/pkg/server/span_stats_test.go @@ -70,12 +70,11 @@ func TestSpanStatsMetaScan(t *testing.T) { } } -func TestLocalSpanStats(t *testing.T) { +func TestSpanStatsFanOut(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - skip.UnderStress(t) ctx := context.Background() - numNodes := 3 + const numNodes = 3 tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{}) defer tc.Stopper().Stop(ctx) @@ -149,47 +148,47 @@ func TestLocalSpanStats(t *testing.T) { {spans[4], 2, int64(numNodes * 3)}, {spans[5], 1, int64(numNodes * 2)}, } - // Multi-span request - multiResult, err := s.StatusServer().(serverpb.StatusServer).SpanStats(ctx, - &roachpb.SpanStatsRequest{ - NodeID: "0", - Spans: spans, - }, - ) - require.NoError(t, err) - // Verify stats across different spans. - for _, tcase := range testCases { - rSpan, err := keys.SpanAddr(tcase.span) - require.NoError(t, err) + testutils.SucceedsSoon(t, func() error { - // Assert expected values from multi-span request - spanStats := multiResult.SpanToStats[tcase.span.String()] - require.Equal( - t, - tcase.expectedRanges, - spanStats.RangeCount, - fmt.Sprintf( - "Multi-span: expected %d ranges in span [%s - %s], found %d", - tcase.expectedRanges, - rSpan.Key.String(), - rSpan.EndKey.String(), - spanStats.RangeCount, - ), - ) - require.Equal( - t, - tcase.expectedKeys, - spanStats.TotalStats.LiveCount, - fmt.Sprintf( - "Multi-span: expected %d keys in span [%s - %s], found %d", - tcase.expectedKeys, - rSpan.Key.String(), - rSpan.EndKey.String(), - spanStats.TotalStats.LiveCount, - ), + // Multi-span request + multiResult, err := s.StatusServer().(serverpb.StatusServer).SpanStats(ctx, + &roachpb.SpanStatsRequest{ + NodeID: "0", + Spans: spans, + }, ) - } + require.NoError(t, err) + + // Verify stats across different spans. + for _, tcase := range testCases { + rSpan, err := keys.SpanAddr(tcase.span) + require.NoError(t, err) + + // Assert expected values from multi-span request + spanStats := multiResult.SpanToStats[tcase.span.String()] + if tcase.expectedRanges != spanStats.RangeCount { + return errors.Newf("Multi-span: expected %d ranges in span [%s - %s], found %d", + tcase.expectedRanges, + rSpan.Key.String(), + rSpan.EndKey.String(), + spanStats.RangeCount, + ) + } + if tcase.expectedKeys != spanStats.TotalStats.LiveCount { + return errors.Newf( + "Multi-span: expected %d keys in span [%s - %s], found %d", + tcase.expectedKeys, + rSpan.Key.String(), + rSpan.EndKey.String(), + spanStats.TotalStats.LiveCount, + ) + } + } + + return nil + }) + } // BenchmarkSpanStats measures the cost of collecting span statistics.