server, statusccl: de-flake span stats fan-out tests

This commit fixes flaky behavior while running `TestSpanStatsFanOut` and `TestTenantSpanStats` under stress. Both tests have been updated to ensure the following behavior: - The tests make sure range splits occur before proceeding. - The tests will retry their assertions to give the new key-value pairs time to replicate. Additionally, `NewTestTenantHelper` was updated to accept a parameter for the number of nodes in the test host cluster. `TestTenantSpanStats` now uses a 3-node cluster to test a real fan-out. Resolves #99770 Resolves #99559 Epic:none Release note: None
cockroachdb · Jul 10, 2023 · 68753c7 · 68753c7
1 parent e2af0b8
commit 68753c7
Show file tree

Hide file tree

Showing 5 changed files with 120 additions and 93 deletions.
diff --git a/pkg/ccl/serverccl/adminccl/tenant_admin_test.go b/pkg/ccl/serverccl/adminccl/tenant_admin_test.go
@@ -42,7 +42,12 @@ func TestTenantAdminAPI(t *testing.T) {
 		StoreDisableCoalesceAdjacent: true,
 	}
 
-	testHelper := serverccl.NewTestTenantHelper(t, 3 /* tenantClusterSize */, knobs)
+	testHelper := serverccl.NewTestTenantHelper(
+		t,
+		3, /* tenantClusterSize */
+		1, /* numNodes */
+		knobs,
+	)
 	defer testHelper.Cleanup(ctx, t)
 
 	t.Run("tenant_jobs", func(t *testing.T) {

diff --git a/pkg/ccl/serverccl/api_v2_tenant_test.go b/pkg/ccl/serverccl/api_v2_tenant_test.go
@@ -32,7 +32,12 @@ func TestExecSQL(t *testing.T) {
 
 	ctx := context.Background()
 
-	testHelper := NewTestTenantHelper(t, 3 /* tenantClusterSize */, base.TestingKnobs{})
+	testHelper := NewTestTenantHelper(
+		t,
+		3, /* tenantClusterSize */
+		1, /* numNodes */
+		base.TestingKnobs{},
+	)
 	defer testHelper.Cleanup(ctx, t)
 
 	tenantCluster := testHelper.TestCluster()

diff --git a/pkg/ccl/serverccl/statusccl/tenant_status_test.go b/pkg/ccl/serverccl/statusccl/tenant_status_test.go
@@ -66,7 +66,12 @@ func TestTenantStatusAPI(t *testing.T) {
 		StoreDisableCoalesceAdjacent: true,
 	}
 
-	testHelper := serverccl.NewTestTenantHelper(t, 3 /* tenantClusterSize */, knobs)
+	testHelper := serverccl.NewTestTenantHelper(
+		t,
+		3, /* tenantClusterSize */
+		1, /* numNodes */
+		knobs,
+	)
 	defer testHelper.Cleanup(ctx, t)
 
 	// Speed up propagation of tenant capability changes.
@@ -133,18 +138,26 @@ func TestTenantStatusAPI(t *testing.T) {
 		testTenantHotRanges(ctx, t, testHelper)
 	})
 
-	t.Run("tenant_span_stats", func(t *testing.T) {
-		skip.UnderStressWithIssue(t, 99559)
-		skip.UnderDeadlockWithIssue(t, 99770)
-		testTenantSpanStats(ctx, t, testHelper)
-	})
-
 	t.Run("tenant_nodes_capability", func(t *testing.T) {
 		testTenantNodesCapability(ctx, t, testHelper)
 	})
 }
 
-func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.TenantTestHelper) {
+func TestTenantSpanStats(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	s := log.ScopeWithoutShowLogs(t)
+	defer s.Close(t)
+	defer s.SetupSingleFileLogging()()
+	ctx := context.Background()
+	const numNodes = 3
+	helper := serverccl.NewTestTenantHelper(
+		t,
+		3, /* tenantClusterSize */
+		numNodes,
+		base.TestingKnobs{},
+	)
+	defer helper.Cleanup(ctx, t)
+
 	tenantA := helper.TestCluster().Tenant(0)
 	tenantB := helper.ControlCluster().Tenant(0)
 
@@ -212,18 +225,22 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
 			return bytes.Join(keys, nil)
 		}
 
-		controlStats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
-			&roachpb.SpanStatsRequest{
-				NodeID: "0", // 0 indicates we want stats from all nodes.
-				Spans:  []roachpb.Span{aSpan},
-			})
-		require.NoError(t, err)
-
 		// Create a new range in this tenant.
-		_, _, err = helper.HostCluster().Server(0).SplitRange(makeKey(tPrefix, roachpb.Key("c")))
+		newRangeKey := makeKey(tPrefix, roachpb.Key("c"))
+		_, newDesc, err := helper.HostCluster().Server(0).SplitRange(newRangeKey)
 		require.NoError(t, err)
 
-		// Wait for the split to finish and propagate.
+		// Wait until the range split occurs.
+		testutils.SucceedsSoon(t, func() error {
+			desc, err := helper.HostCluster().LookupRange(newRangeKey)
+			require.NoError(t, err)
+			if !desc.StartKey.Equal(newDesc.StartKey) {
+				return errors.New("range has not split")
+			}
+			return nil
+		})
+
+		// Wait for the new range to replicate.
 		err = helper.HostCluster().WaitForFullReplication()
 		require.NoError(t, err)
 
@@ -237,19 +254,6 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
 			}
 		}
 
-		stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
-			&roachpb.SpanStatsRequest{
-				NodeID: "0", // 0 indicates we want stats from all nodes.
-				Spans:  []roachpb.Span{aSpan},
-			})
-
-		require.NoError(t, err)
-
-		controlSpanStats := controlStats.SpanToStats[aSpan.String()]
-		testSpanStats := stats.SpanToStats[aSpan.String()]
-		require.Equal(t, controlSpanStats.RangeCount+1, testSpanStats.RangeCount)
-		require.Equal(t, controlSpanStats.TotalStats.LiveCount+int64(len(incKeys)), testSpanStats.TotalStats.LiveCount)
-
 		// Make a multi-span call
 		type spanCase struct {
 			span               roachpb.Span
@@ -264,7 +268,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
 					EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[1])),
 				},
 				expectedRangeCount: 1,
-				expectedLiveCount:  1,
+				expectedLiveCount:  1 * numNodes,
 			},
 			{
 				// "d", "f" - single range, multiple keys
@@ -273,7 +277,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
 					EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[5])),
 				},
 				expectedRangeCount: 1,
-				expectedLiveCount:  2,
+				expectedLiveCount:  2 * numNodes,
 			},
 			{
 				// "bb", "e" - multiple ranges, multiple keys
@@ -282,7 +286,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
 					EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[4])),
 				},
 				expectedRangeCount: 2,
-				expectedLiveCount:  2,
+				expectedLiveCount:  2 * numNodes,
 			},
 
 			{
@@ -292,7 +296,7 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
 					EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[3])),
 				},
 				expectedRangeCount: 2,
-				expectedLiveCount:  3,
+				expectedLiveCount:  3 * numNodes,
 			},
 		}
 
@@ -301,19 +305,29 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
 			spans = append(spans, sc.span)
 		}
 
-		stats, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
-			&roachpb.SpanStatsRequest{
-				NodeID: "0", // 0 indicates we want stats from all nodes.
-				Spans:  spans,
-			})
+		testutils.SucceedsSoon(t, func() error {
+			stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
+				&roachpb.SpanStatsRequest{
+					NodeID: "0", // 0 indicates we want stats from all nodes.
+					Spans:  spans,
+				})
+
+			require.NoError(t, err)
+
+			for _, sc := range spanCases {
+				spanStats := stats.SpanToStats[sc.span.String()]
+				if sc.expectedRangeCount != spanStats.RangeCount {
+					return errors.Newf("mismatch on expected range count for span case with span %v", sc.span.String())
+				}
+
+				if sc.expectedLiveCount != spanStats.TotalStats.LiveCount {
+					return errors.Newf("mismatch on expected live count for span case with span %v", sc.span.String())
+				}
+			}
+
+			return nil
+		})
 
-		require.NoError(t, err)
-		// Check each span has their expected values.
-		for _, sc := range spanCases {
-			spanStats := stats.SpanToStats[sc.span.String()]
-			require.Equal(t, spanStats.RangeCount, sc.expectedRangeCount, fmt.Sprintf("mismatch on expected range count for span case with span %v", sc.span.String()))
-			require.Equal(t, spanStats.TotalStats.LiveCount, sc.expectedLiveCount, fmt.Sprintf("mismatch on expected live count for span case with span %v", sc.span.String()))
-		}
 	})
 
 }

diff --git a/pkg/ccl/serverccl/tenant_test_utils.go b/pkg/ccl/serverccl/tenant_test_utils.go
@@ -141,7 +141,7 @@ var _ TenantTestHelper = &tenantTestHelper{}
 
 // NewTestTenantHelper constructs a TenantTestHelper instance.
 func NewTestTenantHelper(
-	t *testing.T, tenantClusterSize int, knobs base.TestingKnobs,
+	t *testing.T, tenantClusterSize int, numNodes int, knobs base.TestingKnobs,
 ) TenantTestHelper {
 	t.Helper()
 
@@ -151,9 +151,13 @@ func NewTestTenantHelper(
 	params.Knobs = knobs
 	// We're running tenant tests, no need for a default tenant.
 	params.DefaultTestTenant = base.TestTenantDisabled
-	testCluster := serverutils.StartNewTestCluster(t, 1 /* numNodes */, base.TestClusterArgs{
-		ServerArgs: params,
-	})
+	testCluster := serverutils.StartNewTestCluster(
+		t,
+		numNodes,
+		base.TestClusterArgs{
+			ServerArgs: params,
+		},
+	)
 	server := testCluster.Server(0)
 
 	return &tenantTestHelper{

diff --git a/pkg/server/span_stats_test.go b/pkg/server/span_stats_test.go
@@ -70,12 +70,11 @@ func TestSpanStatsMetaScan(t *testing.T) {
 	}
 }
 
-func TestLocalSpanStats(t *testing.T) {
+func TestSpanStatsFanOut(t *testing.T) {
 	defer leaktest.AfterTest(t)()
 	defer log.Scope(t).Close(t)
-	skip.UnderStress(t)
 	ctx := context.Background()
-	numNodes := 3
+	const numNodes = 3
 	tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{})
 	defer tc.Stopper().Stop(ctx)
 
@@ -149,47 +148,47 @@ func TestLocalSpanStats(t *testing.T) {
 		{spans[4], 2, int64(numNodes * 3)},
 		{spans[5], 1, int64(numNodes * 2)},
 	}
-	// Multi-span request
-	multiResult, err := s.StatusServer().(serverpb.StatusServer).SpanStats(ctx,
-		&roachpb.SpanStatsRequest{
-			NodeID: "0",
-			Spans:  spans,
-		},
-	)
-	require.NoError(t, err)
 
-	// Verify stats across different spans.
-	for _, tcase := range testCases {
-		rSpan, err := keys.SpanAddr(tcase.span)
-		require.NoError(t, err)
+	testutils.SucceedsSoon(t, func() error {
 
-		// Assert expected values from multi-span request
-		spanStats := multiResult.SpanToStats[tcase.span.String()]
-		require.Equal(
-			t,
-			tcase.expectedRanges,
-			spanStats.RangeCount,
-			fmt.Sprintf(
-				"Multi-span: expected %d ranges in span [%s - %s], found %d",
-				tcase.expectedRanges,
-				rSpan.Key.String(),
-				rSpan.EndKey.String(),
-				spanStats.RangeCount,
-			),
-		)
-		require.Equal(
-			t,
-			tcase.expectedKeys,
-			spanStats.TotalStats.LiveCount,
-			fmt.Sprintf(
-				"Multi-span: expected %d keys in span [%s - %s], found %d",
-				tcase.expectedKeys,
-				rSpan.Key.String(),
-				rSpan.EndKey.String(),
-				spanStats.TotalStats.LiveCount,
-			),
+		// Multi-span request
+		multiResult, err := s.StatusServer().(serverpb.StatusServer).SpanStats(ctx,
+			&roachpb.SpanStatsRequest{
+				NodeID: "0",
+				Spans:  spans,
+			},
 		)
-	}
+		require.NoError(t, err)
+
+		// Verify stats across different spans.
+		for _, tcase := range testCases {
+			rSpan, err := keys.SpanAddr(tcase.span)
+			require.NoError(t, err)
+
+			// Assert expected values from multi-span request
+			spanStats := multiResult.SpanToStats[tcase.span.String()]
+			if tcase.expectedRanges != spanStats.RangeCount {
+				return errors.Newf("Multi-span: expected %d ranges in span [%s - %s], found %d",
+					tcase.expectedRanges,
+					rSpan.Key.String(),
+					rSpan.EndKey.String(),
+					spanStats.RangeCount,
+				)
+			}
+			if tcase.expectedKeys != spanStats.TotalStats.LiveCount {
+				return errors.Newf(
+					"Multi-span: expected %d keys in span [%s - %s], found %d",
+					tcase.expectedKeys,
+					rSpan.Key.String(),
+					rSpan.EndKey.String(),
+					spanStats.TotalStats.LiveCount,
+				)
+			}
+		}
+
+		return nil
+	})
+
 }
 
 // BenchmarkSpanStats measures the cost of collecting span statistics.