Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
106551: server, statusccl: de-flake span stats fan-out tests r=zachlite a=zachlite

This commit fixes flaky behavior while running
`TestSpanStatsFanOut` and `TestTenantSpanStats` under stress.

Both tests have been updated to ensure the following behavior:
- The tests make sure range splits occur before proceeding.
- The tests will retry their assertions to give the new key-value pairs time to replicate.

Additionally, `NewTestTenantHelper` was updated to accept a parameter for the number of nodes in the test host cluster. `TestTenantSpanStats` now uses a 3-node cluster to test a real fan-out.

Resolves #99770
Resolves #99559
Epic:none
Release note: None

106627: diagnosticsccl: unskip TestTenantReport r=yuzefovich a=yuzefovich

The flakiness of the test was resolved by #106053.

Addresses: #101622.
Epic: None

Release note: None

Co-authored-by: zachlite <[email protected]>
Co-authored-by: Yahor Yuzefovich <[email protected]>
  • Loading branch information
3 people committed Jul 11, 2023
3 parents c566d1f + f1bc66e + 749255e commit 0b628fb
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 78 deletions.
1 change: 0 additions & 1 deletion pkg/ccl/serverccl/diagnosticsccl/reporter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ const elemName = "somestring"

func TestTenantReport(t *testing.T) {
defer leaktest.AfterTest(t)()
skip.WithIssue(t, 101622, "flaky test")
defer log.Scope(t).Close(t)

rt := startReporterTest(t, base.TestTenantDisabled)
Expand Down
71 changes: 35 additions & 36 deletions pkg/ccl/serverccl/statusccl/tenant_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,6 @@ func TestTenantStatusAPI(t *testing.T) {
})

t.Run("tenant_span_stats", func(t *testing.T) {
skip.UnderStressWithIssue(t, 99559)
skip.UnderDeadlockWithIssue(t, 99770)
testTenantSpanStats(ctx, t, testHelper)
})

Expand Down Expand Up @@ -212,18 +210,22 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
return bytes.Join(keys, nil)
}

controlStats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan},
})
require.NoError(t, err)

// Create a new range in this tenant.
_, _, err = helper.HostCluster().Server(0).SplitRange(makeKey(tPrefix, roachpb.Key("c")))
newRangeKey := makeKey(tPrefix, roachpb.Key("c"))
_, newDesc, err := helper.HostCluster().Server(0).SplitRange(newRangeKey)
require.NoError(t, err)

// Wait for the split to finish and propagate.
// Wait until the range split occurs.
testutils.SucceedsSoon(t, func() error {
desc, err := helper.HostCluster().LookupRange(newRangeKey)
require.NoError(t, err)
if !desc.StartKey.Equal(newDesc.StartKey) {
return errors.New("range has not split")
}
return nil
})

// Wait for the new range to replicate.
err = helper.HostCluster().WaitForFullReplication()
require.NoError(t, err)

Expand All @@ -237,19 +239,6 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
}
}

stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan},
})

require.NoError(t, err)

controlSpanStats := controlStats.SpanToStats[aSpan.String()]
testSpanStats := stats.SpanToStats[aSpan.String()]
require.Equal(t, controlSpanStats.RangeCount+1, testSpanStats.RangeCount)
require.Equal(t, controlSpanStats.TotalStats.LiveCount+int64(len(incKeys)), testSpanStats.TotalStats.LiveCount)

// Make a multi-span call
type spanCase struct {
span roachpb.Span
Expand Down Expand Up @@ -301,19 +290,29 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten
spans = append(spans, sc.span)
}

stats, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: spans,
})
testutils.SucceedsSoon(t, func() error {
stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: spans,
})

require.NoError(t, err)

for _, sc := range spanCases {
spanStats := stats.SpanToStats[sc.span.String()]
if sc.expectedRangeCount != spanStats.RangeCount {
return errors.Newf("mismatch on expected range count for span case with span %v", sc.span.String())
}

if sc.expectedLiveCount != spanStats.TotalStats.LiveCount {
return errors.Newf("mismatch on expected live count for span case with span %v", sc.span.String())
}
}

return nil
})

require.NoError(t, err)
// Check each span has their expected values.
for _, sc := range spanCases {
spanStats := stats.SpanToStats[sc.span.String()]
require.Equal(t, spanStats.RangeCount, sc.expectedRangeCount, fmt.Sprintf("mismatch on expected range count for span case with span %v", sc.span.String()))
require.Equal(t, spanStats.TotalStats.LiveCount, sc.expectedLiveCount, fmt.Sprintf("mismatch on expected live count for span case with span %v", sc.span.String()))
}
})

}
Expand Down
81 changes: 40 additions & 41 deletions pkg/server/span_stats_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,11 @@ func TestSpanStatsMetaScan(t *testing.T) {
}
}

func TestLocalSpanStats(t *testing.T) {
func TestSpanStatsFanOut(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
skip.UnderStress(t)
ctx := context.Background()
numNodes := 3
const numNodes = 3
tc := testcluster.StartTestCluster(t, numNodes, base.TestClusterArgs{})
defer tc.Stopper().Stop(ctx)

Expand Down Expand Up @@ -149,47 +148,47 @@ func TestLocalSpanStats(t *testing.T) {
{spans[4], 2, int64(numNodes * 3)},
{spans[5], 1, int64(numNodes * 2)},
}
// Multi-span request
multiResult, err := s.StatusServer().(serverpb.StatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0",
Spans: spans,
},
)
require.NoError(t, err)

// Verify stats across different spans.
for _, tcase := range testCases {
rSpan, err := keys.SpanAddr(tcase.span)
require.NoError(t, err)
testutils.SucceedsSoon(t, func() error {

// Assert expected values from multi-span request
spanStats := multiResult.SpanToStats[tcase.span.String()]
require.Equal(
t,
tcase.expectedRanges,
spanStats.RangeCount,
fmt.Sprintf(
"Multi-span: expected %d ranges in span [%s - %s], found %d",
tcase.expectedRanges,
rSpan.Key.String(),
rSpan.EndKey.String(),
spanStats.RangeCount,
),
)
require.Equal(
t,
tcase.expectedKeys,
spanStats.TotalStats.LiveCount,
fmt.Sprintf(
"Multi-span: expected %d keys in span [%s - %s], found %d",
tcase.expectedKeys,
rSpan.Key.String(),
rSpan.EndKey.String(),
spanStats.TotalStats.LiveCount,
),
// Multi-span request
multiResult, err := s.StatusServer().(serverpb.StatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0",
Spans: spans,
},
)
}
require.NoError(t, err)

// Verify stats across different spans.
for _, tcase := range testCases {
rSpan, err := keys.SpanAddr(tcase.span)
require.NoError(t, err)

// Assert expected values from multi-span request
spanStats := multiResult.SpanToStats[tcase.span.String()]
if tcase.expectedRanges != spanStats.RangeCount {
return errors.Newf("Multi-span: expected %d ranges in span [%s - %s], found %d",
tcase.expectedRanges,
rSpan.Key.String(),
rSpan.EndKey.String(),
spanStats.RangeCount,
)
}
if tcase.expectedKeys != spanStats.TotalStats.LiveCount {
return errors.Newf(
"Multi-span: expected %d keys in span [%s - %s], found %d",
tcase.expectedKeys,
rSpan.Key.String(),
rSpan.EndKey.String(),
spanStats.TotalStats.LiveCount,
)
}
}

return nil
})

}

// BenchmarkSpanStats measures the cost of collecting span statistics.
Expand Down

0 comments on commit 0b628fb

Please sign in to comment.