Skip to content

Commit

Permalink
server: extend span statistics endpoint
Browse files Browse the repository at this point in the history
Extends: #96223

This PR extends the implementation of our SpanStats RPC endpoint to
fetch stats for multiple spans at once. By extending the endpoint, we
amortize the cost of the RPC's node fanout across all requested spans,
whereas previously, we were issuing a fanout per span requested.
Additionally, this change batches KV layer requests for ranges fully
contained by the span, instead of issuing a request per fully contained
range.

Release note: None

https://cockroachlabs.atlassian.net/browse/DOC-1355
Informs: #33316
Epic: CRDB-8035
  • Loading branch information
Thomas Hardy committed Mar 24, 2023
1 parent 96a9871 commit a7ff80c
Show file tree
Hide file tree
Showing 32 changed files with 875 additions and 248 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,7 @@ go_test(
"//pkg/sql/opt/exec/execbuilder:testdata", # keep
],
shard_count = 4,
tags = [
"ccl_test",
"cpu:2",
],
tags = ["cpu:2"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
5 changes: 1 addition & 4 deletions pkg/ccl/logictestccl/tests/3node-tenant/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,7 @@ go_test(
"//pkg/sql/opt/exec/execbuilder:testdata", # keep
],
shard_count = 48,
tags = [
"ccl_test",
"cpu:2",
],
tags = ["cpu:2"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
5 changes: 1 addition & 4 deletions pkg/ccl/logictestccl/tests/5node/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 5,
tags = [
"ccl_test",
"cpu:3",
],
tags = ["cpu:3"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
5 changes: 1 addition & 4 deletions pkg/ccl/logictestccl/tests/fakedist-disk/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 5,
tags = [
"ccl_test",
"cpu:2",
],
tags = ["cpu:2"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
5 changes: 1 addition & 4 deletions pkg/ccl/logictestccl/tests/fakedist-vec-off/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 5,
tags = [
"ccl_test",
"cpu:2",
],
tags = ["cpu:2"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
5 changes: 1 addition & 4 deletions pkg/ccl/logictestccl/tests/fakedist/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 6,
tags = [
"ccl_test",
"cpu:2",
],
tags = ["cpu:2"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 5,
tags = [
"ccl_test",
"cpu:1",
],
tags = ["cpu:1"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
5 changes: 1 addition & 4 deletions pkg/ccl/logictestccl/tests/local-vec-off/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 5,
tags = [
"ccl_test",
"cpu:1",
],
tags = ["cpu:1"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
5 changes: 1 addition & 4 deletions pkg/ccl/logictestccl/tests/local/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 19,
tags = [
"ccl_test",
"cpu:1",
],
tags = ["cpu:1"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 4,
tags = [
"ccl_test",
"cpu:4",
],
tags = ["cpu:4"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 1,
tags = [
"ccl_test",
"cpu:2",
],
tags = ["cpu:2"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 19,
tags = [
"ccl_test",
"cpu:4",
],
tags = ["cpu:4"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 14,
tags = [
"ccl_test",
"cpu:4",
],
tags = ["cpu:4"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 8,
tags = [
"ccl_test",
"cpu:4",
],
tags = ["cpu:4"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@ go_test(
"//pkg/ccl/logictestccl:testdata", # keep
],
shard_count = 26,
tags = [
"ccl_test",
"cpu:4",
],
tags = ["cpu:4"],
deps = [
"//pkg/build/bazel",
"//pkg/ccl",
Expand Down
120 changes: 105 additions & 15 deletions pkg/ccl/serverccl/statusccl/tenant_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,8 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

t.Run("test tenant permissioning", func(t *testing.T) {
req := roachpb.SpanStatsRequest{
NodeID: "0",
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
NodeID: "0",
Spans: []roachpb.Span{aSpan},
}
resp := roachpb.SpanStatsResponse{}

Expand All @@ -162,19 +161,44 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

adminClient := helper.TestCluster().TenantHTTPClient(t, 1, true)
adminClient.PostJSON("/_status/span", &req, &resp)
require.Greaterf(t, resp.RangeCount, int32(0), "postive range count")
require.Greaterf(t, resp.SpanToStats[aSpan.String()].RangeCount, int32(0), "positive range count")
})

t.Run("test tenant isolation", func(t *testing.T) {
_, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(bSpan.Key),
EndKey: roachpb.RKey(bSpan.EndKey),
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{bSpan},
})
require.Error(t, err)
})

t.Run("test invalid request payload", func(t *testing.T) {
_, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
})
require.ErrorContains(t, err, `span stats request - unexpected populated legacy fields (StartKey, EndKey)`)
})

t.Run("test exceed span request limit", func(t *testing.T) {
// Set the span batch limit to 1.
_, err := helper.HostCluster().ServerConn(0).Exec(`SET CLUSTER SETTING server.span_stats.span_batch_limit = 1`)
require.NoError(t, err)
_, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan, aSpan},
})
require.ErrorContains(t, err, `error getting span statistics - number of spans in request payload (2) exceeds`+
` 'server.span_stats.span_batch_limit' cluster setting limit (1)`)
// Reset the span batch limit to default.
_, err = helper.HostCluster().ServerConn(0).Exec(`SET CLUSTER SETTING server.span_stats.span_batch_limit = $1`, roachpb.DefaultSpanStatsSpanLimit)
require.NoError(t, err)
})

t.Run("test KV node fan-out", func(t *testing.T) {
_, tID, err := keys.DecodeTenantPrefix(aSpan.Key)
require.NoError(t, err)
Expand All @@ -186,9 +210,8 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

controlStats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan},
})
require.NoError(t, err)

Expand All @@ -212,14 +235,81 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan},
})

require.NoError(t, err)
require.Equal(t, controlStats.RangeCount+1, stats.RangeCount)
require.Equal(t, controlStats.TotalStats.LiveCount+int64(len(incKeys)), stats.TotalStats.LiveCount)

controlSpanStats := controlStats.SpanToStats[aSpan.String()]
testSpanStats := stats.SpanToStats[aSpan.String()]
require.Equal(t, controlSpanStats.RangeCount+1, testSpanStats.RangeCount)
require.Equal(t, controlSpanStats.TotalStats.LiveCount+int64(len(incKeys)), testSpanStats.TotalStats.LiveCount)

// Make a multi-span call
type spanCase struct {
span roachpb.Span
expectedRangeCount int32
expectedLiveCount int64
}
spanCases := []spanCase{
{
// "a", "b" - single range, single key
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[0])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[1])),
},
expectedRangeCount: 1,
expectedLiveCount: 1,
},
{
// "d", "f" - single range, multiple keys
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[3])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[5])),
},
expectedRangeCount: 1,
expectedLiveCount: 2,
},
{
// "bb", "e" - multiple ranges, multiple keys
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[2])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[4])),
},
expectedRangeCount: 2,
expectedLiveCount: 2,
},

{
// "a", "d" - multiple ranges, multiple keys
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[0])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[3])),
},
expectedRangeCount: 2,
expectedLiveCount: 3,
},
}

var spans []roachpb.Span
for _, sc := range spanCases {
spans = append(spans, sc.span)
}

stats, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: spans,
})

require.NoError(t, err)
// Check each span has their expected values.
for _, sc := range spanCases {
spanStats := stats.SpanToStats[sc.span.String()]
require.Equal(t, spanStats.RangeCount, sc.expectedRangeCount, fmt.Sprintf("mismatch on expected range count for span case with span %v", sc.span.String()))
require.Equal(t, spanStats.TotalStats.LiveCount, sc.expectedLiveCount, fmt.Sprintf("mismatch on expected live count for span case with span %v", sc.span.String()))
}
})

}
Expand Down
2 changes: 2 additions & 0 deletions pkg/roachpb/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ go_library(
"metadata_replicas.go",
"span_config.go",
"span_group.go",
"span_stats.go",
"tenant.go",
"version.go",
],
Expand All @@ -27,6 +28,7 @@ go_library(
"//pkg/keysbase",
"//pkg/kv/kvserver/allocator/load",
"//pkg/kv/kvserver/concurrency/lock",
"//pkg/settings",
"//pkg/storage/enginepb",
"//pkg/util",
"//pkg/util/bitarray",
Expand Down
39 changes: 39 additions & 0 deletions pkg/roachpb/span_stats.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package roachpb

import "github.com/cockroachdb/cockroach/pkg/settings"

// The span statistics cluster settings are defined in this package to avoid
// an import cycle.

// DefaultSpanStatsSpanLimit is the default value of the
// server.span_stats.span_batch_limit cluster setting (SpanStatsBatchLimit).
const DefaultSpanStatsSpanLimit = 500

// SpanStatsBatchLimit registers the maximum number of spans allowed in a
// span stats request payload.
//
// The setting is registered under the key
// "server.span_stats.span_batch_limit" in the settings.TenantWritable class
// and defaults to DefaultSpanStatsSpanLimit. settings.PositiveInt is passed
// as the validation argument, so presumably only values greater than zero
// are accepted (confirm against the settings package).
var SpanStatsBatchLimit = settings.RegisterIntSetting(
settings.TenantWritable,
"server.span_stats.span_batch_limit",
"the maximum number of spans allowed in a request payload for span statistics",
DefaultSpanStatsSpanLimit,
settings.PositiveInt,
)

// defaultRangeStatsBatchLimit is the default value of the
// server.span_stats.range_batch_limit cluster setting (RangeStatsBatchLimit).
const defaultRangeStatsBatchLimit = 100

// RangeStatsBatchLimit registers the maximum number of ranges to be batched
// when fetching range stats for a span.
//
// The setting is registered under the key
// "server.span_stats.range_batch_limit" in the settings.TenantWritable class
// and defaults to defaultRangeStatsBatchLimit. settings.PositiveInt is passed
// as the validation argument, so presumably only values greater than zero
// are accepted (confirm against the settings package).
var RangeStatsBatchLimit = settings.RegisterIntSetting(
settings.TenantWritable,
"server.span_stats.range_batch_limit",
"the maximum batch size when fetching ranges statistics for a span",
defaultRangeStatsBatchLimit,
settings.PositiveInt,
)
Loading

0 comments on commit a7ff80c

Please sign in to comment.