Skip to content

Commit

Permalink
server: extend span statistics endpoint
Browse files Browse the repository at this point in the history
Extends: cockroachdb#96223

This PR extends the implementation of our SpanStats RPC endpoint to
fetch stats for multiple spans at once. By extending the endpoint, we
amortize the cost of the RPC's node fanout across all requested spans,
whereas previously, we were issuing a fanout per span requested.
Additionally, this change batches KV layer requests for ranges fully
contained by the span, instead of issuing a request per fully contained
range.

Release note: None

https://cockroachlabs.atlassian.net/browse/DOC-1355 #Informs:
cockroachdb#33316 #Epic: CRDB-8035
  • Loading branch information
Thomas Hardy authored and ericharmeling committed Mar 29, 2023
1 parent 427212b commit b50ae15
Show file tree
Hide file tree
Showing 17 changed files with 860 additions and 188 deletions.
120 changes: 105 additions & 15 deletions pkg/ccl/serverccl/statusccl/tenant_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,8 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

t.Run("test tenant permissioning", func(t *testing.T) {
req := roachpb.SpanStatsRequest{
NodeID: "0",
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
NodeID: "0",
Spans: []roachpb.Span{aSpan},
}
resp := roachpb.SpanStatsResponse{}

Expand All @@ -162,19 +161,44 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

adminClient := helper.TestCluster().TenantHTTPClient(t, 1, true)
adminClient.PostJSON("/_status/span", &req, &resp)
require.Greaterf(t, resp.RangeCount, int32(0), "postive range count")
require.Greaterf(t, resp.SpanToStats[aSpan.String()].RangeCount, int32(0), "positive range count")
})

t.Run("test tenant isolation", func(t *testing.T) {
_, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(bSpan.Key),
EndKey: roachpb.RKey(bSpan.EndKey),
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{bSpan},
})
require.Error(t, err)
})

t.Run("test invalid request payload", func(t *testing.T) {
_, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
})
require.ErrorContains(t, err, `span stats request - unexpected populated legacy fields (StartKey, EndKey)`)
})

t.Run("test exceed span request limit", func(t *testing.T) {
// Set the span batch limit to 1.
_, err := helper.HostCluster().ServerConn(0).Exec(`SET CLUSTER SETTING server.span_stats.span_batch_limit = 1`)
require.NoError(t, err)
_, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan, aSpan},
})
require.ErrorContains(t, err, `error getting span statistics - number of spans in request payload (2) exceeds`+
` 'server.span_stats.span_batch_limit' cluster setting limit (1)`)
// Reset the span batch limit to default.
_, err = helper.HostCluster().ServerConn(0).Exec(`SET CLUSTER SETTING server.span_stats.span_batch_limit = $1`, roachpb.DefaultSpanStatsSpanLimit)
require.NoError(t, err)
})

t.Run("test KV node fan-out", func(t *testing.T) {
_, tID, err := keys.DecodeTenantPrefix(aSpan.Key)
require.NoError(t, err)
Expand All @@ -186,9 +210,8 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

controlStats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan},
})
require.NoError(t, err)

Expand All @@ -212,14 +235,81 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan},
})

require.NoError(t, err)
require.Equal(t, controlStats.RangeCount+1, stats.RangeCount)
require.Equal(t, controlStats.TotalStats.LiveCount+int64(len(incKeys)), stats.TotalStats.LiveCount)

controlSpanStats := controlStats.SpanToStats[aSpan.String()]
testSpanStats := stats.SpanToStats[aSpan.String()]
require.Equal(t, controlSpanStats.RangeCount+1, testSpanStats.RangeCount)
require.Equal(t, controlSpanStats.TotalStats.LiveCount+int64(len(incKeys)), testSpanStats.TotalStats.LiveCount)

// Make a multi-span call
type spanCase struct {
span roachpb.Span
expectedRangeCount int32
expectedLiveCount int64
}
spanCases := []spanCase{
{
// "a", "b" - single range, single key
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[0])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[1])),
},
expectedRangeCount: 1,
expectedLiveCount: 1,
},
{
// "d", "f" - single range, multiple keys
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[3])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[5])),
},
expectedRangeCount: 1,
expectedLiveCount: 2,
},
{
// "bb", "e" - multiple ranges, multiple keys
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[2])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[4])),
},
expectedRangeCount: 2,
expectedLiveCount: 2,
},

{
// "a", "d" - multiple ranges, multiple keys
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[0])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[3])),
},
expectedRangeCount: 2,
expectedLiveCount: 3,
},
}

var spans []roachpb.Span
for _, sc := range spanCases {
spans = append(spans, sc.span)
}

stats, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: spans,
})

require.NoError(t, err)
// Check each span has their expected values.
for _, sc := range spanCases {
spanStats := stats.SpanToStats[sc.span.String()]
require.Equal(t, spanStats.RangeCount, sc.expectedRangeCount, fmt.Sprintf("mismatch on expected range count for span case with span %v", sc.span.String()))
require.Equal(t, spanStats.TotalStats.LiveCount, sc.expectedLiveCount, fmt.Sprintf("mismatch on expected live count for span case with span %v", sc.span.String()))
}
})

}
Expand Down
2 changes: 2 additions & 0 deletions pkg/roachpb/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ go_library(
"metadata_replicas.go",
"span_config.go",
"span_group.go",
"span_stats.go",
"tenant.go",
"version.go",
],
Expand All @@ -27,6 +28,7 @@ go_library(
"//pkg/keysbase",
"//pkg/kv/kvserver/allocator/load",
"//pkg/kv/kvserver/concurrency/lock",
"//pkg/settings",
"//pkg/storage/enginepb",
"//pkg/util",
"//pkg/util/bitarray",
Expand Down
39 changes: 39 additions & 0 deletions pkg/roachpb/span_stats.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package roachpb

import "github.com/cockroachdb/cockroach/pkg/settings"

// Span statistics cluster settings are defined here to avoid an import cycle.

// DefaultSpanStatsSpanLimit is the default value of the
// server.span_stats.span_batch_limit cluster setting, i.e. the default
// maximum number of spans allowed in a single span stats request payload.
const DefaultSpanStatsSpanLimit = 500

// SpanStatsBatchLimit is the server.span_stats.span_batch_limit cluster
// setting: the maximum number of spans allowed in a span stats request
// payload. It is registered with the settings.TenantWritable class, defaults
// to DefaultSpanStatsSpanLimit, and is validated with settings.PositiveInt.
var SpanStatsBatchLimit = settings.RegisterIntSetting(
settings.TenantWritable,
"server.span_stats.span_batch_limit",
"the maximum number of spans allowed in a request payload for span statistics",
DefaultSpanStatsSpanLimit,
settings.PositiveInt,
)

// defaultRangeStatsBatchLimit is the default value of the
// server.span_stats.range_batch_limit cluster setting.
const defaultRangeStatsBatchLimit = 100

// RangeStatsBatchLimit is the server.span_stats.range_batch_limit cluster
// setting: the maximum number of ranges to be batched together when fetching
// range stats for a span. It is registered with the settings.TenantWritable
// class, defaults to defaultRangeStatsBatchLimit, and is validated with
// settings.PositiveInt.
var RangeStatsBatchLimit = settings.RegisterIntSetting(
settings.TenantWritable,
"server.span_stats.range_batch_limit",
"the maximum batch size when fetching ranges statistics for a span",
defaultRangeStatsBatchLimit,
settings.PositiveInt,
)
16 changes: 14 additions & 2 deletions pkg/roachpb/span_stats.proto
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ option go_package = "github.com/cockroachdb/cockroach/pkg/roachpb";
import "storage/enginepb/mvcc.proto";
import "gogoproto/gogo.proto";
import "google/api/annotations.proto";
import "roachpb/data.proto";

// SpanStatsRequest is used to request a SpanStatsResponse for the given key
// span and node id. A node_id value of 0 indicates that the server should
Expand All @@ -24,15 +25,26 @@ message SpanStatsRequest {
string node_id = 1 [(gogoproto.customname) = "NodeID"];
bytes start_key = 2 [(gogoproto.casttype) = "RKey"];
bytes end_key = 3 [(gogoproto.casttype) = "RKey"];
repeated Span spans = 4 [(gogoproto.nullable) = false];
}

// SpanStats holds the statistics reported for a single span. It is the value
// type of the per-span map (span_to_stats) in SpanStatsResponse.
message SpanStats {
cockroach.storage.enginepb.MVCCStats total_stats = 1 [(gogoproto.nullable) = false];
// range_count measures the number of ranges that the span falls within.
// A SpanStats for a span that lies within a range, and whose start
// key sorts after the range start, and whose end key sorts before the
// range end, will have a range_count value of 1.
int32 range_count = 2;
uint64 approximate_disk_bytes = 3;
}

message SpanStatsResponse {
cockroach.storage.enginepb.MVCCStats total_stats = 1 [(gogoproto.nullable) = false];
// range_count measures the number of ranges that the request span falls within.
// A SpanStatsResponse for a span that lies within a range, and whose start
// key sorts after the range start, and whose end key sorts before the
// range end, will have a range_count value of 1.
int32 range_count = 2;
uint64 approximate_disk_bytes = 3;
cockroach.storage.enginepb.MVCCStats total_stats = 1
[(gogoproto.nullable) = false];
map<string, SpanStats> span_to_stats = 4;
}
17 changes: 13 additions & 4 deletions pkg/rpc/auth_tenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,10 +200,19 @@ func (a tenantAuthorizer) authGetRangeDescriptors(
// authSpanStats authorizes the provided tenant to invoke the SpanStats RPC
// with the provided args.
//
// Every span named by the request must pass validateSpan for tenID. That
// covers the legacy StartKey/EndKey pair, which is populated when the request
// comes in the old format (i.e. a fanout from a 22.2 node), as well as each
// entry in Spans. The first validation failure is returned; nil means all
// spans in the request were accepted.
func (a tenantAuthorizer) authSpanStats(
	tenID roachpb.TenantID, args *roachpb.SpanStatsRequest,
) error {
	// Check if the request comes in the old format (i.e. fanout from a 22.2
	// node).
	if args.StartKey != nil && args.EndKey != nil {
		legacySpan := roachpb.Span{
			Key:    args.StartKey.AsRawKey(),
			EndKey: args.EndKey.AsRawKey(),
		}
		if err := validateSpan(tenID, legacySpan); err != nil {
			return err
		}
	}
	// Deliberately no early return after the legacy check: a request that
	// populates both the legacy fields and Spans must not be able to skip
	// validation of Spans. For a purely legacy request Spans is empty and this
	// loop is a no-op.
	for _, span := range args.Spans {
		if err := validateSpan(tenID, span); err != nil {
			return err
		}
	}
	return nil
}

// authRangeLookup authorizes the provided tenant to invoke the RangeLookup RPC
Expand Down
11 changes: 5 additions & 6 deletions pkg/server/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -1422,9 +1422,8 @@ func (s *adminServer) statsForSpan(
if err == nil {
client := serverpb.NewStatusClient(conn)
req := roachpb.SpanStatsRequest{
StartKey: rSpan.Key,
EndKey: rSpan.EndKey,
NodeID: nodeID.String(),
Spans: []roachpb.Span{span},
NodeID: nodeID.String(),
}
spanResponse, err = client.SpanStats(ctx, &req)
}
Expand Down Expand Up @@ -1459,9 +1458,9 @@ func (s *adminServer) statsForSpan(
},
)
} else {
tableStatResponse.Stats.Add(resp.resp.TotalStats)
tableStatResponse.ReplicaCount += int64(resp.resp.RangeCount)
tableStatResponse.ApproximateDiskBytes += resp.resp.ApproximateDiskBytes
tableStatResponse.Stats.Add(resp.resp.SpanToStats[span.String()].TotalStats)
tableStatResponse.ReplicaCount += int64(resp.resp.SpanToStats[span.String()].RangeCount)
tableStatResponse.ApproximateDiskBytes += resp.resp.SpanToStats[span.String()].ApproximateDiskBytes
}
case <-ctx.Done():
// Caller gave up, stop doing work.
Expand Down
Loading

0 comments on commit b50ae15

Please sign in to comment.