Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

extend span stats #98490

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 105 additions & 15 deletions pkg/ccl/serverccl/statusccl/tenant_status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,8 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

t.Run("test tenant permissioning", func(t *testing.T) {
req := roachpb.SpanStatsRequest{
NodeID: "0",
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
NodeID: "0",
Spans: []roachpb.Span{aSpan},
}
resp := roachpb.SpanStatsResponse{}

Expand All @@ -162,19 +161,44 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

adminClient := helper.TestCluster().TenantHTTPClient(t, 1, true)
adminClient.PostJSON("/_status/span", &req, &resp)
require.Greaterf(t, resp.RangeCount, int32(0), "postive range count")
require.Greaterf(t, resp.SpanToStats[aSpan.String()].RangeCount, int32(0), "positive range count")
})

t.Run("test tenant isolation", func(t *testing.T) {
_, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(bSpan.Key),
EndKey: roachpb.RKey(bSpan.EndKey),
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{bSpan},
})
require.Error(t, err)
})

t.Run("test invalid request payload", func(t *testing.T) {
_, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
})
require.ErrorContains(t, err, `span stats request - unexpected populated legacy fields (StartKey, EndKey)`)
})

t.Run("test exceed span request limit", func(t *testing.T) {
// Set the span batch limit to 1.
_, err := helper.HostCluster().ServerConn(0).Exec(`SET CLUSTER SETTING server.span_stats.span_batch_limit = 1`)
require.NoError(t, err)
_, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan, aSpan},
})
require.ErrorContains(t, err, `error getting span statistics - number of spans in request payload (2) exceeds`+
` 'server.span_stats.span_batch_limit' cluster setting limit (1)`)
// Reset the span batch limit to default.
_, err = helper.HostCluster().ServerConn(0).Exec(`SET CLUSTER SETTING server.span_stats.span_batch_limit = $1`, roachpb.DefaultSpanStatsSpanLimit)
require.NoError(t, err)
})

t.Run("test KV node fan-out", func(t *testing.T) {
_, tID, err := keys.DecodeTenantPrefix(aSpan.Key)
require.NoError(t, err)
Expand All @@ -186,9 +210,8 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

controlStats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan},
})
require.NoError(t, err)

Expand All @@ -212,14 +235,81 @@ func testTenantSpanStats(ctx context.Context, t *testing.T, helper serverccl.Ten

stats, err := tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
StartKey: roachpb.RKey(aSpan.Key),
EndKey: roachpb.RKey(aSpan.EndKey),
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: []roachpb.Span{aSpan},
})

require.NoError(t, err)
require.Equal(t, controlStats.RangeCount+1, stats.RangeCount)
require.Equal(t, controlStats.TotalStats.LiveCount+int64(len(incKeys)), stats.TotalStats.LiveCount)

controlSpanStats := controlStats.SpanToStats[aSpan.String()]
testSpanStats := stats.SpanToStats[aSpan.String()]
require.Equal(t, controlSpanStats.RangeCount+1, testSpanStats.RangeCount)
require.Equal(t, controlSpanStats.TotalStats.LiveCount+int64(len(incKeys)), testSpanStats.TotalStats.LiveCount)

// Make a multi-span call
type spanCase struct {
span roachpb.Span
expectedRangeCount int32
expectedLiveCount int64
}
spanCases := []spanCase{
{
// "a", "b" - single range, single key
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[0])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[1])),
},
expectedRangeCount: 1,
expectedLiveCount: 1,
},
{
// "d", "f" - single range, multiple keys
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[3])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[5])),
},
expectedRangeCount: 1,
expectedLiveCount: 2,
},
{
// "bb", "e" - multiple ranges, multiple keys
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[2])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[4])),
},
expectedRangeCount: 2,
expectedLiveCount: 2,
},

{
// "a", "d" - multiple ranges, multiple keys
span: roachpb.Span{
Key: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[0])),
EndKey: makeKey(keys.MakeTenantPrefix(tID), []byte(incKeys[3])),
},
expectedRangeCount: 2,
expectedLiveCount: 3,
},
}

var spans []roachpb.Span
for _, sc := range spanCases {
spans = append(spans, sc.span)
}

stats, err = tenantA.TenantStatusSrv().(serverpb.TenantStatusServer).SpanStats(ctx,
&roachpb.SpanStatsRequest{
NodeID: "0", // 0 indicates we want stats from all nodes.
Spans: spans,
})

require.NoError(t, err)
// Check each span has their expected values.
for _, sc := range spanCases {
spanStats := stats.SpanToStats[sc.span.String()]
require.Equal(t, spanStats.RangeCount, sc.expectedRangeCount, fmt.Sprintf("mismatch on expected range count for span case with span %v", sc.span.String()))
require.Equal(t, spanStats.TotalStats.LiveCount, sc.expectedLiveCount, fmt.Sprintf("mismatch on expected live count for span case with span %v", sc.span.String()))
}
})

}
Expand Down
2 changes: 2 additions & 0 deletions pkg/roachpb/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ go_library(
"metadata_replicas.go",
"span_config.go",
"span_group.go",
"span_stats.go",
"tenant.go",
"version.go",
],
Expand All @@ -27,6 +28,7 @@ go_library(
"//pkg/keysbase",
"//pkg/kv/kvserver/allocator/load",
"//pkg/kv/kvserver/concurrency/lock",
"//pkg/settings",
"//pkg/storage/enginepb",
"//pkg/util",
"//pkg/util/bitarray",
Expand Down
39 changes: 39 additions & 0 deletions pkg/roachpb/span_stats.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Copyright 2023 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package roachpb

import "github.com/cockroachdb/cockroach/pkg/settings"

// Span statistics cluster settings are defined in this package to avoid an
// import cycle.

// DefaultSpanStatsSpanLimit is the default value of the
// "server.span_stats.span_batch_limit" cluster setting (SpanStatsBatchLimit),
// i.e. the default maximum number of spans allowed in a single span stats
// request payload. It is exported so that code outside this package (for
// example, tests that temporarily lower the limit) can restore the default.
const DefaultSpanStatsSpanLimit = 500

// SpanStatsBatchLimit is the "server.span_stats.span_batch_limit" cluster
// setting: the maximum number of spans allowed in a single span stats request
// payload. It is registered as a tenant-writable integer setting that defaults
// to DefaultSpanStatsSpanLimit and is validated with settings.PositiveInt.
var SpanStatsBatchLimit = settings.RegisterIntSetting(
settings.TenantWritable,
"server.span_stats.span_batch_limit",
"the maximum number of spans allowed in a request payload for span statistics",
DefaultSpanStatsSpanLimit,
settings.PositiveInt,
)

// defaultRangeStatsBatchLimit is the default value of the
// "server.span_stats.range_batch_limit" cluster setting
// (RangeStatsBatchLimit).
const defaultRangeStatsBatchLimit = 100

// RangeStatsBatchLimit is the "server.span_stats.range_batch_limit" cluster
// setting: the maximum number of ranges to be batched together when fetching
// range stats for a span. It is registered as a tenant-writable integer
// setting that defaults to defaultRangeStatsBatchLimit and is validated with
// settings.PositiveInt.
var RangeStatsBatchLimit = settings.RegisterIntSetting(
settings.TenantWritable,
"server.span_stats.range_batch_limit",
"the maximum batch size when fetching ranges statistics for a span",
defaultRangeStatsBatchLimit,
settings.PositiveInt,
)
16 changes: 14 additions & 2 deletions pkg/roachpb/span_stats.proto
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ option go_package = "github.com/cockroachdb/cockroach/pkg/roachpb";
import "storage/enginepb/mvcc.proto";
import "gogoproto/gogo.proto";
import "google/api/annotations.proto";
import "roachpb/data.proto";

// SpanStatsRequest is used to request a SpanStatsResponse for the given key
// span and node id. A node_id value of 0 indicates that the server should
Expand All @@ -24,15 +25,26 @@ message SpanStatsRequest {
string node_id = 1 [(gogoproto.customname) = "NodeID"];
bytes start_key = 2 [(gogoproto.casttype) = "RKey"];
bytes end_key = 3 [(gogoproto.casttype) = "RKey"];
repeated Span spans = 4 [(gogoproto.nullable) = false];
}

// SpanStats holds the statistics reported for a single span. It is the value
// type of SpanStatsResponse.span_to_stats, which is keyed by a string form of
// the span (callers in this change look entries up via Span.String()).
message SpanStats {
// total_stats holds the MVCC statistics for the span.
// NOTE(review): presumably aggregated over every range the span touches;
// confirm against the server implementation.
cockroach.storage.enginepb.MVCCStats total_stats = 1 [(gogoproto.nullable) = false];
// range_count measures the number of ranges that the request span falls within.
// A span that lies within a single range, and whose start
// key sorts after the range start, and whose end key sorts before the
// range end, will have a range_count value of 1.
int32 range_count = 2;
// approximate_disk_bytes is, judging by its name, an approximation of the
// on-disk size of the span's data in bytes.
// NOTE(review): confirm how this value is computed.
uint64 approximate_disk_bytes = 3;
}

message SpanStatsResponse {
cockroach.storage.enginepb.MVCCStats total_stats = 1 [(gogoproto.nullable) = false];
// range_count measures the number of ranges that the request span falls within.
// A SpanStatsResponse for a span that lies within a range, and whose start
// key sorts after the range start, and whose end key sorts before the
// range end, will have a range_count value of 1.
int32 range_count = 2;
uint64 approximate_disk_bytes = 3;
cockroach.storage.enginepb.MVCCStats total_stats = 1
[(gogoproto.nullable) = false];
map<string, SpanStats> span_to_stats = 4;
}
17 changes: 13 additions & 4 deletions pkg/rpc/auth_tenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,10 +200,19 @@ func (a tenantAuthorizer) authGetRangeDescriptors(
func (a tenantAuthorizer) authSpanStats(
tenID roachpb.TenantID, args *roachpb.SpanStatsRequest,
) error {
return validateSpan(tenID, roachpb.Span{
Key: args.StartKey.AsRawKey(),
EndKey: args.EndKey.AsRawKey(),
})
// Check if request comes in old format (i.e. fanout from 22.2 node)
if args.StartKey != nil && args.EndKey != nil {
return validateSpan(tenID, roachpb.Span{
Key: args.StartKey.AsRawKey(), EndKey: args.EndKey.AsRawKey(),
})
}
for _, span := range args.Spans {
err := validateSpan(tenID, span)
if err != nil {
return err
}
}
return nil
}

// authRangeLookup authorizes the provided tenant to invoke the RangeLookup RPC
Expand Down
11 changes: 5 additions & 6 deletions pkg/server/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -1422,9 +1422,8 @@ func (s *adminServer) statsForSpan(
if err == nil {
client := serverpb.NewStatusClient(conn)
req := roachpb.SpanStatsRequest{
StartKey: rSpan.Key,
EndKey: rSpan.EndKey,
NodeID: nodeID.String(),
Spans: []roachpb.Span{span},
NodeID: nodeID.String(),
}
spanResponse, err = client.SpanStats(ctx, &req)
}
Expand Down Expand Up @@ -1459,9 +1458,9 @@ func (s *adminServer) statsForSpan(
},
)
} else {
tableStatResponse.Stats.Add(resp.resp.TotalStats)
tableStatResponse.ReplicaCount += int64(resp.resp.RangeCount)
tableStatResponse.ApproximateDiskBytes += resp.resp.ApproximateDiskBytes
tableStatResponse.Stats.Add(resp.resp.SpanToStats[span.String()].TotalStats)
tableStatResponse.ReplicaCount += int64(resp.resp.SpanToStats[span.String()].RangeCount)
tableStatResponse.ApproximateDiskBytes += resp.resp.SpanToStats[span.String()].ApproximateDiskBytes
}
case <-ctx.Done():
// Caller gave up, stop doing work.
Expand Down
Loading