Skip to content

Commit

Permalink
insights: crdb_internal.cluster_execution_insights
Browse files Browse the repository at this point in the history
Here we introduce a new virtual table for a cluster-wide view of
"insights," a subsystem of sqlstats that is currently disabled by
default but that will identify slow- and slower-than-usual statement
executions, along with other potentially problematic behaviors we will
be building support for.

This table will back the upcoming insights UI over the new SQL-over-HTTP
endpoint.

Release note (sql change): A new crdb_internal virtual table,
cluster_execution_insights, was introduced, offering a cluster-wide view
of the same node-local information available in node_execution_insights.
The insights subsystem is, as of this commit, still under development
and disabled by default, so there will not yet be much to see here.
  • Loading branch information
matthewtodd committed Aug 9, 2022
1 parent 7c18668 commit 4aa312a
Show file tree
Hide file tree
Showing 27 changed files with 2,182 additions and 1,791 deletions.
1 change: 1 addition & 0 deletions docs/generated/http/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ genrule(
"//pkg/sql/pgwire/pgerror:pgerror_proto",
"//pkg/sql/schemachanger/scpb:scpb_proto",
"//pkg/sql/sessiondatapb:sessiondatapb_proto",
"//pkg/sql/sqlstats/insights:insights_proto",
"//pkg/sql/types:types_proto",
"//pkg/storage/enginepb:enginepb_proto",
"//pkg/ts/catalog:catalog_proto",
Expand Down
46 changes: 46 additions & 0 deletions docs/generated/http/full.md
Original file line number Diff line number Diff line change
Expand Up @@ -4877,6 +4877,52 @@ Support status: [reserved](#support-status)



## ListExecutionInsights



ListExecutionInsights returns potentially problematic statements cluster-wide,
along with actions we suggest the application developer might take to remedy them.

Support status: [reserved](#support-status)

#### Request Parameters







| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| node_id | [string](#cockroach.server.serverpb.ListExecutionInsightsRequest-string) | | node_id is a string so that "local" can be used to specify that no forwarding is necessary. | [reserved](#support-status) |







#### Response Parameters







| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| insights | [cockroach.sql.insights.Insight](#cockroach.server.serverpb.ListExecutionInsightsResponse-cockroach.sql.insights.Insight) | repeated | insights lists any potentially problematic statements and actions we suggest the application developer might take to remedy them. | [reserved](#support-status) |
| errors | [cockroach.errorspb.EncodedError](#cockroach.server.serverpb.ListExecutionInsightsResponse-cockroach.errorspb.EncodedError) | repeated | errors holds any errors that occurred during fan-out calls to other nodes. | [reserved](#support-status) |







## RequestCA

`GET /_join/v1/ca`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ crdb_internal cluster_contended_tables view admin NULL NULL
crdb_internal cluster_contention_events table admin NULL NULL
crdb_internal cluster_database_privileges table admin NULL NULL
crdb_internal cluster_distsql_flows table admin NULL NULL
crdb_internal cluster_execution_insights table admin NULL NULL
crdb_internal cluster_inflight_traces table admin NULL NULL
crdb_internal cluster_locks table admin NULL NULL
crdb_internal cluster_queries table admin NULL NULL
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial1
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[cluster] retrieving SQL data for crdb_internal.cluster_contention_events... writing output: debug/crdb_internal.cluster_contention_events.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows... writing output: debug/crdb_internal.cluster_distsql_flows.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_database_privileges... writing output: debug/crdb_internal.cluster_database_privileges.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights... writing output: debug/crdb_internal.cluster_execution_insights.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_locks... writing output: debug/crdb_internal.cluster_locks.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_queries... writing output: debug/crdb_internal.cluster_queries.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_sessions... writing output: debug/crdb_internal.cluster_sessions.txt... done
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial1_excluded
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0
[cluster] retrieving SQL data for crdb_internal.cluster_contention_events... writing output: debug/crdb_internal.cluster_contention_events.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows... writing output: debug/crdb_internal.cluster_distsql_flows.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_database_privileges... writing output: debug/crdb_internal.cluster_database_privileges.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights... writing output: debug/crdb_internal.cluster_execution_insights.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_locks... writing output: debug/crdb_internal.cluster_locks.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_queries... writing output: debug/crdb_internal.cluster_queries.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_sessions... writing output: debug/crdb_internal.cluster_sessions.txt... done
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial2
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null
[cluster] retrieving SQL data for crdb_internal.cluster_contention_events... writing output: debug/crdb_internal.cluster_contention_events.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows... writing output: debug/crdb_internal.cluster_distsql_flows.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_database_privileges... writing output: debug/crdb_internal.cluster_database_privileges.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights... writing output: debug/crdb_internal.cluster_execution_insights.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_locks... writing output: debug/crdb_internal.cluster_locks.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_queries... writing output: debug/crdb_internal.cluster_queries.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_sessions... writing output: debug/crdb_internal.cluster_sessions.txt... done
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] retrieving SQL data for crdb_internal.cluster_contention_events... writing output: debug/crdb_internal.cluster_contention_events.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows... writing output: debug/crdb_internal.cluster_distsql_flows.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_database_privileges... writing output: debug/crdb_internal.cluster_database_privileges.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights... writing output: debug/crdb_internal.cluster_execution_insights.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_locks... writing output: debug/crdb_internal.cluster_locks.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_queries... writing output: debug/crdb_internal.cluster_queries.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_sessions... writing output: debug/crdb_internal.cluster_sessions.txt... done
Expand Down
3 changes: 3 additions & 0 deletions pkg/cli/testdata/zip/testzip_concurrent
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ zip
[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows...
[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows: done
[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows: writing output: debug/crdb_internal.cluster_distsql_flows.txt...
[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights...
[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights: done
[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights: writing output: debug/crdb_internal.cluster_execution_insights.txt...
[cluster] retrieving SQL data for crdb_internal.cluster_locks...
[cluster] retrieving SQL data for crdb_internal.cluster_locks: done
[cluster] retrieving SQL data for crdb_internal.cluster_locks: writing output: debug/crdb_internal.cluster_locks.txt...
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip_tenant
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] retrieving SQL data for crdb_internal.cluster_contention_events... writing output: debug/crdb_internal.cluster_contention_events.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_distsql_flows... writing output: debug/crdb_internal.cluster_distsql_flows.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_database_privileges... writing output: debug/crdb_internal.cluster_database_privileges.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_execution_insights... writing output: debug/crdb_internal.cluster_execution_insights.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_locks... writing output: debug/crdb_internal.cluster_locks.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_queries... writing output: debug/crdb_internal.cluster_queries.txt... done
[cluster] retrieving SQL data for crdb_internal.cluster_sessions... writing output: debug/crdb_internal.cluster_sessions.txt... done
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/zip_cluster_wide.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ var debugZipTablesPerCluster = []string{
"crdb_internal.cluster_contention_events",
"crdb_internal.cluster_distsql_flows",
"crdb_internal.cluster_database_privileges",
"crdb_internal.cluster_execution_insights",
"crdb_internal.cluster_locks",
"crdb_internal.cluster_queries",
"crdb_internal.cluster_sessions",
Expand Down
1 change: 1 addition & 0 deletions pkg/server/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ go_library(
"//pkg/sql/sqlliveness",
"//pkg/sql/sqlliveness/slprovider",
"//pkg/sql/sqlstats",
"//pkg/sql/sqlstats/insights",
"//pkg/sql/sqlstats/persistedsqlstats",
"//pkg/sql/sqlstats/persistedsqlstats/sqlstatsutil",
"//pkg/sql/sqlutil",
Expand Down
4 changes: 4 additions & 0 deletions pkg/server/serverpb/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,14 @@ proto_library(
"//pkg/server/diagnostics/diagnosticspb:diagnosticspb_proto",
"//pkg/server/status/statuspb:statuspb_proto",
"//pkg/sql/contentionpb:contentionpb_proto",
"//pkg/sql/sqlstats/insights:insights_proto",
"//pkg/storage/enginepb:enginepb_proto",
"//pkg/ts/catalog:catalog_proto",
"//pkg/util:util_proto",
"//pkg/util/log/logpb:logpb_proto",
"//pkg/util/metric:metric_proto",
"//pkg/util/tracing/tracingpb:tracingpb_proto",
"@com_github_cockroachdb_errors//errorspb:errorspb_proto",
"@com_github_gogo_protobuf//gogoproto:gogo_proto",
"@com_google_protobuf//:duration_proto",
"@com_google_protobuf//:timestamp_proto",
Expand Down Expand Up @@ -68,13 +70,15 @@ go_proto_library(
"//pkg/sql/contentionpb",
"//pkg/sql/execinfrapb", # keep
"//pkg/sql/pgwire/pgwirecancel", # keep
"//pkg/sql/sqlstats/insights",
"//pkg/storage/enginepb",
"//pkg/ts/catalog",
"//pkg/util",
"//pkg/util/log/logpb",
"//pkg/util/metric",
"//pkg/util/tracing/tracingpb",
"//pkg/util/uuid", # keep
"@com_github_cockroachdb_errors//errorspb",
"@com_github_gogo_protobuf//gogoproto",
# NB: The grpc-gateway compiler injects a dependency on the descriptor
# package that Gazelle isn't prepared to deal with.
Expand Down
1 change: 1 addition & 0 deletions pkg/server/serverpb/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ type SQLStatusServer interface {
TxnIDResolution(context.Context, *TxnIDResolutionRequest) (*TxnIDResolutionResponse, error)
TransactionContentionEvents(context.Context, *TransactionContentionEventsRequest) (*TransactionContentionEventsResponse, error)
NodesList(context.Context, *NodesListRequest) (*NodesListResponse, error)
ListExecutionInsights(context.Context, *ListExecutionInsightsRequest) (*ListExecutionInsightsResponse, error)
}

// OptionalNodesStatusServer is a StatusServer that is only optionally present
Expand Down
27 changes: 27 additions & 0 deletions pkg/server/serverpb/status.proto
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ package cockroach.server.serverpb;
option go_package = "serverpb";

import "build/info.proto";
import "errorspb/errors.proto";
import "gossip/gossip.proto";
import "jobs/jobspb/jobs.proto";
import "roachpb/app_stats.proto";
Expand All @@ -23,6 +24,7 @@ import "server/diagnostics/diagnosticspb/diagnostics.proto";
import "server/serverpb/index_recommendations.proto";
import "server/status/statuspb/status.proto";
import "sql/contentionpb/contention.proto";
import "sql/sqlstats/insights/insights.proto";
import "storage/enginepb/engine.proto";
import "storage/enginepb/mvcc.proto";
import "storage/enginepb/rocksdb.proto";
Expand Down Expand Up @@ -1824,6 +1826,27 @@ message TransactionContentionEventsResponse {
];
}

message ListExecutionInsightsRequest {
// node_id is a string so that "local" can be used to specify that no
// forwarding is necessary.
string node_id = 1 [
(gogoproto.customname) = "NodeID"
];
}

message ListExecutionInsightsResponse {
// insights lists any potentially problematic statements and actions we
// suggest the application developer might take to remedy them.
repeated cockroach.sql.insights.Insight insights = 1 [
(gogoproto.nullable) = false
];

// errors holds any errors that occurred during fan-out calls to other nodes.
repeated errorspb.EncodedError errors = 2 [
(gogoproto.nullable) = false
];
}

service Status {
// Certificates retrieves a copy of the TLS certificates.
rpc Certificates(CertificatesRequest) returns (CertificatesResponse) {
Expand Down Expand Up @@ -2260,4 +2283,8 @@ service Status {
get: "/_status/transactioncontentionevents"
};
}

// ListExecutionInsights returns potentially problematic statements cluster-wide,
// along with actions we suggest the application developer might take to remedy them.
rpc ListExecutionInsights(ListExecutionInsightsRequest) returns (ListExecutionInsightsResponse) {}
}
74 changes: 74 additions & 0 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/privilege"
"github.com/cockroachdb/cockroach/pkg/sql/roleoption"
"github.com/cockroachdb/cockroach/pkg/sql/sem/catconstants"
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats/insights"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/contextutil"
"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
Expand Down Expand Up @@ -418,6 +419,19 @@ func (b *baseStatusServer) ListLocalDistSQLFlows(
return response, nil
}

func (b *baseStatusServer) localExecutionInsights(
ctx context.Context,
) (*serverpb.ListExecutionInsightsResponse, error) {
var response serverpb.ListExecutionInsightsResponse

reader := b.sqlServer.pgServer.SQLServer.GetSQLStatsProvider()
reader.IterateInsights(ctx, func(ctx context.Context, insight *insights.Insight) {
response.Insights = append(response.Insights, *insight)
})

return &response, nil
}

func (b *baseStatusServer) localTxnIDResolution(
req *serverpb.TxnIDResolutionRequest,
) *serverpb.TxnIDResolutionResponse {
Expand Down Expand Up @@ -3105,6 +3119,66 @@ func mergeDistSQLRemoteFlows(a, b []serverpb.DistSQLRemoteFlows) []serverpb.Dist
return result
}

func (s *statusServer) ListExecutionInsights(
ctx context.Context, req *serverpb.ListExecutionInsightsRequest,
) (*serverpb.ListExecutionInsightsResponse, error) {
ctx = propagateGatewayMetadata(ctx)
ctx = s.AnnotateCtx(ctx)

// Check permissions early to avoid fan-out to all nodes.
if err := s.privilegeChecker.requireViewActivityOrViewActivityRedactedPermission(ctx); err != nil {
// NB: not using serverError() here since the priv checker
// already returns a proper gRPC error status.
return nil, err
}

localRequest := serverpb.ListExecutionInsightsRequest{NodeID: "local"}

if len(req.NodeID) > 0 {
requestedNodeID, local, err := s.parseNodeID(req.NodeID)
if err != nil {
return nil, status.Errorf(codes.InvalidArgument, err.Error())
}
if local {
return s.localExecutionInsights(ctx)
}
statusClient, err := s.dialNode(ctx, requestedNodeID)
if err != nil {
return nil, serverError(ctx, err)
}
return statusClient.ListExecutionInsights(ctx, &localRequest)
}

var response serverpb.ListExecutionInsightsResponse

dialFn := func(ctx context.Context, nodeID roachpb.NodeID) (interface{}, error) {
return s.dialNode(ctx, nodeID)
}
nodeFn := func(ctx context.Context, client interface{}, nodeID roachpb.NodeID) (interface{}, error) {
statusClient := client.(serverpb.StatusClient)
resp, err := statusClient.ListExecutionInsights(ctx, &localRequest)
if err != nil {
return nil, err
}
return resp, nil
}
responseFn := func(nodeID roachpb.NodeID, nodeResponse interface{}) {
if nodeResponse == nil {
return
}
insightsResponse := nodeResponse.(*serverpb.ListExecutionInsightsResponse)
response.Insights = append(response.Insights, insightsResponse.Insights...)
}
errorFn := func(nodeID roachpb.NodeID, err error) {
response.Errors = append(response.Errors, errors.EncodeError(ctx, err))
}

if err := s.iterateNodes(ctx, "execution insights list", dialFn, nodeFn, responseFn, errorFn); err != nil {
return nil, serverError(ctx, err)
}
return &response, nil
}

// SpanStats requests the total statistics stored on a node for a given key
// span, which may include multiple ranges.
func (s *statusServer) SpanStats(
Expand Down
65 changes: 65 additions & 0 deletions pkg/server/tenant_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,71 @@ func (t *tenantStatusServer) ListLocalContentionEvents(
return t.baseStatusServer.ListLocalContentionEvents(ctx, req)
}

func (t *tenantStatusServer) ListExecutionInsights(
ctx context.Context, req *serverpb.ListExecutionInsightsRequest,
) (*serverpb.ListExecutionInsightsResponse, error) {
ctx = propagateGatewayMetadata(ctx)
ctx = t.AnnotateCtx(ctx)

// Check permissions early to avoid fan-out to all nodes.
if err := t.privilegeChecker.requireViewActivityOrViewActivityRedactedPermission(ctx); err != nil {
// NB: not using serverError() here since the priv checker
// already returns a proper gRPC error status.
return nil, err
}

if t.sqlServer.SQLInstanceID() == 0 {
return nil, status.Errorf(codes.Unavailable, "instanceID not set")
}

localRequest := serverpb.ListExecutionInsightsRequest{NodeID: "local"}

if len(req.NodeID) > 0 {
requestedInstanceID, local, err := t.parseInstanceID(req.NodeID)
if err != nil {
return nil, status.Errorf(codes.InvalidArgument, err.Error())
}
if local {
return t.baseStatusServer.localExecutionInsights(ctx)
}
instance, err := t.sqlServer.sqlInstanceProvider.GetInstance(ctx, requestedInstanceID)
if err != nil {
return nil, err
}
statusClient, err := t.dialPod(ctx, requestedInstanceID, instance.InstanceAddr)
if err != nil {
return nil, err
}
return statusClient.ListExecutionInsights(ctx, &localRequest)
}

var response serverpb.ListExecutionInsightsResponse

podFn := func(ctx context.Context, client interface{}, _ base.SQLInstanceID) (interface{}, error) {
statusClient := client.(serverpb.StatusClient)
resp, err := statusClient.ListExecutionInsights(ctx, &localRequest)
if err != nil {
return nil, err
}
return resp, nil
}
responseFn := func(_ base.SQLInstanceID, nodeResp interface{}) {
if nodeResp == nil {
return
}
insightsResponse := nodeResp.(*serverpb.ListExecutionInsightsResponse)
response.Insights = append(response.Insights, insightsResponse.Insights...)
}
errorFn := func(instanceID base.SQLInstanceID, err error) {
response.Errors = append(response.Errors, errors.EncodeError(ctx, err))
}

if err := t.iteratePods(ctx, "execution insights list", t.dialCallback, podFn, responseFn, errorFn); err != nil {
return nil, err
}
return &response, nil
}

func (t *tenantStatusServer) ResetSQLStats(
ctx context.Context, req *serverpb.ResetSQLStatsRequest,
) (*serverpb.ResetSQLStatsResponse, error) {
Expand Down
Loading

0 comments on commit 4aa312a

Please sign in to comment.