Skip to content

Commit

Permalink
sql: introduce crdb_internal.historical_contention_events virtual table
Browse files Browse the repository at this point in the history
This commit introduces the `crdb_internal.historical_contention_events`
virtual table. This virtual table exposes historical contention events
annotated with transaction fingerprint IDs for transactions that have
finished executing. This allows the virtual table to be joined with the
statement statistics and transaction statistics tables.
The new virtual table requires at least the VIEWACTIVITYREDACTED permission
to access. However, viewing the contending keys requires the
VIEWACTIVITY permission or above.

Resolves cockroachdb#75904

Release note (sql change): introducing
`crdb_internal.historical_contention_events` virtual table, that exposes
historical contention events. The events exposed in the new virtual
table also include transaction fingerprint IDs for both blocking and
waiting transactions. This allows the new virtual table to be joined
into statement statistics and transaction statistics tables.
The new virtual table requires at least the VIEWACTIVITYREDACTED permission
to access. However, viewing the contending keys requires the
VIEWACTIVITY permission or above. The contention events
are stored in memory. The number of contention events stored is
controlled via the 'sql.contention.event_store.capacity' cluster setting.

Release note (api change): introducing
GET `/_status/historicalcontentionevents` endpoint, that returns
cluster-wide in-memory historical contention events.
The endpoint requires at least the VIEWACTIVITYREDACTED permission
to access. However, exposing the contending keys requires the
VIEWACTIVITY permission or above. The contention events
are stored in memory. The number of contention events stored is
controlled via the 'sql.contention.event_store.capacity' cluster setting.
  • Loading branch information
Azhng committed Feb 24, 2022
1 parent fd8f40a commit a5b5fed
Show file tree
Hide file tree
Showing 28 changed files with 2,276 additions and 1,626 deletions.
45 changes: 45 additions & 0 deletions docs/generated/http/full.md
Original file line number Diff line number Diff line change
Expand Up @@ -4342,6 +4342,51 @@ Response object for issuing Transaction ID Resolution.



## HistoricalContentionEvents

`GET /_status/historicalcontentionevents`

HistoricalContentionEvents returns a list of un-aggregated contention
events sorted by the collection timestamp.

Support status: [reserved](#support-status)

#### Request Parameters







| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| node_id | [string](#cockroach.server.serverpb.HistoricalContentionEventsRequest-string) | | | [reserved](#support-status) |







#### Response Parameters







| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| events | [cockroach.sql.contentionpb.ExtendedContentionEvent](#cockroach.server.serverpb.HistoricalContentionEventsResponse-cockroach.sql.contentionpb.ExtendedContentionEvent) | repeated | | [reserved](#support-status) |







## RequestCA

`GET /_join/v1/ca`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ crdb_internal gossip_alerts table NULL NULL NULL
crdb_internal gossip_liveness table NULL NULL NULL
crdb_internal gossip_network table NULL NULL NULL
crdb_internal gossip_nodes table NULL NULL NULL
crdb_internal historical_contention_events table NULL NULL NULL
crdb_internal index_columns table NULL NULL NULL
crdb_internal index_usage_statistics table NULL NULL NULL
crdb_internal invalid_objects table NULL NULL NULL
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial1
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.historical_contention_events... writing output: debug/crdb_internal.historical_contention_events.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response... converting to JSON... writing binary output: debug/liveness.json... done
[node 1] node status... converting to JSON... writing binary output: debug/nodes/1/status.json... done
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial1_excluded
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.historical_contention_events... writing output: debug/crdb_internal.historical_contention_events.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response... converting to JSON... writing binary output: debug/liveness.json... done
[node 1] node status... converting to JSON... writing binary output: debug/nodes/1/status.json... done
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial2
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.historical_contention_events... writing output: debug/crdb_internal.historical_contention_events.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response... converting to JSON... writing binary output: debug/liveness.json... done
[node 1] node status... converting to JSON... writing binary output: debug/nodes/1/status.json... done
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.historical_contention_events... writing output: debug/crdb_internal.historical_contention_events.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response... converting to JSON... writing binary output: debug/liveness.json... done
[cluster] requesting CPU profiles
Expand Down
3 changes: 3 additions & 0 deletions pkg/cli/testdata/zip/testzip_concurrent
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ zip
[cluster] retrieving SQL data for crdb_internal.default_privileges...
[cluster] retrieving SQL data for crdb_internal.default_privileges: done
[cluster] retrieving SQL data for crdb_internal.default_privileges: writing output: debug/crdb_internal.default_privileges.txt...
[cluster] retrieving SQL data for crdb_internal.historical_contention_events...
[cluster] retrieving SQL data for crdb_internal.historical_contention_events: done
[cluster] retrieving SQL data for crdb_internal.historical_contention_events: writing output: debug/crdb_internal.historical_contention_events.txt...
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics...
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics: done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics: writing output: debug/crdb_internal.index_usage_statistics.txt...
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip_tenant
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ debug zip --concurrency=1 /dev/null
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.historical_contention_events... writing output: debug/crdb_internal.historical_contention_events.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response...
[cluster] requesting liveness: last request failed: rpc error: ...
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/zip_cluster_wide.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ var debugZipTablesPerCluster = []string{
"crdb_internal.invalid_objects",
"crdb_internal.index_usage_statistics",
"crdb_internal.table_indexes",
"crdb_internal.historical_contention_events",
}

// getNodesList constructs a NodesListResponse using the Nodes API. We need this while building
Expand Down
3 changes: 3 additions & 0 deletions pkg/rpc/auth_tenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ func (a tenantAuthorizer) authorize(
case "/cockroach.server.serverpb.Status/CancelLocalQuery":
return a.authTenant(tenID)

case "/cockroach.server.serverpb.Status/HistoricalContentionEvents":
return a.authTenant(tenID)

case "/cockroach.roachpb.Internal/GetSpanConfigs":
return a.authGetSpanConfigs(tenID, req.(*roachpb.GetSpanConfigsRequest))

Expand Down
24 changes: 24 additions & 0 deletions pkg/server/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -3293,6 +3293,30 @@ func (c *adminPrivilegeChecker) requireViewActivityAndNoViewActivityRedactedPerm
return nil
}

// hasViewActivityPermission reports whether the user identified by ctx may
// see un-redacted activity data: it returns true when the user is an admin or
// holds the VIEWACTIVITY role option, and false otherwise.
//
// Unlike the require* helpers, a missing permission is not an error here; an
// error is returned only when the lookup itself fails, in which case it has
// already been converted with serverError.
func (c *adminPrivilegeChecker) hasViewActivityPermission(
	ctx context.Context,
) (hasViewActivityPermission bool, _ error) {
	user, admin, err := c.getUserAndRole(ctx)
	if err != nil {
		return false, serverError(ctx, err)
	}
	if admin {
		// Admins are allowed without consulting role options.
		return true, nil
	}

	granted, err := c.hasRoleOption(ctx, user, roleoption.VIEWACTIVITY)
	if err != nil {
		return false, serverError(ctx, err)
	}
	return granted, nil
}

// Note that the function returns plain errors, and it is the caller's
// responsibility to convert them to serverErrors.
func (c *adminPrivilegeChecker) getUserAndRole(
Expand Down
1 change: 1 addition & 0 deletions pkg/server/serverpb/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type SQLStatusServer interface {
TableIndexStats(context.Context, *TableIndexStatsRequest) (*TableIndexStatsResponse, error)
UserSQLRoles(ctx context.Context, request *UserSQLRolesRequest) (*UserSQLRolesResponse, error)
TxnIDResolution(context.Context, *TxnIDResolutionRequest) (*TxnIDResolutionResponse, error)
HistoricalContentionEvents(context.Context, *HistoricalContentionEventsRequest) (*HistoricalContentionEventsResponse, error)
}

// OptionalNodesStatusServer is a StatusServer that is only optionally present
Expand Down
18 changes: 18 additions & 0 deletions pkg/server/serverpb/status.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1509,6 +1509,16 @@ message TxnIDResolutionResponse {
(gogoproto.nullable) = false];
}

// HistoricalContentionEventsRequest is the request message for the
// HistoricalContentionEvents RPC.
message HistoricalContentionEventsRequest {
// node_id optionally restricts the request to a single node. When it is
// empty, the status server collects events from all nodes; the server itself
// uses the value "local" when fanning a request out to each node.
string node_id = 1 [(gogoproto.customname) = "NodeID"];
}

// HistoricalContentionEventsResponse is the response message for the
// HistoricalContentionEvents RPC.
message HistoricalContentionEventsResponse {
// events holds the un-aggregated contention events that were collected. For a
// cluster-wide request the status server sorts them by collection timestamp.
repeated cockroach.sql.contentionpb.ExtendedContentionEvent events = 1 [
(gogoproto.nullable) = false
];
}

service Status {
// Certificates retrieves a copy of the TLS certificates.
rpc Certificates(CertificatesRequest) returns (CertificatesResponse) {
Expand Down Expand Up @@ -1917,4 +1927,12 @@ service Status {
// Client is responsible to perform retries if the requested transaction ID
// is not returned in the RPC response.
rpc TxnIDResolution(TxnIDResolutionRequest) returns (TxnIDResolutionResponse) {}

// HistoricalContentionEvents returns a list of un-aggregated contention
// events sorted by the collection timestamp.
rpc HistoricalContentionEvents(HistoricalContentionEventsRequest) returns (HistoricalContentionEventsResponse) {
option (google.api.http) = {
get: "/_status/historicalcontentionevents"
};
}
}
92 changes: 92 additions & 0 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,27 @@ func (b *baseStatusServer) localTxnIDResolution(
return resp
}

// localHistoricalContentionEvents gathers every event that the SQL server's
// contention registry yields from ForEachEvent into a single response.
//
// When shouldRedactContendingKey is true, the contending key of each returned
// event is replaced with an empty byte slice. The returned Events slice is
// always non-nil, even when the registry yields nothing.
func (b *baseStatusServer) localHistoricalContentionEvents(
	shouldRedactContendingKey bool,
) *serverpb.HistoricalContentionEventsResponse {
	registry := b.sqlServer.execCfg.ContentionRegistry

	resp := &serverpb.HistoricalContentionEventsResponse{
		Events: make([]contentionpb.ExtendedContentionEvent, 0),
	}
	// Ignore error returned by ForEachEvent() since if our own callback doesn't
	// return error, ForEachEvent() also doesn't return error.
	_ = registry.ForEachEvent(func(event *contentionpb.ExtendedContentionEvent) error {
		// Redact a local (shallow) copy rather than writing through the pointer
		// handed to us: whether that pointer aliases registry-owned state is not
		// visible from here, and redaction for one caller must never leak into
		// what a later, more privileged caller reads.
		returned := *event
		if shouldRedactContendingKey {
			returned.BlockingEvent.Key = []byte{}
		}
		resp.Events = append(resp.Events, returned)
		return nil
	})

	return resp
}

// A statusServer provides a RESTful status API.
type statusServer struct {
*baseStatusServer
Expand Down Expand Up @@ -3087,3 +3108,74 @@ func (s *statusServer) TxnIDResolution(

return statusClient.TxnIDResolution(ctx, req)
}

// HistoricalContentionEvents returns un-aggregated historical contention
// events.
//
// The caller must pass requireViewActivityOrViewActivityRedactedPermission.
// If the caller additionally lacks the VIEWACTIVITY permission (and is not an
// admin), locally collected events have their contending key redacted.
//
// Routing depends on req.NodeID:
//   - non-empty and resolving to this node: only the local registry is read;
//   - non-empty and resolving to another node: the request is forwarded to
//     that node and its response is returned as-is;
//   - empty: a "local" request is issued to every node via iterateNodes and
//     the combined events are sorted by CollectionTs.
//
// Errors: codes.Unavailable if this node's ID is not yet set,
// codes.InvalidArgument if req.NodeID cannot be parsed, or whatever the
// privilege checks, dialing, or iterateNodes return.
func (s *statusServer) HistoricalContentionEvents(
	ctx context.Context, req *serverpb.HistoricalContentionEventsRequest,
) (*serverpb.HistoricalContentionEventsResponse, error) {
	ctx = s.AnnotateCtx(propagateGatewayMetadata(ctx))

	if err := s.privilegeChecker.requireViewActivityOrViewActivityRedactedPermission(ctx); err != nil {
		return nil, err
	}

	hasViewActivity, err := s.privilegeChecker.hasViewActivityPermission(ctx)
	if err != nil {
		return nil, err
	}
	shouldRedactContendingKeys := !hasViewActivity

	if s.gossip.NodeID.Get() == 0 {
		return nil, status.Errorf(codes.Unavailable, "nodeID not set")
	}

	if len(req.NodeID) > 0 {
		requestedNodeID, local, err := s.parseNodeID(req.NodeID)
		if err != nil {
			// Use status.Error rather than status.Errorf: the error text is
			// data, not a format string, and may itself contain '%'.
			return nil, status.Error(codes.InvalidArgument, err.Error())
		}
		if local {
			return s.localHistoricalContentionEvents(shouldRedactContendingKeys), nil
		}

		statusClient, err := s.dialNode(ctx, requestedNodeID)
		if err != nil {
			return nil, err
		}
		return statusClient.HistoricalContentionEvents(ctx, req)
	}

	dialFn := func(ctx context.Context, nodeID roachpb.NodeID) (interface{}, error) {
		statusClient, err := s.dialNode(ctx, nodeID)
		return statusClient, err
	}

	// Each node is asked only for its own events; the fan-out happens here.
	localReq := &serverpb.HistoricalContentionEventsRequest{
		NodeID: "local",
	}

	resp := &serverpb.HistoricalContentionEventsResponse{
		Events: make([]contentionpb.ExtendedContentionEvent, 0),
	}

	if err := s.iterateNodes(ctx, "historical contention events for node",
		dialFn,
		func(ctx context.Context, client interface{}, _ roachpb.NodeID) (interface{}, error) {
			statusClient := client.(serverpb.StatusClient)
			return statusClient.HistoricalContentionEvents(ctx, localReq)
		},
		func(nodeID roachpb.NodeID, nodeResp interface{}) {
			historicalContentionEvents := nodeResp.(*serverpb.HistoricalContentionEventsResponse)
			resp.Events = append(resp.Events, historicalContentionEvents.Events...)
		},
		func(nodeID roachpb.NodeID, nodeFnError error) {
			// This callback discards per-node errors, so events from a node
			// that failed are simply absent from the combined result.
			// NOTE(review): consider logging nodeFnError here.
		},
	); err != nil {
		return nil, err
	}

	// Events arrive grouped by node; order them globally by collection time.
	sort.Slice(resp.Events, func(i, j int) bool {
		return resp.Events[i].CollectionTs.Before(resp.Events[j].CollectionTs)
	})

	return resp, nil
}
Loading

0 comments on commit a5b5fed

Please sign in to comment.