Skip to content

Commit

Permalink
sql: introduce crdb_internal.historical_contention_events virtual table
Browse files Browse the repository at this point in the history
This commit introduces `crdb_internal.historical_contention_events`
virtual table. This virtual table exposes historical contention events
annotated with transaction fingerprint IDs for transactions that have
finished executing. This allows this virtual table to be joined into the
statement statistics and transaction statistics tables.
The new virtual table requires at least VIEWACTIVITYREDACTED permission
to access. However, in order to view the contending keys, it would
require at least VIEWACTIVITY permission or above.

Resolves cockroachdb#75904

Release note (sql change): introducing
`crdb_internal.historical_contention_events` virtual table, that exposes
historical contention events. The events exposed in the new virtual
table also include transaction fingerprint IDs for both blocking and
waiting transactions. This allows the new virtual table to be joined
into statement statistics and transaction statistics tables.
The new virtual table requires either VIEWACTIVITYREDACTED or
VIEWACTIVITY role option to access. However, in order to view the
contending keys, it would require VIEWACTIVITY role option. The contention
events are stored in memory. The amount of contention events stored is
controlled via 'sql.contention.event_store.capacity' cluster setting.

Release note (api change): introducing
GET `/_status/historicalcontentionevents` endpoint, that returns
cluster-wide in-memory historical contention events.
The endpoint requires either VIEWACTIVITYREDACTED or VIEWACTIVITY role
option to access. However, in order to view the contending keys, it would
require VIEWACTIVITY role option. The contention events are stored in memory.
The amount of contention events stored is controlled via
'sql.contention.event_store.capacity' cluster setting.
  • Loading branch information
Azhng committed Feb 28, 2022
1 parent 9b45e99 commit c363db3
Show file tree
Hide file tree
Showing 28 changed files with 2,321 additions and 1,626 deletions.
45 changes: 45 additions & 0 deletions docs/generated/http/full.md
Original file line number Diff line number Diff line change
Expand Up @@ -4342,6 +4342,51 @@ Response object for issuing Transaction ID Resolution.



## HistoricalContentionEvents

`GET /_status/historicalcontentionevents`

HistoricalContentionEvents returns a list of un-aggregated contention
events sorted by the collection timestamp.

Support status: [reserved](#support-status)

#### Request Parameters







| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| node_id | [string](#cockroach.server.serverpb.HistoricalContentionEventsRequest-string) | | | [reserved](#support-status) |







#### Response Parameters







| Field | Type | Label | Description | Support status |
| ----- | ---- | ----- | ----------- | -------------- |
| events | [cockroach.sql.contentionpb.ExtendedContentionEvent](#cockroach.server.serverpb.HistoricalContentionEventsResponse-cockroach.sql.contentionpb.ExtendedContentionEvent) | repeated | | [reserved](#support-status) |







## RequestCA

`GET /_join/v1/ca`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ crdb_internal gossip_alerts table NULL NULL NULL
crdb_internal gossip_liveness table NULL NULL NULL
crdb_internal gossip_network table NULL NULL NULL
crdb_internal gossip_nodes table NULL NULL NULL
crdb_internal historical_contention_events table NULL NULL NULL
crdb_internal index_columns table NULL NULL NULL
crdb_internal index_usage_statistics table NULL NULL NULL
crdb_internal invalid_objects table NULL NULL NULL
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial1
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ debug zip --concurrency=1 --cpu-profile-duration=0s /dev/null
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.transaction_contention_events... writing output: debug/crdb_internal.transaction_contention_events.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response... converting to JSON... writing binary output: debug/liveness.json... done
[node 1] node status... converting to JSON... writing binary output: debug/nodes/1/status.json... done
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial1_excluded
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ debug zip /dev/null --concurrency=1 --exclude-nodes=2 --cpu-profile-duration=0
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.transaction_contention_events... writing output: debug/crdb_internal.transaction_contention_events.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response... converting to JSON... writing binary output: debug/liveness.json... done
[node 1] node status... converting to JSON... writing binary output: debug/nodes/1/status.json... done
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/partial2
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ debug zip --concurrency=1 --cpu-profile-duration=0 /dev/null
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.transaction_contention_events... writing output: debug/crdb_internal.transaction_contention_events.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response... converting to JSON... writing binary output: debug/liveness.json... done
[node 1] node status... converting to JSON... writing binary output: debug/nodes/1/status.json... done
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.transaction_contention_events... writing output: debug/crdb_internal.transaction_contention_events.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response... converting to JSON... writing binary output: debug/liveness.json... done
[cluster] requesting CPU profiles
Expand Down
3 changes: 3 additions & 0 deletions pkg/cli/testdata/zip/testzip_concurrent
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ zip
[cluster] retrieving SQL data for crdb_internal.table_indexes...
[cluster] retrieving SQL data for crdb_internal.table_indexes: done
[cluster] retrieving SQL data for crdb_internal.table_indexes: writing output: debug/crdb_internal.table_indexes.txt...
[cluster] retrieving SQL data for crdb_internal.transaction_contention_events...
[cluster] retrieving SQL data for crdb_internal.transaction_contention_events: done
[cluster] retrieving SQL data for crdb_internal.transaction_contention_events: writing output: debug/crdb_internal.transaction_contention_events.txt...
[cluster] retrieving SQL data for crdb_internal.zones...
[cluster] retrieving SQL data for crdb_internal.zones: done
[cluster] retrieving SQL data for crdb_internal.zones: writing output: debug/crdb_internal.zones.txt...
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/testdata/zip/testzip_tenant
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ debug zip --concurrency=1 --cpu-profile-duration=1s /dev/null
[cluster] retrieving SQL data for crdb_internal.invalid_objects... writing output: debug/crdb_internal.invalid_objects.txt... done
[cluster] retrieving SQL data for crdb_internal.index_usage_statistics... writing output: debug/crdb_internal.index_usage_statistics.txt... done
[cluster] retrieving SQL data for crdb_internal.table_indexes... writing output: debug/crdb_internal.table_indexes.txt... done
[cluster] retrieving SQL data for crdb_internal.historical_contention_events... writing output: debug/crdb_internal.historical_contention_events.txt... done
[cluster] requesting nodes... received response... converting to JSON... writing binary output: debug/nodes.json... done
[cluster] requesting liveness... received response...
[cluster] requesting liveness: last request failed: rpc error: ...
Expand Down
1 change: 1 addition & 0 deletions pkg/cli/zip_cluster_wide.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ var debugZipTablesPerCluster = []string{
"crdb_internal.invalid_objects",
"crdb_internal.index_usage_statistics",
"crdb_internal.table_indexes",
"crdb_internal.transaction_contention_events",
}

// nodesInfo holds node details pulled from a SQL or storage node.
Expand Down
3 changes: 3 additions & 0 deletions pkg/rpc/auth_tenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ func (a tenantAuthorizer) authorize(
case "/cockroach.server.serverpb.Status/CancelLocalQuery":
return a.authTenant(tenID)

case "/cockroach.server.serverpb.Status/HistoricalContentionEvents":
return a.authTenant(tenID)

case "/cockroach.roachpb.Internal/GetSpanConfigs":
return a.authGetSpanConfigs(tenID, req.(*roachpb.GetSpanConfigsRequest))

Expand Down
20 changes: 20 additions & 0 deletions pkg/server/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -3293,6 +3293,26 @@ func (c *adminPrivilegeChecker) requireViewActivityAndNoViewActivityRedactedPerm
return nil
}

// hasViewActivityRedactedRoleOption reports whether the user resolved from ctx
// (via getUserAndRole) has the VIEWACTIVITYREDACTED role option.
//
// If resolving the user or looking up the role option fails, the result is
// false and the error is returned wrapped by serverError.
func (c *adminPrivilegeChecker) hasViewActivityRedactedRoleOption(
	ctx context.Context,
) (hasViewActivityRedacted bool, _ error) {
	userName, _, err := c.getUserAndRole(ctx)
	if err != nil {
		return false, serverError(ctx, err)
	}

	hasOption, err := c.hasRoleOption(ctx, userName, roleoption.VIEWACTIVITYREDACTED)
	if err != nil {
		return false, serverError(ctx, err)
	}

	return hasOption, nil
}

// Note that the function returns plain errors, and it is the caller's
// responsibility to convert them to serverErrors.
func (c *adminPrivilegeChecker) getUserAndRole(
Expand Down
1 change: 1 addition & 0 deletions pkg/server/serverpb/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ type SQLStatusServer interface {
TableIndexStats(context.Context, *TableIndexStatsRequest) (*TableIndexStatsResponse, error)
UserSQLRoles(ctx context.Context, request *UserSQLRolesRequest) (*UserSQLRolesResponse, error)
TxnIDResolution(context.Context, *TxnIDResolutionRequest) (*TxnIDResolutionResponse, error)
TransactionContentionEvents(context.Context, *TransactionContentionEventsRequest) (*TransactionContentionEventsResponse, error)
}

// OptionalNodesStatusServer is a StatusServer that is only optionally present
Expand Down
18 changes: 18 additions & 0 deletions pkg/server/serverpb/status.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1512,6 +1512,16 @@ message TxnIDResolutionResponse {
(gogoproto.nullable) = false];
}

// TransactionContentionEventsRequest is the request message for the
// TransactionContentionEvents RPC.
message TransactionContentionEventsRequest {
// node_id optionally selects a single node to query. The status server
// handler fans the request out to all nodes when this field is empty.
string node_id = 1 [(gogoproto.customname) = "NodeID"];
}

// TransactionContentionEventsResponse is the response message for the
// TransactionContentionEvents RPC.
message TransactionContentionEventsResponse {
// events is the list of contention events being returned. The field is
// generated as a slice of non-pointer values (gogoproto.nullable = false).
repeated cockroach.sql.contentionpb.ExtendedContentionEvent events = 1 [
(gogoproto.nullable) = false
];
}

service Status {
// Certificates retrieves a copy of the TLS certificates.
rpc Certificates(CertificatesRequest) returns (CertificatesResponse) {
Expand Down Expand Up @@ -1920,4 +1930,12 @@ service Status {
// Client is responsible to perform retries if the requested transaction ID
// is not returned in the RPC response.
rpc TxnIDResolution(TxnIDResolutionRequest) returns (TxnIDResolutionResponse) {}

// TransactionContentionEvents returns a list of un-aggregated contention
// events sorted by the collection timestamp.
rpc TransactionContentionEvents(TransactionContentionEventsRequest) returns (TransactionContentionEventsResponse) {
option (google.api.http) = {
get: "/_status/transactioncontentionevents"
};
}
}
99 changes: 99 additions & 0 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,27 @@ func (b *baseStatusServer) localTxnIDResolution(
return resp
}

// localTransactionContentionEvents gathers the contention events exposed by
// this server's contention registry into a response message.
//
// When shouldRedactContendingKey is true, the contending key of every returned
// event is replaced with an empty byte slice.
func (b *baseStatusServer) localTransactionContentionEvents(
	shouldRedactContendingKey bool,
) *serverpb.TransactionContentionEventsResponse {
	registry := b.sqlServer.execCfg.ContentionRegistry

	resp := &serverpb.TransactionContentionEventsResponse{
		Events: make([]contentionpb.ExtendedContentionEvent, 0),
	}
	// Ignore error returned by ForEachEvent() since if our own callback doesn't
	// return error, ForEachEvent() also doesn't return error.
	_ = registry.ForEachEvent(func(event *contentionpb.ExtendedContentionEvent) error {
		// Redact a local copy rather than writing through the pointer handed
		// to us, so that redaction can never modify state owned by the
		// registry.
		// NOTE(review): assumes BlockingEvent is held by value inside
		// ExtendedContentionEvent; if it is a pointer this shallow copy still
		// aliases it -- confirm against the contentionpb definition.
		eventCopy := *event
		if shouldRedactContendingKey {
			eventCopy.BlockingEvent.Key = []byte{}
		}
		resp.Events = append(resp.Events, eventCopy)
		return nil
	})

	return resp
}

// A statusServer provides a RESTful status API.
type statusServer struct {
*baseStatusServer
Expand Down Expand Up @@ -3000,3 +3021,81 @@ func (s *statusServer) TxnIDResolution(

return statusClient.TxnIDResolution(ctx, req)
}

// TransactionContentionEvents returns un-aggregated contention events, sorted
// by collection timestamp when gathered from multiple nodes.
//
// The caller must pass requireViewActivityOrViewActivityRedactedPermission.
// If the caller's SQL roles include VIEWACTIVITYREDACTED, the contending key
// is redacted from locally served events.
//
// If req.NodeID is set, only that node is consulted: the request is served
// locally when it refers to this node and is forwarded otherwise. If
// req.NodeID is empty, the request is fanned out to every node with NodeID
// "local" and the per-node results are merged.
func (s *statusServer) TransactionContentionEvents(
	ctx context.Context, req *serverpb.TransactionContentionEventsRequest,
) (*serverpb.TransactionContentionEventsResponse, error) {
	ctx = s.AnnotateCtx(propagateGatewayMetadata(ctx))

	if err := s.privilegeChecker.requireViewActivityOrViewActivityRedactedPermission(ctx); err != nil {
		return nil, err
	}

	roles, err := s.UserSQLRoles(ctx, &serverpb.UserSQLRolesRequest{})
	if err != nil {
		return nil, err
	}

	shouldRedactContendingKey := false
	for _, role := range roles.Roles {
		if role == roleoption.VIEWACTIVITYREDACTED.String() {
			shouldRedactContendingKey = true
			break
		}
	}

	if s.gossip.NodeID.Get() == 0 {
		return nil, status.Errorf(codes.Unavailable, "nodeID not set")
	}

	if len(req.NodeID) > 0 {
		requestedNodeID, local, err := s.parseNodeID(req.NodeID)
		if err != nil {
			// Pass the message as an argument, never as the format string:
			// a '%' in the error text would otherwise be interpreted as a
			// formatting verb and garble the message.
			return nil, status.Errorf(codes.InvalidArgument, "%s", err.Error())
		}
		if local {
			return s.localTransactionContentionEvents(shouldRedactContendingKey), nil
		}

		statusClient, err := s.dialNode(ctx, requestedNodeID)
		if err != nil {
			return nil, err
		}
		return statusClient.TransactionContentionEvents(ctx, req)
	}

	dialFn := func(ctx context.Context, nodeID roachpb.NodeID) (interface{}, error) {
		statusClient, err := s.dialNode(ctx, nodeID)
		return statusClient, err
	}

	// Each node is asked for its own ("local") events only, so the fan-out
	// does not recurse.
	rpcCallFn := func(ctx context.Context, client interface{}, _ roachpb.NodeID) (interface{}, error) {
		statusClient := client.(serverpb.StatusClient)
		return statusClient.TransactionContentionEvents(ctx, &serverpb.TransactionContentionEventsRequest{
			NodeID: "local",
		})
	}

	resp := &serverpb.TransactionContentionEventsResponse{
		Events: make([]contentionpb.ExtendedContentionEvent, 0),
	}

	if err := s.iterateNodes(ctx, "txn contention events for node",
		dialFn,
		rpcCallFn,
		func(nodeID roachpb.NodeID, nodeResp interface{}) {
			txnContentionEvents := nodeResp.(*serverpb.TransactionContentionEventsResponse)
			resp.Events = append(resp.Events, txnContentionEvents.Events...)
		},
		func(nodeID roachpb.NodeID, nodeFnError error) {
			// Errors reported for individual nodes are dropped here; the
			// response is built from the nodes that did answer.
		},
	); err != nil {
		return nil, err
	}

	sort.Slice(resp.Events, func(i, j int) bool {
		return resp.Events[i].CollectionTs.Before(resp.Events[j].CollectionTs)
	})

	return resp, nil
}
Loading

0 comments on commit c363db3

Please sign in to comment.