Skip to content

Commit

Permalink
sql: add stmt error code to execution insights tables
Browse files Browse the repository at this point in the history
Part of: cockroachdb#87785.

Previously, the insights subsystem did not keep track of `error code` for
failed executions and only contained a `FailedExecution` value for the
problem field. This commit adds the `error_code` column to the
`crdb_internal.{cluster/node}_execution_insights` virtual tables. This
commit also bubbles up that error code at the transaction level for
writing into the `crdb_internal.{cluster/node}_txn_execution_insights`
virtual table.

Release note (sql change): Adds `error_code` column to the
`crdb_internal.{cluster/node}_execution_insights` virtual tables, which
contains the error code for a failed execution. Adds `last_error_code`
column to the `crdb_internal.{cluster/node}_txn_execution_insights`
virtual tables, which is the error code of the last failed statement in
that transaction.
  • Loading branch information
gtr committed Feb 10, 2023
1 parent f8eb2ef commit 6c783f3
Show file tree
Hide file tree
Showing 11 changed files with 127 additions and 21 deletions.
18 changes: 14 additions & 4 deletions pkg/ccl/logictestccl/testdata/logic_test/crdb_internal_tenant
Original file line number Diff line number Diff line change
Expand Up @@ -310,15 +310,25 @@ SELECT * FROM crdb_internal.node_inflight_trace_spans WHERE span_id < 0
----
trace_id parent_span_id span_id goroutine_id finished start_time duration operation

query TTTBTTTTTIITITTTTTI colnames
query TTTBTTTTTIITITTTTTTTTTTTTTIT colnames
SELECT * FROM crdb_internal.cluster_execution_insights WHERE query = ''
----
session_id txn_id txn_fingerprint_id stmt_id stmt_fingerprint_id problem causes query status start_time end_time full_scan user_name app_name database_name plan_gist rows_read rows_written priority retries last_retry_reason exec_node_ids contention contention_events index_recommendations implicit_txn cpu_sql_nanos error_code

query TTTBTTTTTIITITTTTTTTTTTTTTIT colnames
SELECT * FROM crdb_internal.node_execution_insights WHERE query = ''
----
session_id txn_id txn_fingerprint_id stmt_id stmt_fingerprint_id problem causes query status start_time end_time full_scan user_name app_name database_name plan_gist rows_read rows_written priority retries last_retry_reason exec_node_ids contention contention_events index_recommendations implicit_txn cpu_sql_nanos error_code

query TTTBTTTTTIITITTTTTIT colnames
SELECT * FROM crdb_internal.cluster_txn_execution_insights WHERE query = ''
----
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos last_error_code

query TTTBTTTTTIITITTTTTI colnames
query TTTBTTTTTIITITTTTTIT colnames
SELECT * FROM crdb_internal.node_txn_execution_insights WHERE query = ''
----
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos
txn_id txn_fingerprint_id query implicit_txn session_id start_time end_time user_name app_name rows_read rows_written priority retries last_retry_reason contention problems causes stmt_execution_ids cpu_sql_nanos last_error_code

query ITTI
SELECT range_id, start_pretty, end_pretty, lease_holder FROM crdb_internal.ranges
Expand Down
15 changes: 13 additions & 2 deletions pkg/sql/crdb_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -7042,7 +7042,8 @@ CREATE TABLE crdb_internal.%s (
problems STRING[] NOT NULL,
causes STRING[] NOT NULL,
stmt_execution_ids STRING[] NOT NULL,
cpu_sql_nanos INT8
cpu_sql_nanos INT8,
last_error_code STRING
)`

var crdbInternalClusterTxnExecutionInsightsTable = virtualSchemaTable{
Expand Down Expand Up @@ -7151,6 +7152,13 @@ func populateTxnExecutionInsights(
}
}

errorCode := tree.DNull
for _, stmt := range insight.Statements {
if stmt.ErrorCode != "" {
errorCode = tree.NewDString(stmt.ErrorCode)
}
}

err = errors.CombineErrors(err, addRow(
tree.NewDUuid(tree.DUuid{UUID: insight.Transaction.ID}),
tree.NewDBytes(tree.DBytes(sqlstatsutil.EncodeUint64ToBytes(uint64(insight.Transaction.FingerprintID)))),
Expand All @@ -7171,6 +7179,7 @@ func populateTxnExecutionInsights(
causes,
stmtIDs,
tree.NewDInt(tree.DInt(insight.Transaction.CPUSQLNanos)),
errorCode,
))

if err != nil {
Expand Down Expand Up @@ -7210,7 +7219,8 @@ CREATE TABLE crdb_internal.%s (
contention_events JSONB,
index_recommendations STRING[] NOT NULL,
implicit_txn BOOL NOT NULL,
cpu_sql_nanos INT8
cpu_sql_nanos INT8,
error_code STRING
)`

var crdbInternalClusterExecutionInsightsTable = virtualSchemaTable{
Expand Down Expand Up @@ -7341,6 +7351,7 @@ func populateStmtInsights(
indexRecommendations,
tree.MakeDBool(tree.DBool(insight.Transaction.ImplicitTxn)),
tree.NewDInt(tree.DInt(s.CPUSQLNanos)),
tree.NewDString(s.ErrorCode),
))
}
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/sql/logictest/testdata/logic_test/crdb_internal_catalog

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pkg/sql/sqlstats/insights/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ go_test(
"//pkg/settings/cluster",
"//pkg/sql/appstatspb",
"//pkg/sql/clusterunique",
"//pkg/sql/pgwire/pgcode",
"//pkg/util/stop",
"//pkg/util/uint128",
"//pkg/util/uuid",
Expand Down
4 changes: 4 additions & 0 deletions pkg/sql/sqlstats/insights/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,7 @@ func (d *latencyThresholdDetector) enabled() bool {
func (d *latencyThresholdDetector) isSlow(s *Statement) bool {
return d.enabled() && s.LatencyInSeconds >= LatencyThreshold.Get(&d.st.SV).Seconds()
}

func isFailed(s *Statement) bool {
return s.Status == Statement_Failed
}
60 changes: 60 additions & 0 deletions pkg/sql/sqlstats/insights/detector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,66 @@ func TestLatencyQuantileDetector(t *testing.T) {
}
})

// Testing the slow and failure detectors at the same time.
t.Run("isSlow and isFailed", func(t *testing.T) {
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
AnomalyDetectionEnabled.Override(ctx, &st.SV, true)
AnomalyDetectionLatencyThreshold.Override(ctx, &st.SV, 100*time.Millisecond)

tests := []struct {
name string
seedLatency time.Duration
candidateLatency time.Duration
status Statement_Status
isSlow bool
isFailed bool
}{{
name: "slow and failed statement",
seedLatency: 100 * time.Millisecond,
candidateLatency: 200 * time.Millisecond,
status: Statement_Failed,
isSlow: true,
isFailed: true,
}, {
name: "slow and non-failed statement",
seedLatency: 100 * time.Millisecond,
candidateLatency: 200 * time.Millisecond,
status: Statement_Completed,
isSlow: true,
isFailed: false,
}, {
name: "fast and non-failed statement",
seedLatency: 100 * time.Millisecond,
candidateLatency: 50 * time.Millisecond,
status: Statement_Completed,
isSlow: false,
isFailed: false,
}, {
name: "fast and failed statement",
seedLatency: 100 * time.Millisecond,
candidateLatency: 50 * time.Millisecond,
status: Statement_Failed,
isSlow: false,
isFailed: true,
}}

for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
d := newAnomalyDetector(st, NewMetrics())
for i := 0; i < 1000; i++ {
d.isSlow(&Statement{LatencyInSeconds: test.seedLatency.Seconds()})
}
stmt := &Statement{LatencyInSeconds: test.candidateLatency.Seconds(), Status: test.status}
require.Equal(t, test.isSlow, d.isSlow(stmt))
require.Equal(t, test.isFailed, isFailed(stmt))
})
}
})

// The statement is fast and has failed. This should hit the failure
// detector but not the slow detector.

t.Run("metrics", func(t *testing.T) {
ctx := context.Background()
st := cluster.MakeTestingClusterSettings()
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/sqlstats/insights/insights.proto
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ message Statement {
Problem problem = 21;
repeated Cause causes = 22;
int64 cpu_sql_nanos = 23 [(gogoproto.customname) = "CPUSQLNanos"];
string error_code = 24;
}


Expand Down
12 changes: 8 additions & 4 deletions pkg/sql/sqlstats/insights/integration/insights_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ func TestInsightsIntegration(t *testing.T) {
"end_time, "+
"full_scan, "+
"implicit_txn, "+
"cpu_sql_nanos "+
"cpu_sql_nanos, "+
"error_code "+
"FROM crdb_internal.node_execution_insights where "+
"query = $1 and app_name = $2 ", "SELECT pg_sleep($1)", appName)

Expand All @@ -109,7 +110,8 @@ func TestInsightsIntegration(t *testing.T) {
var fullScan bool
var implicitTxn bool
var cpuSQLNanos int64
err = row.Scan(&query, &status, &startInsights, &endInsights, &fullScan, &implicitTxn, &cpuSQLNanos)
var errorCode string
err = row.Scan(&query, &status, &startInsights, &endInsights, &fullScan, &implicitTxn, &cpuSQLNanos, &errorCode)

if err != nil {
return err
Expand Down Expand Up @@ -142,15 +144,17 @@ func TestInsightsIntegration(t *testing.T) {
"start_time, "+
"end_time, "+
"implicit_txn, "+
"cpu_sql_nanos "+
"cpu_sql_nanos, "+
"error_code "+
"FROM crdb_internal.cluster_txn_execution_insights WHERE "+
"query = $1 and app_name = $2 ", "SELECT pg_sleep($1)", appName)

var query string
var startInsights, endInsights time.Time
var implicitTxn bool
var cpuSQLNanos int64
err = row.Scan(&query, &startInsights, &endInsights, &implicitTxn, &cpuSQLNanos)
var errorCode string
err = row.Scan(&query, &startInsights, &endInsights, &implicitTxn, &cpuSQLNanos, &errorCode)

if err != nil {
return err
Expand Down
15 changes: 7 additions & 8 deletions pkg/sql/sqlstats/insights/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,11 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac
delete(r.statements, sessionID)
defer statements.release()

var slowStatements intsets.Fast
// Mark statements which are detected as slow or have a failed status.
var slowOrFailedStatements intsets.Fast
for i, s := range *statements {
if r.detector.isSlow(s) {
slowStatements.Add(i)
if r.detector.isSlow(s) || isFailed(s) {
slowOrFailedStatements.Add(i)
}
}

Expand All @@ -112,8 +113,8 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac
highContention = transaction.Contention.Seconds() >= LatencyThreshold.Get(&r.causes.st.SV).Seconds()
}

if slowStatements.Empty() && !highContention {
// We only record an insight if we have slow statements or high txn contention.
if slowOrFailedStatements.Empty() && !highContention {
// We only record an insight if we have slow or failed statements or high txn contention.
return
}

Expand All @@ -127,14 +128,12 @@ func (r *lockingRegistry) ObserveTransaction(sessionID clusterunique.ID, transac
}

for i, s := range *statements {
if slowStatements.Contains(i) {
if slowOrFailedStatements.Contains(i) {
switch s.Status {
case Statement_Completed:
s.Problem = Problem_SlowExecution
s.Causes = r.causes.examine(s.Causes, s)
case Statement_Failed:
// Note that we'll be building better failure support for 23.1.
// For now, we only mark failed statements that were also slow.
s.Problem = Problem_FailedExecution
}

Expand Down
12 changes: 11 additions & 1 deletion pkg/sql/sqlstats/insights/registry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/appstatspb"
"github.com/cockroachdb/cockroach/pkg/sql/clusterunique"
"github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode"
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/stretchr/testify/require"
)
Expand All @@ -32,6 +33,14 @@ func newStmtWithProblemAndCauses(stmt *Statement, problem Problem, causes []Caus
return &newStmt
}

// Return a new failed statement with an added errorCode.
func newFailedStmtWithErrorCode(stmt *Statement, errorCode string) *Statement {
newStmt := *stmt
newStmt.Problem = Problem_FailedExecution
newStmt.ErrorCode = errorCode
return &newStmt
}

func TestRegistry(t *testing.T) {
ctx := context.Background()

Expand Down Expand Up @@ -80,6 +89,7 @@ func TestRegistry(t *testing.T) {
FingerprintID: appstatspb.StmtFingerprintID(100),
LatencyInSeconds: 2,
Status: Statement_Failed,
ErrorCode: "22012",
}

st := cluster.MakeTestingClusterSettings()
Expand All @@ -93,7 +103,7 @@ func TestRegistry(t *testing.T) {
Session: session,
Transaction: transaction,
Statements: []*Statement{
newStmtWithProblemAndCauses(statement, Problem_FailedExecution, nil),
newFailedStmtWithErrorCode(statement, pgcode.DivisionByZero.String()),
},
}}
var actual []*Insight
Expand Down
6 changes: 6 additions & 0 deletions pkg/sql/sqlstats/ssmemstorage/ss_mem_writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,11 @@ func (s *Container) RecordStatement(
cpuSQLNanos = value.ExecStats.CPUTime.Nanoseconds()
}

var errorCode string
if value.StatementError != nil {
errorCode = pgerror.GetPGCode(value.StatementError).String()
}

s.insights.ObserveStatement(value.SessionID, &insights.Statement{
ID: value.StatementID,
FingerprintID: stmtFingerprintID,
Expand All @@ -222,6 +227,7 @@ func (s *Container) RecordStatement(
IndexRecommendations: value.IndexRecommendations,
Database: value.Database,
CPUSQLNanos: cpuSQLNanos,
ErrorCode: errorCode,
})

return stats.ID, nil
Expand Down

0 comments on commit 6c783f3

Please sign in to comment.