From fcc4454eb1a6defb0f5e67dbc22e314a3d17505f Mon Sep 17 00:00:00 2001 From: Rafi Shamim Date: Fri, 1 Dec 2023 16:47:48 -0500 Subject: [PATCH 1/4] sql: add telemetry for mixed DDL/DML transactions This patch adds feature counter telemetry for explicit transactions that have schema changes. We track is a transaction has DDL only or a mixture of DDL and DML, and if it succeeded or failed. Release note: None --- pkg/sql/conn_executor_exec.go | 31 +++++++++++++++++++++- pkg/sql/sqltelemetry/schema.go | 16 +++++++++++ pkg/sql/testdata/telemetry/schema | 44 +++++++++++++++++++++++++++++++ 3 files changed, 90 insertions(+), 1 deletion(-) diff --git a/pkg/sql/conn_executor_exec.go b/pkg/sql/conn_executor_exec.go index 6021132db56e..24e45b725e71 100644 --- a/pkg/sql/conn_executor_exec.go +++ b/pkg/sql/conn_executor_exec.go @@ -1445,10 +1445,15 @@ func (ex *connExecutor) reportSessionDataChanges(fn func() error) error { return nil } -func (ex *connExecutor) commitSQLTransactionInternal(ctx context.Context) error { +func (ex *connExecutor) commitSQLTransactionInternal(ctx context.Context) (retErr error) { ctx, sp := tracing.EnsureChildSpan(ctx, ex.server.cfg.AmbientCtx.Tracer, "commit sql txn") defer sp.Finish() + defer func() { + failed := retErr != nil + ex.recordDDLTxnTelemetry(failed) + }() + if err := ex.extraTxnState.sqlCursors.closeAll(true /* errorOnWithHold */); err != nil { return err } @@ -1525,6 +1530,29 @@ func (ex *connExecutor) commitSQLTransactionInternal(ctx context.Context) error return nil } +// recordDDLTxnTelemetry records telemetry for explicit transactions that +// contain DDL. +func (ex *connExecutor) recordDDLTxnTelemetry(failed bool) { + numDDL, numStmts := ex.extraTxnState.numDDL, ex.state.mu.stmtCount + if numDDL == 0 || ex.implicitTxn() { + return + } + // Subtract 1 statement so the COMMIT/ROLLBACK is not counted. + if numDDL == numStmts-1 { + if failed { + telemetry.Inc(sqltelemetry.DDLOnlyTransactionFailureCounter) + } else { + telemetry.Inc(sqltelemetry.DDLOnlyTransactionSuccessCounter) + } + } else /* numDDL != numStmts-1 */ { + if failed { + telemetry.Inc(sqltelemetry.MixedDDLDMLTransactionFailureCounter) + } else { + telemetry.Inc(sqltelemetry.MixedDDLDMLTransactionSuccessCounter) + } + } +} + // createJobs creates jobs for the records cached in schemaChangeJobRecords // during this transaction. func (ex *connExecutor) createJobs(ctx context.Context) error { @@ -1558,6 +1586,7 @@ func (ex *connExecutor) rollbackSQLTransaction( } ex.extraTxnState.prepStmtsNamespace.closeAllPortals(ctx, &ex.extraTxnState.prepStmtsNamespaceMemAcc) + ex.recordDDLTxnTelemetry(true /* failed */) if err := ex.state.mu.txn.Rollback(ctx); err != nil { log.Warningf(ctx, "txn rollback failed: %s", err) diff --git a/pkg/sql/sqltelemetry/schema.go b/pkg/sql/sqltelemetry/schema.go index a4f19b9962ea..381a9434742a 100644 --- a/pkg/sql/sqltelemetry/schema.go +++ b/pkg/sql/sqltelemetry/schema.go @@ -203,3 +203,19 @@ var DeclarativeSchemaChangerCounter = telemetry.GetCounterOnce("sql.schema.schem // LegacySchemaChangerCounter is incremented whenever the legacy schema changer // is used. var LegacySchemaChangerCounter = telemetry.GetCounterOnce("sql.schema.schema_changer_mode.legacy") + +// MixedDDLDMLTransactionSuccessCounter is incremented whenever an explicit +// transaction that has both DDL and DML statements succeeds. +var MixedDDLDMLTransactionSuccessCounter = telemetry.GetCounterOnce("sql.schema.transaction.mixed_ddl_dml.success") + +// MixedDDLDMLTransactionFailureCounter is incremented whenever an explicit +// transaction that has both DDL and DML statements fails. +var MixedDDLDMLTransactionFailureCounter = telemetry.GetCounterOnce("sql.schema.transaction.mixed_ddl_dml.failure") + +// DDLOnlyTransactionSuccessCounter is incremented whenever an explicit +// transaction that has only DDL statements succeeds. +var DDLOnlyTransactionSuccessCounter = telemetry.GetCounterOnce("sql.schema.transaction.ddl_only.success") + +// DDLOnlyTransactionFailureCounter is incremented whenever an explicit +// transaction that has only DDL statements fails. +var DDLOnlyTransactionFailureCounter = telemetry.GetCounterOnce("sql.schema.transaction.ddl_only.failure") diff --git a/pkg/sql/testdata/telemetry/schema b/pkg/sql/testdata/telemetry/schema index bb9dadb97eb7..0732042cb691 100644 --- a/pkg/sql/testdata/telemetry/schema +++ b/pkg/sql/testdata/telemetry/schema @@ -171,3 +171,47 @@ CREATE TYPE composite_typ AS (a int, b int) ---- sql.schema.create_type sql.schema.schema_changer_mode.legacy + +exec +BEGIN; +SELECT 1; +DROP INDEX t_b_idx; +SELECT 1/0; +---- +error: pq: division by zero + +feature-usage +ROLLBACK; +---- +sql.schema.transaction.mixed_ddl_dml.failure + +exec +BEGIN; +DROP INDEX t_b_idx; +---- + +feature-usage +ROLLBACK; +---- +sql.schema.transaction.ddl_only.failure + +feature-usage +BEGIN; +SELECT 1; +DROP INDEX t_b_idx; +COMMIT; +---- +sql.schema.change_in_explicit_txn +sql.schema.drop_index +sql.schema.schema_changer_mode.legacy +sql.schema.transaction.mixed_ddl_dml.success + +feature-usage +BEGIN; +CREATE INDEX ON t(b); +COMMIT; +---- +sql.schema.change_in_explicit_txn +sql.schema.create_index +sql.schema.schema_changer_mode.legacy +sql.schema.transaction.ddl_only.success From 5debfd709b84acb1ece5d809e8dd79d0940c9b11 Mon Sep 17 00:00:00 2001 From: Rafi Shamim Date: Mon, 18 Dec 2023 11:06:31 -0500 Subject: [PATCH 2/4] sql: skip TestExperimentalRelocateNonVoters under duress Release note: None --- pkg/sql/multitenant_admin_function_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/sql/multitenant_admin_function_test.go b/pkg/sql/multitenant_admin_function_test.go index b21b1bf09820..c71ff2db7ccd 100644 --- a/pkg/sql/multitenant_admin_function_test.go +++ b/pkg/sql/multitenant_admin_function_test.go @@ -937,6 +937,7 @@ func TestRelocateNonVoters(t *testing.T) { func TestExperimentalRelocateNonVoters(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) + skip.UnderDuress(t, "test flakes in slow builds; see #108081") testCases := []testCase{ { From 26544d3d2d7400958e376cfbcfbd6d95eda51e9c Mon Sep 17 00:00:00 2001 From: DarrylWong Date: Mon, 18 Dec 2023 13:37:35 -0500 Subject: [PATCH 3/4] roachtest: specify port for post test timeseries collection After #114097, we need to specify ports for cockroach node connections. This one was missed and timeseries were not being collected in test failures. Release note: none Epic: none --- pkg/cmd/roachtest/cluster.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index ac0af70a7b14..861bf7a11a03 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1240,7 +1240,7 @@ func (c *clusterImpl) FetchTimeseriesData(ctx context.Context, l *logger.Logger) sec = fmt.Sprintf("--certs-dir=%s", certs) } if err := c.RunE( - ctx, c.Node(node), fmt.Sprintf("%s debug tsdump %s --format=raw > tsdump.gob", test.DefaultCockroachPath, sec), + ctx, c.Node(node), fmt.Sprintf("%s debug tsdump %s --port={pgport%s} --format=raw > tsdump.gob", test.DefaultCockroachPath, sec, c.Node(node)), ); err != nil { return err } From e6752168be60ed90c96e4cd225c886e1769808da Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 12 Dec 2023 13:07:27 +0000 Subject: [PATCH 4/4] roachtest: run consistency checks after tests This patch runs replica consistency checks during post-test assertions. Previously, this only checked for stats inconsistencies, it now checks for actual data inconsistencies. This is useful to detect data corruption bugs, typically in e.g. Pebble or Raft snapshots. The consistency check runs at full speed for up to 20 minutes before timing out, and will report any inconsistencies found during a partial run. This is sufficient to check about 200 GB of data. Tests can opt out by skipping `registry.PostValidationReplicaDivergence`. Storage checkpoints are not yet taken when inconsistencies are detected, since it requires additional support in SQL builtins to take them at the same Raft log index across nodes. This will be addressed separately. Epic: none Release note: None --- pkg/cmd/roachtest/cluster.go | 4 +- pkg/cmd/roachtest/roachtestutil/BUILD.bazel | 3 + .../roachtestutil/validation_check.go | 75 +++++++++++++++---- 3 files changed, 65 insertions(+), 17 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index ac0af70a7b14..fd044b76b2ce 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1441,14 +1441,14 @@ func (c *clusterImpl) HealthStatus( } // assertConsistentReplicas fails the test if -// crdb_internal.check_consistency(true, ”, ”) indicates that any ranges' +// crdb_internal.check_consistency(false, ”, ”) indicates that any ranges' // replicas are inconsistent with each other. func (c *clusterImpl) assertConsistentReplicas( ctx context.Context, db *gosql.DB, t *testImpl, ) error { t.L().Printf("checking for replica divergence") return timeutil.RunWithTimeout( - ctx, "consistency check", 5*time.Minute, + ctx, "consistency check", 20*time.Minute, func(ctx context.Context) error { return roachtestutil.CheckReplicaDivergenceOnDB(ctx, t.L(), db) }, diff --git a/pkg/cmd/roachtest/roachtestutil/BUILD.bazel b/pkg/cmd/roachtest/roachtestutil/BUILD.bazel index 1ced98458b58..848fea077ae0 100644 --- a/pkg/cmd/roachtest/roachtestutil/BUILD.bazel +++ b/pkg/cmd/roachtest/roachtestutil/BUILD.bazel @@ -16,13 +16,16 @@ go_library( "//pkg/cmd/roachtest/cluster", "//pkg/cmd/roachtest/option", "//pkg/cmd/roachtest/test", + "//pkg/kv/kvpb", "//pkg/roachprod/config", "//pkg/roachprod/install", "//pkg/roachprod/logger", "//pkg/testutils/sqlutils", "//pkg/util/humanizeutil", + "//pkg/util/log", "//pkg/util/timeutil", "@com_github_cockroachdb_errors//:errors", + "@com_github_cockroachdb_redact//:redact", ], ) diff --git a/pkg/cmd/roachtest/roachtestutil/validation_check.go b/pkg/cmd/roachtest/roachtestutil/validation_check.go index 49e3b6eb8884..f245e936648b 100644 --- a/pkg/cmd/roachtest/roachtestutil/validation_check.go +++ b/pkg/cmd/roachtest/roachtestutil/validation_check.go @@ -14,30 +14,44 @@ package roachtestutil import ( "context" gosql "database/sql" + "fmt" "time" + "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/roachprod/logger" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" + "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" + "github.com/cockroachdb/redact" ) -// CheckReplicaDivergenceOnDB runs a stats-only consistency check via the -// provided DB. It ignores transient errors that can result from the -// implementation of crdb_internal.check_consistency, so a nil result -// does not prove anything. +// CheckReplicaDivergenceOnDB runs a consistency check via the provided DB. It +// ignores transient errors that can result from the implementation of +// crdb_internal.check_consistency, so a nil result does not prove anything. +// +// The consistency check may not get enough time to complete, but will return +// any inconsistencies that it did find before timing out. func CheckReplicaDivergenceOnDB(ctx context.Context, l *logger.Logger, db *gosql.DB) error { - // NB: we set a statement_timeout since context cancellation won't work here, - // see: + // Speed up consistency checks. The test is done, so let's go full throttle. + _, err := db.ExecContext(ctx, "SET CLUSTER SETTING server.consistency_check.max_rate = '1GB'") + if err != nil { + return err + } + + // NB: we set a statement_timeout since context cancellation won't work here. + // We've seen the consistency checks hang indefinitely in some cases. // https://github.com/cockroachdb/cockroach/pull/34520 // - // We've seen the consistency checks hang indefinitely in some cases. + // TODO(erikgrinaker): avoid result set buffering. We seem to be receiving + // results in batches of 64 rows, regardless of results_buffer_size or the + // row size (e.g. with 16 KB ballast per row). Not clear where this buffering + // is happening or how to disable it. + started := timeutil.Now() rows, err := db.QueryContext(ctx, ` -SET statement_timeout = '5m'; +SET statement_timeout = '20m'; SELECT t.range_id, t.start_key_pretty, t.status, t.detail -FROM -crdb_internal.check_consistency(true, '', '') as t -WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`) +FROM crdb_internal.check_consistency(false, '', '') as t;`) if err != nil { // TODO(tbg): the checks can fail for silly reasons like missing gossiped // descriptors, etc. -- not worth failing the test for. Ideally this would @@ -46,20 +60,51 @@ WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`) return nil } defer rows.Close() + + logEvery := log.Every(time.Minute) + logEvery.ShouldLog() // don't immediately log + + const maxReport = 10 // max number of inconsistencies to report var finalErr error + var ranges, inconsistent int for rows.Next() { var rangeID int32 var prettyKey, status, detail string if scanErr := rows.Scan(&rangeID, &prettyKey, &status, &detail); scanErr != nil { l.Printf("consistency check failed with %v; ignoring", scanErr) - return nil + return finalErr // return partial finalErr + } + // Only detect replica inconsistencies, and ignore MVCC stats mismatches + // since these can happen in rare cases due to lease requests not respecting + // latches: https://github.com/cockroachdb/cockroach/issues/93896 + // + // TODO(erikgrinaker): We should take storage checkpoints for inconsistent + // ranges as well, up to maxReport. This requires support in + // check_consistency() such that we take the checkpoints at the same Raft + // log index across nodes. + if status == kvpb.CheckConsistencyResponse_RANGE_INCONSISTENT.String() { + inconsistent++ + msg := fmt.Sprintf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail) + l.Printf(msg) + if inconsistent <= maxReport { + finalErr = errors.CombineErrors(finalErr, errors.Newf("%s", redact.SafeString(msg))) + } else if inconsistent == maxReport+1 { + finalErr = errors.CombineErrors(finalErr, + errors.Newf("max number of inconsistencies %d exceeded", maxReport)) + } + } + + ranges++ + if logEvery.ShouldLog() { + l.Printf("consistency checked %d ranges (at key %s)", ranges, prettyKey) } - finalErr = errors.CombineErrors(finalErr, - errors.Newf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail)) } + l.Printf("consistency checked %d ranges in %s, found %d inconsistent ranges", + ranges, timeutil.Since(started).Round(time.Second), inconsistent) + if err := rows.Err(); err != nil { l.Printf("consistency check failed with %v; ignoring", err) - return nil + return finalErr // return partial finalErr } return finalErr }