Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
116218: roachtest: run consistency checks after tests r=erikgrinaker a=erikgrinaker

This patch runs replica consistency checks during post-test assertions. Previously, this only checked for stats inconsistencies, it now checks for actual data inconsistencies. This is useful to detect data corruption bugs, typically in e.g. Pebble or Raft snapshots.

The consistency check runs at full speed for up to 20 minutes before timing out, and will report any inconsistencies found during a partial run. This is sufficient to check about 200 GB of data. Tests can opt out by skipping `registry.PostValidationReplicaDivergence`.

Storage checkpoints are not yet taken when inconsistencies are detected, since it requires additional support in SQL builtins to take them at the same Raft log index across nodes. This will be addressed separately.

Touches #115770.
Epic: none
Release note: None

116614: sql: add telemetry for mixed DDL/DML transactions r=rafiss a=rafiss

This patch adds feature counter telemetry for explicit transactions that have schema changes. We track if a transaction has DDL only or a mixture of DDL and DML, and if it succeeded or failed.

fixes #115012
Release note: None

116672: sql: skip TestExperimentalRelocateNonVoters under duress r=rafiss a=rafiss

fixes #115935
Release note: None

116695: roachtest: specify port for post test timeseries collection r=renatolabs a=DarrylWong

After #114097, we need to specify ports for cockroach node connections. This one was missed and timeseries were not being collected in test failures.

Release note: none
Epic: none

Co-authored-by: Erik Grinaker <[email protected]>
Co-authored-by: Rafi Shamim <[email protected]>
Co-authored-by: DarrylWong <[email protected]>
  • Loading branch information
4 people committed Dec 18, 2023
5 parents 1f6a6f1 + e675216 + fcc4454 + 5debfd7 + 26544d3 commit fec5213
Show file tree
Hide file tree
Showing 7 changed files with 157 additions and 19 deletions.
6 changes: 3 additions & 3 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1240,7 +1240,7 @@ func (c *clusterImpl) FetchTimeseriesData(ctx context.Context, l *logger.Logger)
sec = fmt.Sprintf("--certs-dir=%s", certs)
}
if err := c.RunE(
ctx, c.Node(node), fmt.Sprintf("%s debug tsdump %s --format=raw > tsdump.gob", test.DefaultCockroachPath, sec),
ctx, c.Node(node), fmt.Sprintf("%s debug tsdump %s --port={pgport%s} --format=raw > tsdump.gob", test.DefaultCockroachPath, sec, c.Node(node)),
); err != nil {
return err
}
Expand Down Expand Up @@ -1441,14 +1441,14 @@ func (c *clusterImpl) HealthStatus(
}

// assertConsistentReplicas fails the test if
// crdb_internal.check_consistency(true, ”, ”) indicates that any ranges'
// crdb_internal.check_consistency(false, ”, ”) indicates that any ranges'
// replicas are inconsistent with each other.
func (c *clusterImpl) assertConsistentReplicas(
ctx context.Context, db *gosql.DB, t *testImpl,
) error {
t.L().Printf("checking for replica divergence")
return timeutil.RunWithTimeout(
ctx, "consistency check", 5*time.Minute,
ctx, "consistency check", 20*time.Minute,
func(ctx context.Context) error {
return roachtestutil.CheckReplicaDivergenceOnDB(ctx, t.L(), db)
},
Expand Down
3 changes: 3 additions & 0 deletions pkg/cmd/roachtest/roachtestutil/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,16 @@ go_library(
"//pkg/cmd/roachtest/cluster",
"//pkg/cmd/roachtest/option",
"//pkg/cmd/roachtest/test",
"//pkg/kv/kvpb",
"//pkg/roachprod/config",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/testutils/sqlutils",
"//pkg/util/humanizeutil",
"//pkg/util/log",
"//pkg/util/timeutil",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_redact//:redact",
],
)

Expand Down
75 changes: 60 additions & 15 deletions pkg/cmd/roachtest/roachtestutil/validation_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,44 @@ package roachtestutil
import (
"context"
gosql "database/sql"
"fmt"
"time"

"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)

// CheckReplicaDivergenceOnDB runs a stats-only consistency check via the
// provided DB. It ignores transient errors that can result from the
// implementation of crdb_internal.check_consistency, so a nil result
// does not prove anything.
// CheckReplicaDivergenceOnDB runs a consistency check via the provided DB. It
// ignores transient errors that can result from the implementation of
// crdb_internal.check_consistency, so a nil result does not prove anything.
//
// The consistency check may not get enough time to complete, but will return
// any inconsistencies that it did find before timing out.
func CheckReplicaDivergenceOnDB(ctx context.Context, l *logger.Logger, db *gosql.DB) error {
// NB: we set a statement_timeout since context cancellation won't work here,
// see:
// Speed up consistency checks. The test is done, so let's go full throttle.
_, err := db.ExecContext(ctx, "SET CLUSTER SETTING server.consistency_check.max_rate = '1GB'")
if err != nil {
return err
}

// NB: we set a statement_timeout since context cancellation won't work here.
// We've seen the consistency checks hang indefinitely in some cases.
// https://github.com/cockroachdb/cockroach/pull/34520
//
// We've seen the consistency checks hang indefinitely in some cases.
// TODO(erikgrinaker): avoid result set buffering. We seem to be receiving
// results in batches of 64 rows, regardless of results_buffer_size or the
// row size (e.g. with 16 KB ballast per row). Not clear where this buffering
// is happening or how to disable it.
started := timeutil.Now()
rows, err := db.QueryContext(ctx, `
SET statement_timeout = '5m';
SET statement_timeout = '20m';
SELECT t.range_id, t.start_key_pretty, t.status, t.detail
FROM
crdb_internal.check_consistency(true, '', '') as t
WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)
FROM crdb_internal.check_consistency(false, '', '') as t;`)
if err != nil {
// TODO(tbg): the checks can fail for silly reasons like missing gossiped
// descriptors, etc. -- not worth failing the test for. Ideally this would
Expand All @@ -46,20 +60,51 @@ WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)
return nil
}
defer rows.Close()

logEvery := log.Every(time.Minute)
logEvery.ShouldLog() // don't immediately log

const maxReport = 10 // max number of inconsistencies to report
var finalErr error
var ranges, inconsistent int
for rows.Next() {
var rangeID int32
var prettyKey, status, detail string
if scanErr := rows.Scan(&rangeID, &prettyKey, &status, &detail); scanErr != nil {
l.Printf("consistency check failed with %v; ignoring", scanErr)
return nil
return finalErr // return partial finalErr
}
// Only detect replica inconsistencies, and ignore MVCC stats mismatches
// since these can happen in rare cases due to lease requests not respecting
// latches: https://github.com/cockroachdb/cockroach/issues/93896
//
// TODO(erikgrinaker): We should take storage checkpoints for inconsistent
// ranges as well, up to maxReport. This requires support in
// check_consistency() such that we take the checkpoints at the same Raft
// log index across nodes.
if status == kvpb.CheckConsistencyResponse_RANGE_INCONSISTENT.String() {
inconsistent++
msg := fmt.Sprintf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail)
l.Printf(msg)
if inconsistent <= maxReport {
finalErr = errors.CombineErrors(finalErr, errors.Newf("%s", redact.SafeString(msg)))
} else if inconsistent == maxReport+1 {
finalErr = errors.CombineErrors(finalErr,
errors.Newf("max number of inconsistencies %d exceeded", maxReport))
}
}

ranges++
if logEvery.ShouldLog() {
l.Printf("consistency checked %d ranges (at key %s)", ranges, prettyKey)
}
finalErr = errors.CombineErrors(finalErr,
errors.Newf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail))
}
l.Printf("consistency checked %d ranges in %s, found %d inconsistent ranges",
ranges, timeutil.Since(started).Round(time.Second), inconsistent)

if err := rows.Err(); err != nil {
l.Printf("consistency check failed with %v; ignoring", err)
return nil
return finalErr // return partial finalErr
}
return finalErr
}
Expand Down
31 changes: 30 additions & 1 deletion pkg/sql/conn_executor_exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -1445,10 +1445,15 @@ func (ex *connExecutor) reportSessionDataChanges(fn func() error) error {
return nil
}

func (ex *connExecutor) commitSQLTransactionInternal(ctx context.Context) error {
func (ex *connExecutor) commitSQLTransactionInternal(ctx context.Context) (retErr error) {
ctx, sp := tracing.EnsureChildSpan(ctx, ex.server.cfg.AmbientCtx.Tracer, "commit sql txn")
defer sp.Finish()

defer func() {
failed := retErr != nil
ex.recordDDLTxnTelemetry(failed)
}()

if err := ex.extraTxnState.sqlCursors.closeAll(true /* errorOnWithHold */); err != nil {
return err
}
Expand Down Expand Up @@ -1525,6 +1530,29 @@ func (ex *connExecutor) commitSQLTransactionInternal(ctx context.Context) error
return nil
}

// recordDDLTxnTelemetry records telemetry for explicit transactions that
// contain DDL.
func (ex *connExecutor) recordDDLTxnTelemetry(failed bool) {
numDDL, numStmts := ex.extraTxnState.numDDL, ex.state.mu.stmtCount
if numDDL == 0 || ex.implicitTxn() {
return
}
// Subtract 1 statement so the COMMIT/ROLLBACK is not counted.
if numDDL == numStmts-1 {
if failed {
telemetry.Inc(sqltelemetry.DDLOnlyTransactionFailureCounter)
} else {
telemetry.Inc(sqltelemetry.DDLOnlyTransactionSuccessCounter)
}
} else /* numDDL != numStmts-1 */ {
if failed {
telemetry.Inc(sqltelemetry.MixedDDLDMLTransactionFailureCounter)
} else {
telemetry.Inc(sqltelemetry.MixedDDLDMLTransactionSuccessCounter)
}
}
}

// createJobs creates jobs for the records cached in schemaChangeJobRecords
// during this transaction.
func (ex *connExecutor) createJobs(ctx context.Context) error {
Expand Down Expand Up @@ -1558,6 +1586,7 @@ func (ex *connExecutor) rollbackSQLTransaction(
}

ex.extraTxnState.prepStmtsNamespace.closeAllPortals(ctx, &ex.extraTxnState.prepStmtsNamespaceMemAcc)
ex.recordDDLTxnTelemetry(true /* failed */)

if err := ex.state.mu.txn.Rollback(ctx); err != nil {
log.Warningf(ctx, "txn rollback failed: %s", err)
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/multitenant_admin_function_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -937,6 +937,7 @@ func TestRelocateNonVoters(t *testing.T) {
func TestExperimentalRelocateNonVoters(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
skip.UnderDuress(t, "test flakes in slow builds; see #108081")

testCases := []testCase{
{
Expand Down
16 changes: 16 additions & 0 deletions pkg/sql/sqltelemetry/schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,19 @@ var DeclarativeSchemaChangerCounter = telemetry.GetCounterOnce("sql.schema.schem
// LegacySchemaChangerCounter is incremented whenever the legacy schema changer
// is used.
var LegacySchemaChangerCounter = telemetry.GetCounterOnce("sql.schema.schema_changer_mode.legacy")

// MixedDDLDMLTransactionSuccessCounter is incremented whenever an explicit
// transaction that has both DDL and DML statements succeeds.
var MixedDDLDMLTransactionSuccessCounter = telemetry.GetCounterOnce("sql.schema.transaction.mixed_ddl_dml.success")

// MixedDDLDMLTransactionFailureCounter is incremented whenever an explicit
// transaction that has both DDL and DML statements fails.
var MixedDDLDMLTransactionFailureCounter = telemetry.GetCounterOnce("sql.schema.transaction.mixed_ddl_dml.failure")

// DDLOnlyTransactionSuccessCounter is incremented whenever an explicit
// transaction that has only DDL statements succeeds.
var DDLOnlyTransactionSuccessCounter = telemetry.GetCounterOnce("sql.schema.transaction.ddl_only.success")

// DDLOnlyTransactionFailureCounter is incremented whenever an explicit
// transaction that has only DDL statements fails.
var DDLOnlyTransactionFailureCounter = telemetry.GetCounterOnce("sql.schema.transaction.ddl_only.failure")
44 changes: 44 additions & 0 deletions pkg/sql/testdata/telemetry/schema
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,47 @@ CREATE TYPE composite_typ AS (a int, b int)
----
sql.schema.create_type
sql.schema.schema_changer_mode.legacy

exec
BEGIN;
SELECT 1;
DROP INDEX t_b_idx;
SELECT 1/0;
----
error: pq: division by zero

feature-usage
ROLLBACK;
----
sql.schema.transaction.mixed_ddl_dml.failure

exec
BEGIN;
DROP INDEX t_b_idx;
----

feature-usage
ROLLBACK;
----
sql.schema.transaction.ddl_only.failure

feature-usage
BEGIN;
SELECT 1;
DROP INDEX t_b_idx;
COMMIT;
----
sql.schema.change_in_explicit_txn
sql.schema.drop_index
sql.schema.schema_changer_mode.legacy
sql.schema.transaction.mixed_ddl_dml.success

feature-usage
BEGIN;
CREATE INDEX ON t(b);
COMMIT;
----
sql.schema.change_in_explicit_txn
sql.schema.create_index
sql.schema.schema_changer_mode.legacy
sql.schema.transaction.ddl_only.success

0 comments on commit fec5213

Please sign in to comment.