Skip to content

Commit

Permalink
roachtest: run consistency checks after tests
Browse files Browse the repository at this point in the history
This patch runs replica consistency checks during post-test assertions.
Previously, this only checked for stats inconsistencies, it now checks
for actual data inconsistencies. This is useful to detect data
corruption bugs, typically in e.g. Pebble or Raft snapshots.

The consistency check runs at full speed for up to 20 minutes before
timing out, and will report any inconsistencies found during a partial
run. This is sufficient to check about 200 GB of data. Tests can opt out
by skipping `registry.PostValidationReplicaDivergence`.

Storage checkpoints are not yet taken when inconsistencies are detected,
since it requires additional support in SQL builtins to take them at
the same Raft log index across nodes. This will be addressed separately.

Epic: none
Release note: None
  • Loading branch information
erikgrinaker committed Dec 18, 2023
1 parent 6fae6a7 commit e675216
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 17 deletions.
4 changes: 2 additions & 2 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1441,14 +1441,14 @@ func (c *clusterImpl) HealthStatus(
}

// assertConsistentReplicas fails the test if
// crdb_internal.check_consistency(true, ”, ”) indicates that any ranges'
// crdb_internal.check_consistency(false, ”, ”) indicates that any ranges'
// replicas are inconsistent with each other.
func (c *clusterImpl) assertConsistentReplicas(
ctx context.Context, db *gosql.DB, t *testImpl,
) error {
t.L().Printf("checking for replica divergence")
return timeutil.RunWithTimeout(
ctx, "consistency check", 5*time.Minute,
ctx, "consistency check", 20*time.Minute,
func(ctx context.Context) error {
return roachtestutil.CheckReplicaDivergenceOnDB(ctx, t.L(), db)
},
Expand Down
3 changes: 3 additions & 0 deletions pkg/cmd/roachtest/roachtestutil/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,16 @@ go_library(
"//pkg/cmd/roachtest/cluster",
"//pkg/cmd/roachtest/option",
"//pkg/cmd/roachtest/test",
"//pkg/kv/kvpb",
"//pkg/roachprod/config",
"//pkg/roachprod/install",
"//pkg/roachprod/logger",
"//pkg/testutils/sqlutils",
"//pkg/util/humanizeutil",
"//pkg/util/log",
"//pkg/util/timeutil",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_redact//:redact",
],
)

Expand Down
75 changes: 60 additions & 15 deletions pkg/cmd/roachtest/roachtestutil/validation_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,30 +14,44 @@ package roachtestutil
import (
"context"
gosql "database/sql"
"fmt"
"time"

"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/redact"
)

// CheckReplicaDivergenceOnDB runs a stats-only consistency check via the
// provided DB. It ignores transient errors that can result from the
// implementation of crdb_internal.check_consistency, so a nil result
// does not prove anything.
// CheckReplicaDivergenceOnDB runs a consistency check via the provided DB. It
// ignores transient errors that can result from the implementation of
// crdb_internal.check_consistency, so a nil result does not prove anything.
//
// The consistency check may not get enough time to complete, but will return
// any inconsistencies that it did find before timing out.
func CheckReplicaDivergenceOnDB(ctx context.Context, l *logger.Logger, db *gosql.DB) error {
// NB: we set a statement_timeout since context cancellation won't work here,
// see:
// Speed up consistency checks. The test is done, so let's go full throttle.
_, err := db.ExecContext(ctx, "SET CLUSTER SETTING server.consistency_check.max_rate = '1GB'")
if err != nil {
return err
}

// NB: we set a statement_timeout since context cancellation won't work here.
// We've seen the consistency checks hang indefinitely in some cases.
// https://github.com/cockroachdb/cockroach/pull/34520
//
// We've seen the consistency checks hang indefinitely in some cases.
// TODO(erikgrinaker): avoid result set buffering. We seem to be receiving
// results in batches of 64 rows, regardless of results_buffer_size or the
// row size (e.g. with 16 KB ballast per row). Not clear where this buffering
// is happening or how to disable it.
started := timeutil.Now()
rows, err := db.QueryContext(ctx, `
SET statement_timeout = '5m';
SET statement_timeout = '20m';
SELECT t.range_id, t.start_key_pretty, t.status, t.detail
FROM
crdb_internal.check_consistency(true, '', '') as t
WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)
FROM crdb_internal.check_consistency(false, '', '') as t;`)
if err != nil {
// TODO(tbg): the checks can fail for silly reasons like missing gossiped
// descriptors, etc. -- not worth failing the test for. Ideally this would
Expand All @@ -46,20 +60,51 @@ WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`)
return nil
}
defer rows.Close()

logEvery := log.Every(time.Minute)
logEvery.ShouldLog() // don't immediately log

const maxReport = 10 // max number of inconsistencies to report
var finalErr error
var ranges, inconsistent int
for rows.Next() {
var rangeID int32
var prettyKey, status, detail string
if scanErr := rows.Scan(&rangeID, &prettyKey, &status, &detail); scanErr != nil {
l.Printf("consistency check failed with %v; ignoring", scanErr)
return nil
return finalErr // return partial finalErr
}
// Only detect replica inconsistencies, and ignore MVCC stats mismatches
// since these can happen in rare cases due to lease requests not respecting
// latches: https://github.com/cockroachdb/cockroach/issues/93896
//
// TODO(erikgrinaker): We should take storage checkpoints for inconsistent
// ranges as well, up to maxReport. This requires support in
// check_consistency() such that we take the checkpoints at the same Raft
// log index across nodes.
if status == kvpb.CheckConsistencyResponse_RANGE_INCONSISTENT.String() {
inconsistent++
msg := fmt.Sprintf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail)
l.Printf(msg)
if inconsistent <= maxReport {
finalErr = errors.CombineErrors(finalErr, errors.Newf("%s", redact.SafeString(msg)))
} else if inconsistent == maxReport+1 {
finalErr = errors.CombineErrors(finalErr,
errors.Newf("max number of inconsistencies %d exceeded", maxReport))
}
}

ranges++
if logEvery.ShouldLog() {
l.Printf("consistency checked %d ranges (at key %s)", ranges, prettyKey)
}
finalErr = errors.CombineErrors(finalErr,
errors.Newf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail))
}
l.Printf("consistency checked %d ranges in %s, found %d inconsistent ranges",
ranges, timeutil.Since(started).Round(time.Second), inconsistent)

if err := rows.Err(); err != nil {
l.Printf("consistency check failed with %v; ignoring", err)
return nil
return finalErr // return partial finalErr
}
return finalErr
}
Expand Down

0 comments on commit e675216

Please sign in to comment.