From ecaf9c8fd4c7af5f7703f98868e4f7271613212b Mon Sep 17 00:00:00 2001 From: Tobias Grieger Date: Tue, 30 Aug 2022 13:54:18 +0200 Subject: [PATCH] roachtest: move CheckReplicaDivergenceOnDB to util This had no business sitting on `Cluster`. Release note: None --- pkg/BUILD.bazel | 2 + pkg/cmd/roachtest/BUILD.bazel | 1 + pkg/cmd/roachtest/cluster.go | 48 +------------- .../roachtest/cluster/cluster_interface.go | 1 - pkg/cmd/roachtest/roachtestutil/BUILD.bazel | 15 +++++ .../roachtestutil/consistency_check.go | 62 +++++++++++++++++++ pkg/cmd/roachtest/tests/BUILD.bazel | 1 + pkg/cmd/roachtest/tests/version.go | 3 +- 8 files changed, 85 insertions(+), 48 deletions(-) create mode 100644 pkg/cmd/roachtest/roachtestutil/BUILD.bazel create mode 100644 pkg/cmd/roachtest/roachtestutil/consistency_check.go diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index 1cb3bae0d43f..ea9fb5b9660f 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -939,6 +939,7 @@ GO_TARGETS = [ "//pkg/cmd/roachtest/clusterstats:clusterstats_test", "//pkg/cmd/roachtest/option:option", "//pkg/cmd/roachtest/registry:registry", + "//pkg/cmd/roachtest/roachtestutil:roachtestutil", "//pkg/cmd/roachtest/spec:spec", "//pkg/cmd/roachtest/test:test", "//pkg/cmd/roachtest/tests:tests", @@ -2296,6 +2297,7 @@ GET_X_DATA_TARGETS = [ "//pkg/cmd/roachtest/clusterstats:get_x_data", "//pkg/cmd/roachtest/option:get_x_data", "//pkg/cmd/roachtest/registry:get_x_data", + "//pkg/cmd/roachtest/roachtestutil:get_x_data", "//pkg/cmd/roachtest/spec:get_x_data", "//pkg/cmd/roachtest/test:get_x_data", "//pkg/cmd/roachtest/tests:get_x_data", diff --git a/pkg/cmd/roachtest/BUILD.bazel b/pkg/cmd/roachtest/BUILD.bazel index e240bfa6e703..e4ea11c28f6a 100644 --- a/pkg/cmd/roachtest/BUILD.bazel +++ b/pkg/cmd/roachtest/BUILD.bazel @@ -21,6 +21,7 @@ go_library( "//pkg/cmd/roachtest/cluster", "//pkg/cmd/roachtest/option", "//pkg/cmd/roachtest/registry", + "//pkg/cmd/roachtest/roachtestutil", "//pkg/cmd/roachtest/spec", "//pkg/cmd/roachtest/test", "//pkg/cmd/roachtest/tests", diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index 3494afd1b8b2..e192108cf2d3 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -35,6 +35,7 @@ import ( "github.com/armon/circbuf" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod" @@ -1303,51 +1304,6 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) { } } -// CheckReplicaDivergenceOnDB runs a fast consistency check of the whole keyspace -// against the provided db. If an inconsistency is found, it returns it in the -// error. Note that this will swallow errors returned directly from the consistency -// check since we know that such spurious errors are possibly without any relation -// to the check having failed. -func (c *clusterImpl) CheckReplicaDivergenceOnDB( - ctx context.Context, l *logger.Logger, db *gosql.DB, -) error { - // NB: we set a statement_timeout since context cancellation won't work here, - // see: - // https://github.com/cockroachdb/cockroach/pull/34520 - // - // We've seen the consistency checks hang indefinitely in some cases. - rows, err := db.QueryContext(ctx, ` -SET statement_timeout = '5m'; -SELECT t.range_id, t.start_key_pretty, t.status, t.detail -FROM -crdb_internal.check_consistency(true, '', '') as t -WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`) - if err != nil { - // TODO(tbg): the checks can fail for silly reasons like missing gossiped - // descriptors, etc. -- not worth failing the test for. Ideally this would - // be rock solid. - l.Printf("consistency check failed with %v; ignoring", err) - return nil - } - defer rows.Close() - var finalErr error - for rows.Next() { - var rangeID int32 - var prettyKey, status, detail string - if scanErr := rows.Scan(&rangeID, &prettyKey, &status, &detail); scanErr != nil { - l.Printf("consistency check failed with %v; ignoring", scanErr) - return nil - } - finalErr = errors.CombineErrors(finalErr, - errors.Newf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail)) - } - if err := rows.Err(); err != nil { - l.Printf("consistency check failed with %v; ignoring", err) - return nil - } - return finalErr -} - // FailOnReplicaDivergence fails the test if // crdb_internal.check_consistency(true, '', '') indicates that any ranges' // replicas are inconsistent with each other. It uses the first node that @@ -1385,7 +1341,7 @@ func (c *clusterImpl) FailOnReplicaDivergence(ctx context.Context, t *testImpl) if err := contextutil.RunWithTimeout( ctx, "consistency check", 5*time.Minute, func(ctx context.Context) error { - return c.CheckReplicaDivergenceOnDB(ctx, t.L(), db) + return roachtestutil.CheckReplicaDivergenceOnDB(ctx, t.L(), db) }, ); err != nil { t.Errorf("consistency check failed: %v", err) diff --git a/pkg/cmd/roachtest/cluster/cluster_interface.go b/pkg/cmd/roachtest/cluster/cluster_interface.go index a1d42c28fe32..b839ab62cc6a 100644 --- a/pkg/cmd/roachtest/cluster/cluster_interface.go +++ b/pkg/cmd/roachtest/cluster/cluster_interface.go @@ -125,7 +125,6 @@ type Cluster interface { // These should be removed over time. MakeNodes(opts ...option.Option) string - CheckReplicaDivergenceOnDB(context.Context, *logger.Logger, *gosql.DB) error GitClone( ctx context.Context, l *logger.Logger, src, dest, branch string, node option.NodeListOption, ) error diff --git a/pkg/cmd/roachtest/roachtestutil/BUILD.bazel b/pkg/cmd/roachtest/roachtestutil/BUILD.bazel new file mode 100644 index 000000000000..b6c2ece14852 --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/BUILD.bazel @@ -0,0 +1,15 @@ +load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data") +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "roachtestutil", + srcs = ["consistency_check.go"], + importpath = "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil", + visibility = ["//visibility:public"], + deps = [ + "//pkg/roachprod/logger", + "@com_github_cockroachdb_errors//:errors", + ], +) + +get_x_data(name = "get_x_data") diff --git a/pkg/cmd/roachtest/roachtestutil/consistency_check.go b/pkg/cmd/roachtest/roachtestutil/consistency_check.go new file mode 100644 index 000000000000..529dbd0a9081 --- /dev/null +++ b/pkg/cmd/roachtest/roachtestutil/consistency_check.go @@ -0,0 +1,62 @@ +// Copyright 2022 The Cockroach Authors. +// +// Use of this software is governed by the Business Source License +// included in the file licenses/BSL.txt. +// +// As of the Change Date specified in that file, in accordance with +// the Business Source License, use of this software will be governed +// by the Apache License, Version 2.0, included in the file +// licenses/APL.txt. +// + +package roachtestutil + +import ( + "context" + gosql "database/sql" + + "github.com/cockroachdb/cockroach/pkg/roachprod/logger" + "github.com/cockroachdb/errors" +) + +// CheckReplicaDivergenceOnDB runs a stats-only consistency check via the +// provided DB. It ignores transient errors that can result from the +// implementation of crdb_internal.check_consistency, so a nil result +// does not prove anything. +func CheckReplicaDivergenceOnDB(ctx context.Context, l *logger.Logger, db *gosql.DB) error { + // NB: we set a statement_timeout since context cancellation won't work here, + // see: + // https://github.com/cockroachdb/cockroach/pull/34520 + // + // We've seen the consistency checks hang indefinitely in some cases. + rows, err := db.QueryContext(ctx, ` +SET statement_timeout = '5m'; +SELECT t.range_id, t.start_key_pretty, t.status, t.detail +FROM +crdb_internal.check_consistency(true, '', '') as t +WHERE t.status NOT IN ('RANGE_CONSISTENT', 'RANGE_INDETERMINATE')`) + if err != nil { + // TODO(tbg): the checks can fail for silly reasons like missing gossiped + // descriptors, etc. -- not worth failing the test for. Ideally this would + // be rock solid. + l.Printf("consistency check failed with %v; ignoring", err) + return nil + } + defer rows.Close() + var finalErr error + for rows.Next() { + var rangeID int32 + var prettyKey, status, detail string + if scanErr := rows.Scan(&rangeID, &prettyKey, &status, &detail); scanErr != nil { + l.Printf("consistency check failed with %v; ignoring", scanErr) + return nil + } + finalErr = errors.CombineErrors(finalErr, + errors.Newf("r%d (%s) is inconsistent: %s %s\n", rangeID, prettyKey, status, detail)) + } + if err := rows.Err(); err != nil { + l.Printf("consistency check failed with %v; ignoring", err) + return nil + } + return finalErr +} diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel index aa7a01985d7a..47720df33f31 100644 --- a/pkg/cmd/roachtest/tests/BUILD.bazel +++ b/pkg/cmd/roachtest/tests/BUILD.bazel @@ -166,6 +166,7 @@ go_library( "//pkg/cmd/roachtest/clusterstats", "//pkg/cmd/roachtest/option", "//pkg/cmd/roachtest/registry", + "//pkg/cmd/roachtest/roachtestutil", "//pkg/cmd/roachtest/spec", "//pkg/cmd/roachtest/test", "//pkg/gossip", diff --git a/pkg/cmd/roachtest/tests/version.go b/pkg/cmd/roachtest/tests/version.go index 5fa1cca59d3c..e79bb05eb974 100644 --- a/pkg/cmd/roachtest/tests/version.go +++ b/pkg/cmd/roachtest/tests/version.go @@ -19,6 +19,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil" "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/util/version" @@ -103,7 +104,7 @@ func registerVersion(r registry.Registry) { // // https://github.com/cockroachdb/cockroach/issues/37737#issuecomment-496026918 if !strings.HasPrefix(binaryVersion, "2.") { - if err := c.CheckReplicaDivergenceOnDB(ctx, t.L(), db); err != nil { + if err := roachtestutil.CheckReplicaDivergenceOnDB(ctx, t.L(), db); err != nil { return errors.Wrapf(err, "node %d", i) } }