diff --git a/pkg/ccl/logictestccl/BUILD.bazel b/pkg/ccl/logictestccl/BUILD.bazel index 0dcd015867e1..e1975d4b05ac 100644 --- a/pkg/ccl/logictestccl/BUILD.bazel +++ b/pkg/ccl/logictestccl/BUILD.bazel @@ -29,6 +29,7 @@ go_test( "//pkg/sql/logictest", "//pkg/testutils", "//pkg/testutils/serverutils", + "//pkg/testutils/skip", "//pkg/testutils/testcluster", "//pkg/util/leaktest", "//pkg/util/randutil", diff --git a/pkg/ccl/logictestccl/logic_test.go b/pkg/ccl/logictestccl/logic_test.go index d9031d6d79ad..c19b6cafabb6 100644 --- a/pkg/ccl/logictestccl/logic_test.go +++ b/pkg/ccl/logictestccl/logic_test.go @@ -16,6 +16,7 @@ import ( _ "github.com/cockroachdb/cockroach/pkg/ccl" "github.com/cockroachdb/cockroach/pkg/sql/logictest" "github.com/cockroachdb/cockroach/pkg/testutils" + "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/util/leaktest" ) @@ -26,6 +27,29 @@ func TestCCLLogic(t *testing.T) { logictest.RunLogicTest(t, logictest.TestServerArgs{}, testutils.TestDataPath(t, logictestGlob)) } +// TestBackupRestoreLogic runs all non-CCL logic test files under the +// 3node-backup configuration, which randomly runs a backup and restore between +// logic test statements to ensure that we can always take a backup and restore +// the data correctly. Test files that blocklist the 3node-backup configuration +// (i.e. "# LogicTest: !3node-backup") are not run. +func TestBackupRestoreLogic(t *testing.T) { + defer leaktest.AfterTest(t)() + skip.UnderStress(t, "times out under stress") + + testdataDir := "../../sql/logictest/testdata/" + if bazel.BuiltWithBazel() { + runfile, err := bazel.Runfile("pkg/sql/logictest/testdata/") + if err != nil { + t.Fatal(err) + } + testdataDir = runfile + } + logictest.RunLogicTestWithDefaultConfig(t, logictest.TestServerArgs{}, "3node-backup", + true /* runCCLConfigs */, filepath.Join(testdataDir, logictestGlob)) + logictest.RunLogicTestWithDefaultConfig(t, logictest.TestServerArgs{}, "3node-backup", + true /* runCCLConfigs */, filepath.Join("testdata/", logictestGlob)) +} + // TestTenantLogic runs all non-CCL logic test files under the 3node-tenant // configuration, which constructs a secondary tenant and runs the test within // that secondary tenant's sandbox. Test files that blocklist the 3node-tenant diff --git a/pkg/sql/logictest/BUILD.bazel b/pkg/sql/logictest/BUILD.bazel index 35fc99b74595..00ec594d4d6f 100644 --- a/pkg/sql/logictest/BUILD.bazel +++ b/pkg/sql/logictest/BUILD.bazel @@ -47,6 +47,7 @@ go_library( "//pkg/util/tracing", "@com_github_cockroachdb_errors//:errors", "@com_github_cockroachdb_errors//oserror", + "@com_github_kr_pretty//:pretty", "@com_github_lib_pq//:pq", "@com_github_pmezard_go_difflib//difflib", "@com_github_stretchr_testify//require", diff --git a/pkg/sql/logictest/logic.go b/pkg/sql/logictest/logic.go index 278c7b52187e..9e3a4cada7a6 100644 --- a/pkg/sql/logictest/logic.go +++ b/pkg/sql/logictest/logic.go @@ -73,6 +73,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/tracing" "github.com/cockroachdb/errors" "github.com/cockroachdb/errors/oserror" + "github.com/kr/pretty" "github.com/lib/pq" "github.com/pmezard/go-difflib/difflib" "github.com/stretchr/testify/require" @@ -507,6 +508,9 @@ type testClusterConfig struct { // localities is set if nodes should be set to a particular locality. // Nodes are 1-indexed. localities map[int]roachpb.Locality + // backupRestoreProbability will periodically backup the cluster and restore + // it's state to a new cluster at random points during a logic test. + backupRestoreProbability float64 // declarativeSchemaChanger determines if the declarative schema changer // is enabled. declarativeSchemaChanger bool @@ -776,6 +780,18 @@ var logicTestConfigs = []testClusterConfig{ binaryVersion: roachpb.Version{Major: 22, Minor: 1}, disableUpgrade: true, }, + { + // 3node-backup is a config that periodically performs a cluster backup, + // and restores that backup into a new cluster before continuing the test. + // This config can only be run with a CCL binary, so is a noop if run + // through the normal logictest command. + // To run a logic test with this config as a directive, run: + // make test PKG=./pkg/ccl/logictestccl TESTS=TestBackupRestoreLogic// + name: "3node-backup", + numNodes: 3, + backupRestoreProbability: envutil.EnvOrDefaultFloat64("COCKROACH_LOGIC_TEST_BACKUP_RESTORE_PROBABILITY", 0.0), + isCCLConfig: true, + }, } // An index in the above slice. @@ -1201,6 +1217,16 @@ type logicTest struct { subtestT *testing.T rng *rand.Rand cfg testClusterConfig + // serverArgs are the parameters used to create a cluster for this test. + // They are persisted since a cluster can be recreated throughout the + // lifetime of the test and we should create all clusters with the same + // arguments. + serverArgs *TestServerArgs + // clusterOpts are the options used to create a cluster for this test. + // They are persisted since a cluster can be recreated throughout the + // lifetime of the test and we should create all clusters with the same + // arguments. + clusterOpts []clusterOpt // cluster is the test cluster against which we are testing. This cluster // may be reset during the lifetime of the test. cluster serverutils.TestClusterInterface @@ -1737,12 +1763,26 @@ func (t *logicTest) shutdownCluster() { t.db = nil } +// resetCluster cleans up the current cluster, and creates a fresh one. +func (t *logicTest) resetCluster() { + t.shutdownCluster() + if t.serverArgs == nil { + // We expect the server args to be persisted to the test during test + // setup. + t.Fatal("resetting the cluster before server args were set") + } + serverArgs := *t.serverArgs + t.newCluster(serverArgs, t.clusterOpts) +} + // setup creates the initial cluster for the logic test and populates the // relevant fields on logicTest. It is expected to be called only once (per test // file), and before processing any test files - unless a mock logicTest is // created (see parallelTest.processTestFile). func (t *logicTest) setup(cfg testClusterConfig, serverArgs TestServerArgs, opts []clusterOpt) { t.cfg = cfg + t.serverArgs = &serverArgs + copy(t.clusterOpts, opts) // TODO(pmattis): Add a flag to make it easy to run the tests against a local // MySQL or Postgres instance. tempExternalIODir, tempExternalIODirCleanup := testutils.TempDir(t.rootT) @@ -2005,6 +2045,9 @@ type subtestDetails struct { } func (t *logicTest) processTestFile(path string, config testClusterConfig) error { + rng, seed := randutil.NewPseudoRand() + t.outf("rng seed: %d\n", seed) + subtests, err := fetchSubtests(path) if err != nil { return err @@ -2022,7 +2065,7 @@ func (t *logicTest) processTestFile(path string, config testClusterConfig) error // If subtest has no name, then it is not a subtest, so just run the lines // in the overall test. Note that this can only happen in the first subtest. if len(subtest.name) == 0 { - if err := t.processSubtest(subtest, path); err != nil { + if err := t.processSubtest(subtest, path, config, rng); err != nil { return err } } else { @@ -2032,7 +2075,7 @@ func (t *logicTest) processTestFile(path string, config testClusterConfig) error defer func() { t.subtestT = nil }() - if err := t.processSubtest(subtest, path); err != nil { + if err := t.processSubtest(subtest, path, config, rng); err != nil { t.Error(err) } }) @@ -2058,6 +2101,153 @@ func (t *logicTest) processTestFile(path string, config testClusterConfig) error return nil } +func (t *logicTest) hasOpenTxns() bool { + for _, user := range t.clients { + if _, err := user.Exec("SET TRANSACTION PRIORITY NORMAL;"); !testutils.IsError(err, "there is no transaction in progress") { + return true + } + } + return false +} + +// maybeBackupRestore will randomly issue a cluster backup, create a new +// cluster, and restore that backup to the cluster before continuing the test. +// The probability of executing a backup and restore is specified in the +// testClusterConfig. +func (t *logicTest) maybeBackupRestore(rng *rand.Rand, config testClusterConfig) error { + if config.backupRestoreProbability != 0 && !config.isCCLConfig { + return errors.Newf("logic test config %s specifies a backup restore probability but is not CCL", + config.name) + } + + // We decide if we want to take a backup here based on a probability + // specified in the logic test config. + if rng.Float64() > config.backupRestoreProbability { + return nil + } + + // Check if any users have open transactions in the logictest. If they do, we + // do not want to teardown the cluster and create a new one as it might + // interfere with what the logictest is trying to test. + // + // We could perhaps make this smarter and perform the backup after the + // transaction is close. + if t.hasOpenTxns() { + return nil + } + + oldUser := t.user + defer func() { + t.setUser(oldUser, 0 /* nodeIdxOverride */) + }() + + // To restore the same state in for the logic test, we need to restore the + // data and the session state. The session state includes things like session + // variables that are set for every session that is open. + // + // TODO(adityamaru): A better approach might be to wipe the cluster once we + // have a command that enables this. That way all of the session data will not + // be lost in the process of creating a new cluster. + users := make([]string, 0, len(t.clients)) + userToHexSession := make(map[string]string, len(t.clients)) + userToSessionVars := make(map[string]map[string]string, len(t.clients)) + for user := range t.clients { + t.setUser(user, 0 /* nodeIdxOverride */) + users = append(users, user) + + // Serialize session variables. + var userSession string + var err error + if err = t.db.QueryRow(`SELECT encode(crdb_internal.serialize_session(), 'hex')`).Scan(&userSession); err == nil { + userToHexSession[user] = userSession + continue + } + log.Warningf(context.Background(), "failed to serialize session: %+v", err) + + // If we failed to serialize the session variables, lets save the output of + // `SHOW ALL`. This usually happens if the session contains prepared + // statements or portals that cause the `serialize_session()` to fail. + // + // Saving the session variables in this manner does not guarantee the test + // will succeed since there are no ordering semantics when we go to apply + // them. There are some session variables that need to be applied before + // others for them to be valid. Thus, it is strictly better to use + // `serialize/deserialize_session()`, this "hack" just gives the test one + // more chance to succeed. + userSessionVars := make(map[string]string) + existingSessionVars, err := t.db.Query("SHOW ALL") + if err != nil { + // We might get an error depending on the state of the test. Let's just + // ignore it. + // BEFOREMERGE: Add a comment here about when this is the case. + continue + } + for existingSessionVars.Next() { + var key, value string + if err := existingSessionVars.Scan(&key, &value); err != nil { + return errors.Wrap(err, "scanning session variables") + } + userSessionVars[key] = value + } + userToSessionVars[user] = userSessionVars + } + + backupLocation := fmt.Sprintf("nodelocal://1/logic-test-backup-%s", + strconv.FormatInt(timeutil.Now().UnixNano(), 10)) + + // Perform the backup and restore as root. + t.setUser(security.RootUser, 0 /* nodeIdxOverride */) + + if _, err := t.db.Exec(fmt.Sprintf("BACKUP TO '%s'", backupLocation)); err != nil { + return errors.Wrap(err, "backing up cluster") + } + + // Create a new cluster. Perhaps this can be updated to just wipe the exiting + // cluster once we have the ability to easily wipe a cluster through SQL. + t.resetCluster() + + // Run the restore as root. + t.setUser(security.RootUser, 0 /* nodeIdxOverride */) + if _, err := t.db.Exec(fmt.Sprintf("RESTORE FROM '%s'", backupLocation)); err != nil { + return errors.Wrap(err, "restoring cluster") + } + + // Restore the session state that was in the old cluster. + + // Create new connections for the existing users, and restore the session + // variables that we collected. + for _, user := range users { + // Call setUser for every user to create the connection for that user. + t.setUser(user, 0 /* nodeIdxOverride */) + + if userSession, ok := userToHexSession[user]; ok { + if _, err := t.db.Exec(fmt.Sprintf(`SELECT crdb_internal.deserialize_session(decode('%s', 'hex'))`, userSession)); err != nil { + return errors.Wrapf(err, "deserializing session") + } + } else if vars, ok := userToSessionVars[user]; ok { + // We now attempt to restore the session variables that were set on the + // backing up cluster. These are not included in the backup restore and so + // have to be restored manually. + for key, value := range vars { + // First try setting the cluster setting as a string. + if _, err := t.db.Exec(fmt.Sprintf("SET %s='%s'", key, value)); err != nil { + // If it fails, try setting the value as an int. + log.Infof(context.Background(), "this is the first error %# v", pretty.Formatter(err)) + if _, err := t.db.Exec(fmt.Sprintf("SET %s=%s", key, value)); err != nil { + // Some cluster settings can't be set at all, so ignore these errors. + // If a setting that we needed could not be restored, we expect the + // logic test to fail and let us know. + log.Infof(context.Background(), "this is the second error %# v", pretty.Formatter(err)) + continue + } + } + } + } + } + + return nil +} + // fetchSubtests reads through the test file and splices it into subtest chunks. // If there is no subtest, the output will only contain a single entry. func fetchSubtests(path string) ([]subtestDetails, error) { @@ -2106,7 +2296,9 @@ func fetchSubtests(path string) ([]subtestDetails, error) { return subtests, nil } -func (t *logicTest) processSubtest(subtest subtestDetails, path string) error { +func (t *logicTest) processSubtest( + subtest subtestDetails, path string, config testClusterConfig, rng *rand.Rand, +) error { defer t.traceStop() s := newLineScanner(subtest.buffer) @@ -2136,6 +2328,9 @@ func (t *logicTest) processSubtest(subtest subtestDetails, path string) error { path, s.line+subtest.lineLineIndexIntoFile, ) } + if err := t.maybeBackupRestore(rng, config); err != nil { + return err + } switch cmd { case "repeat": // A line "repeat X" makes the test repeat the following statement or query X times. diff --git a/pkg/sql/logictest/testdata/logic_test/ccl b/pkg/sql/logictest/testdata/logic_test/ccl index 844189f1abf4..778b1740731a 100644 --- a/pkg/sql/logictest/testdata/logic_test/ccl +++ b/pkg/sql/logictest/testdata/logic_test/ccl @@ -1,6 +1,6 @@ # 3node-tenant is blocked from running this file because the config runs with # a CCL binary, so the expected failures from using a non-CCL binary don't occur. -# LogicTest: !3node-tenant +# LogicTest: !3node-tenant !3node-backup # CCL-only statements error out trying to handle the parsed statements. diff --git a/pkg/sql/logictest/testdata/logic_test/partitioning b/pkg/sql/logictest/testdata/logic_test/partitioning index 8561b0c16180..704f66ead243 100644 --- a/pkg/sql/logictest/testdata/logic_test/partitioning +++ b/pkg/sql/logictest/testdata/logic_test/partitioning @@ -1,6 +1,6 @@ # 3node-tenant is blocked from running this file because the config runs with # a CCL binary, so the expected failures from using a non-CCL binary don't occur. -# LogicTest: !3node-tenant +# LogicTest: !3node-tenant !3node-backup statement error pgcode XXC01 creating or manipulating partitions requires a CCL binary CREATE TABLE t (a INT, b INT, c INT, PRIMARY KEY (a, b)) PARTITION BY LIST (a) ( diff --git a/pkg/sql/logictest/testdata/logic_test/tenant b/pkg/sql/logictest/testdata/logic_test/tenant index 0791955cfef3..4604567b5144 100644 --- a/pkg/sql/logictest/testdata/logic_test/tenant +++ b/pkg/sql/logictest/testdata/logic_test/tenant @@ -1,4 +1,4 @@ -# LogicTest: !3node-tenant +# LogicTest: !3node-tenant !3node-backup query IBT colnames SELECT * FROM system.tenants ORDER BY id ---- diff --git a/pkg/sql/logictest/testdata/logic_test/zone_config_system_tenant b/pkg/sql/logictest/testdata/logic_test/zone_config_system_tenant index b09121d38072..57d6066b02df 100644 --- a/pkg/sql/logictest/testdata/logic_test/zone_config_system_tenant +++ b/pkg/sql/logictest/testdata/logic_test/zone_config_system_tenant @@ -1,4 +1,4 @@ -# LogicTest: !3node-tenant +# LogicTest: !3node-tenant !3node-backup # Zone config logic tests that are only meant to work for the system tenant. statement ok