From 05e9034379244072d3f1e127391bae333b1fa2b2 Mon Sep 17 00:00:00 2001
From: Paul Bardea
Date: Wed, 14 Oct 2020 18:00:27 -0400
Subject: [PATCH] logictest: randomly backup/restore in logic tests

This commit introduces a new configuration to logic tests that will
randomly back up the cluster, re-create the cluster, and restore that
backup to the new cluster. This is primarily beneficial since it runs
backup/restore under a much more comprehensive set of SQL states.

This configuration is currently disabled: the
`COCKROACH_LOGIC_TEST_BACKUP_RESTORE_PROBABILITY` environment variable
defaults to 0. The goal is to set up a nightly that will set this var
to a non-zero number for a growing set of logic tests.

Co-authored-by: Paul Bardea

Closes #54060.

Release note: None

Release justification: non-production code changes, adds a logictest
config that randomly runs backup and restore between existing logic
tests. This is currently disabled.
---
 pkg/sql/logictest/BUILD.bazel |   1 +
 pkg/sql/logictest/logic.go    | 212 +++++++++++++++++++++++++++++++++-
 2 files changed, 209 insertions(+), 4 deletions(-)

diff --git a/pkg/sql/logictest/BUILD.bazel b/pkg/sql/logictest/BUILD.bazel
index 5672e8e531a8..1612c9daff32 100644
--- a/pkg/sql/logictest/BUILD.bazel
+++ b/pkg/sql/logictest/BUILD.bazel
@@ -47,6 +47,7 @@ go_library(
         "//pkg/util/tracing",
         "@com_github_cockroachdb_errors//:errors",
         "@com_github_cockroachdb_errors//oserror",
+        "@com_github_kr_pretty//:pretty",
         "@com_github_lib_pq//:pq",
         "@com_github_pmezard_go_difflib//difflib",
         "@com_github_stretchr_testify//require",
diff --git a/pkg/sql/logictest/logic.go b/pkg/sql/logictest/logic.go
index b32d0274debd..16b7fc19c6f9 100644
--- a/pkg/sql/logictest/logic.go
+++ b/pkg/sql/logictest/logic.go
@@ -73,6 +73,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/util/tracing"
 	"github.com/cockroachdb/errors"
 	"github.com/cockroachdb/errors/oserror"
+	"github.com/kr/pretty"
 	"github.com/lib/pq"
 	"github.com/pmezard/go-difflib/difflib"
 	"github.com/stretchr/testify/require"
@@ -511,6 +512,9 @@ type testClusterConfig struct {
 	// localities is set if nodes should be set to a particular locality.
 	// Nodes are 1-indexed.
 	localities map[int]roachpb.Locality
+	// backupRestoreProbability will periodically back up the cluster and
+	// restore its state to a new cluster at random points during a logic test.
+	backupRestoreProbability float64
 }
 
 const threeNodeTenantConfigName = "3node-tenant"
@@ -770,6 +774,18 @@ var logicTestConfigs = []testClusterConfig{
 		binaryVersion:  roachpb.Version{Major: 22, Minor: 1},
 		disableUpgrade: true,
 	},
+	{
+		// 3node-backup is a config that periodically performs a cluster backup,
+		// and restores that backup into a new cluster before continuing the
+		// test. This config can only be run with a CCL binary, so it is a no-op
+		// if run through the normal logictest command.
+		// To run a logic test with this config as a directive, run:
+		//   make test PKG=./pkg/ccl/logictestccl TESTS=TestBackupRestoreLogic//
+		name:                     "3node-backup",
+		numNodes:                 3,
+		backupRestoreProbability: envutil.EnvOrDefaultFloat64("COCKROACH_LOGIC_TEST_BACKUP_RESTORE_PROBABILITY", 0.0),
+		isCCLConfig:              true,
+	},
 }
 
 // An index in the above slice.
@@ -1194,6 +1210,16 @@ type logicTest struct {
 	subtestT *testing.T
 	rng      *rand.Rand
 	cfg      testClusterConfig
+	// serverArgs are the parameters used to create a cluster for this test.
+	// They are persisted since a cluster can be recreated throughout the
+	// lifetime of the test and we should create all clusters with the same
+	// arguments.
+	serverArgs *TestServerArgs
+	// clusterOpts are the options used to create a cluster for this test.
+	// They are persisted since a cluster can be recreated throughout the
+	// lifetime of the test and we should create all clusters with the same
+	// arguments.
+	clusterOpts []clusterOpt
 	// cluster is the test cluster against which we are testing. This cluster
 	// may be reset during the lifetime of the test.
 	cluster serverutils.TestClusterInterface
@@ -1723,19 +1749,33 @@ func (t *logicTest) shutdownCluster() {
 	t.db = nil
 }
 
+// resetCluster cleans up the current cluster, and creates a fresh one.
+func (t *logicTest) resetCluster() {
+	t.shutdownCluster()
+	if t.serverArgs == nil {
+		// We expect the server args to be persisted to the test during test
+		// setup.
+		t.Fatal("resetting the cluster before server args were set")
+	}
+	serverArgs := *t.serverArgs
+	t.newCluster(serverArgs, t.clusterOpts)
+}
+
 // setup creates the initial cluster for the logic test and populates the
 // relevant fields on logicTest. It is expected to be called only once (per test
 // file), and before processing any test files - unless a mock logicTest is
 // created (see parallelTest.processTestFile).
 func (t *logicTest) setup(cfg testClusterConfig, serverArgs TestServerArgs, opts []clusterOpt) {
 	t.cfg = cfg
+	t.serverArgs = &serverArgs
+	t.clusterOpts = opts[:]
 	// TODO(pmattis): Add a flag to make it easy to run the tests against a local
 	// MySQL or Postgres instance.
 	tempExternalIODir, tempExternalIODirCleanup := testutils.TempDir(t.rootT)
 	t.sharedIODir = tempExternalIODir
 	t.testCleanupFuncs = append(t.testCleanupFuncs, tempExternalIODirCleanup)
 
-	t.newCluster(serverArgs, opts)
+	t.newCluster(serverArgs, t.clusterOpts)
 
 	// Only create the test database on the initial cluster, since cluster restore
 	// expects an empty cluster.
@@ -1991,6 +2031,9 @@ type subtestDetails struct {
 }
 
 func (t *logicTest) processTestFile(path string, config testClusterConfig) error {
+	rng, seed := randutil.NewPseudoRand()
+	t.outf("rng seed: %d\n", seed)
+
 	subtests, err := fetchSubtests(path)
 	if err != nil {
 		return err
 	}
@@ -2008,7 +2051,7 @@ func (t *logicTest) processTestFile(path string, config testClusterConfig) error
 		// If subtest has no name, then it is not a subtest, so just run the lines
 		// in the overall test. Note that this can only happen in the first subtest.
 		if len(subtest.name) == 0 {
-			if err := t.processSubtest(subtest, path); err != nil {
+			if err := t.processSubtest(subtest, path, config, rng); err != nil {
 				return err
 			}
 		} else {
@@ -2018,7 +2061,7 @@ func (t *logicTest) processTestFile(path string, config testClusterConfig) error
 				defer func() {
 					t.subtestT = nil
 				}()
-				if err := t.processSubtest(subtest, path); err != nil {
+				if err := t.processSubtest(subtest, path, config, rng); err != nil {
 					t.Error(err)
 				}
 			})
@@ -2044,6 +2087,162 @@ func (t *logicTest) processTestFile(path string, config testClusterConfig) error
 	return nil
 }
 
+func (t *logicTest) hasOpenTxns(ctx context.Context) bool {
+	for _, user := range t.clients {
+		existingTxnPriority := "NORMAL"
+		err := user.QueryRow("SHOW TRANSACTION PRIORITY").Scan(&existingTxnPriority)
+		if err != nil {
+			// Ignore an error if we are unable to see transaction priority.
+			log.Warningf(ctx, "failed to check txn priority with %v", err)
+			continue
+		}
+		if _, err := user.Exec("SET TRANSACTION PRIORITY NORMAL;"); !testutils.IsError(err, "there is no transaction in progress") {
+			// Reset the txn priority to what it was before we checked for open txns.
+			_, err := user.Exec(fmt.Sprintf(`SET TRANSACTION PRIORITY %s`, existingTxnPriority))
+			if err != nil {
+				log.Warningf(ctx, "failed to reset txn priority with %v", err)
+			}
+			return true
+		}
+	}
+	return false
+}
+
+// maybeBackupRestore will randomly issue a cluster backup, create a new
+// cluster, and restore that backup to the cluster before continuing the test.
+// The probability of executing a backup and restore is specified in the
+// testClusterConfig.
+func (t *logicTest) maybeBackupRestore(rng *rand.Rand, config testClusterConfig) error {
+	if config.backupRestoreProbability != 0 && !config.isCCLConfig {
+		return errors.Newf("logic test config %s specifies a backup restore probability but is not CCL",
+			config.name)
+	}
+
+	// We decide if we want to take a backup here based on a probability
+	// specified in the logic test config.
+	if rng.Float64() > config.backupRestoreProbability {
+		return nil
+	}
+
+	// Check if any users have open transactions in the logictest. If they do,
+	// we do not want to tear down the cluster and create a new one as it might
+	// interfere with what the logictest is trying to test.
+	//
+	// We could perhaps make this smarter and perform the backup after the
+	// transaction is closed.
+	if t.hasOpenTxns(context.Background()) {
+		return nil
+	}
+
+	oldUser := t.user
+	defer func() {
+		t.setUser(oldUser, 0 /* nodeIdxOverride */)
+	}()
+
+	// To restore the same state for the logic test, we need to restore the
+	// data and the session state. The session state includes things like
+	// session variables that are set for every session that is open.
+	//
+	// TODO(adityamaru): A better approach might be to wipe the cluster once we
+	// have a command that enables this. That way all of the session data will
+	// not be lost in the process of creating a new cluster.
+	users := make([]string, 0, len(t.clients))
+	userToHexSession := make(map[string]string, len(t.clients))
+	userToSessionVars := make(map[string]map[string]string, len(t.clients))
+	for user := range t.clients {
+		t.setUser(user, 0 /* nodeIdxOverride */)
+		users = append(users, user)
+
+		// Serialize session variables.
+		var userSession string
+		var err error
+		if err = t.db.QueryRow(`SELECT encode(crdb_internal.serialize_session(), 'hex')`).Scan(&userSession); err == nil {
+			userToHexSession[user] = userSession
+			continue
+		}
+		log.Warningf(context.Background(), "failed to serialize session: %+v", err)
+
+		// If we failed to serialize the session variables, let's save the output
+		// of `SHOW ALL`. This usually happens if the session contains prepared
+		// statements or portals that cause `serialize_session()` to fail.
+		//
+		// Saving the session variables in this manner does not guarantee the
+		// test will succeed since there are no ordering semantics when we go to
+		// apply them. There are some session variables that need to be applied
+		// before others for them to be valid. Thus, it is strictly better to use
+		// `serialize/deserialize_session()`; this "hack" just gives the test one
+		// more chance to succeed.
+		userSessionVars := make(map[string]string)
+		existingSessionVars, err := t.db.Query("SHOW ALL")
+		if err != nil {
+			return err
+		}
+		for existingSessionVars.Next() {
+			var key, value string
+			if err := existingSessionVars.Scan(&key, &value); err != nil {
+				return errors.Wrap(err, "scanning session variables")
+			}
+			userSessionVars[key] = value
+		}
+		if err := existingSessionVars.Err(); err != nil {
+			return errors.Wrap(err, "reading session variables")
+		}
+		userToSessionVars[user] = userSessionVars
+	}
+
+	backupLocation := fmt.Sprintf("nodelocal://1/logic-test-backup-%s",
+		strconv.FormatInt(timeutil.Now().UnixNano(), 10))
+
+	// Perform the backup and restore as root.
+	t.setUser(security.RootUser, 0 /* nodeIdxOverride */)
+
+	if _, err := t.db.Exec(fmt.Sprintf("BACKUP TO '%s'", backupLocation)); err != nil {
+		return errors.Wrap(err, "backing up cluster")
+	}
+
+	// Create a new cluster. Perhaps this can be updated to just wipe the
+	// existing cluster once we have the ability to easily wipe a cluster
+	// through SQL.
+	t.resetCluster()
+
+	// Run the restore as root.
+	t.setUser(security.RootUser, 0 /* nodeIdxOverride */)
+	if _, err := t.db.Exec(fmt.Sprintf("RESTORE FROM '%s'", backupLocation)); err != nil {
+		return errors.Wrap(err, "restoring cluster")
+	}
+
+	// Restore the session state that was in the old cluster.
+
+	// Create new connections for the existing users, and restore the session
+	// variables that we collected.
+	for _, user := range users {
+		// Call setUser for every user to create the connection for that user.
+		t.setUser(user, 0 /* nodeIdxOverride */)
+
+		if userSession, ok := userToHexSession[user]; ok {
+			if _, err := t.db.Exec(fmt.Sprintf(`SELECT crdb_internal.deserialize_session(decode('%s', 'hex'))`, userSession)); err != nil {
+				return errors.Wrapf(err, "deserializing session")
+			}
+		} else if vars, ok := userToSessionVars[user]; ok {
+			// We now attempt to restore the session variables that were set on
+			// the backing up cluster. These are not included in the backup and so
+			// have to be restored manually.
+			for key, value := range vars {
+				// First try setting the session variable as a string.
+				if _, err := t.db.Exec(fmt.Sprintf("SET %s='%s'", key, value)); err != nil {
+					// If that fails, try setting the value as an int.
+					log.Infof(context.Background(), "setting session variable as string failed (err: %v), trying as int", pretty.Formatter(err))
+					if _, err := t.db.Exec(fmt.Sprintf("SET %s=%s", key, value)); err != nil {
+						// Some session variables can't be set at all, so ignore these
+						// errors. If a setting that we needed could not be restored,
+						// we expect the logic test to fail and let us know.
+						log.Infof(context.Background(), "setting session variable as int failed: %v (continuing anyway)", pretty.Formatter(err))
+						continue
+					}
+				}
+			}
+		}
+	}
+
+	return nil
+}
+
+// fetchSubtests reads through the test file and splices it into subtest
+// chunks. If there is no subtest, the output will only contain a single
+// entry.
 func fetchSubtests(path string) ([]subtestDetails, error) {
@@ -2092,7 +2291,9 @@ func fetchSubtests(path string) ([]subtestDetails, error) {
 	return subtests, nil
 }
 
-func (t *logicTest) processSubtest(subtest subtestDetails, path string) error {
+func (t *logicTest) processSubtest(
+	subtest subtestDetails, path string, config testClusterConfig, rng *rand.Rand,
+) error {
 	defer t.traceStop()
 	s := newLineScanner(subtest.buffer)
@@ -2122,6 +2323,9 @@ func (t *logicTest) processSubtest(subtest subtestDetails, path string) error {
 				path, s.line+subtest.lineLineIndexIntoFile,
 			)
 		}
+		if err := t.maybeBackupRestore(rng, config); err != nil {
+			return err
+		}
 		switch cmd {
 		case "repeat":
 			// A line "repeat X" makes the test repeat the following statement or
 			// query X times.
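
The sketches below are standalone approximations of the techniques this
change relies on; they are not part of the patch, and the connection
strings, helper names, and values in them are illustrative placeholders.

The per-line gating composes an environment variable with a per-step
random draw. A minimal sketch of that pattern, where `envOrDefaultFloat64`
approximates the assumed behavior of `envutil.EnvOrDefaultFloat64`:

    package main

    import (
        "fmt"
        "math/rand"
        "os"
        "strconv"
    )

    // envOrDefaultFloat64 approximates the assumed behavior of
    // envutil.EnvOrDefaultFloat64: parse a float from the environment,
    // falling back to the default when the variable is unset or unparsable.
    func envOrDefaultFloat64(name string, def float64) float64 {
        if v, ok := os.LookupEnv(name); ok {
            if f, err := strconv.ParseFloat(v, 64); err == nil {
                return f
            }
        }
        return def
    }

    func main() {
        p := envOrDefaultFloat64("COCKROACH_LOGIC_TEST_BACKUP_RESTORE_PROBABILITY", 0.0)
        // The logic test logs its seed so a failing run can be replayed.
        rng := rand.New(rand.NewSource(42))
        for line := 1; line <= 10; line++ {
            // With the default p == 0, rng.Float64() (in [0, 1)) is almost
            // always greater, so the backup path is effectively disabled.
            if rng.Float64() > p {
                continue
            }
            fmt.Printf("would backup/restore before test line %d\n", line)
        }
    }

Drawing once per test line rather than once per file is what spreads the
backup/restore over many intermediate SQL states.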
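
The open-transaction check leans on a side effect of SET TRANSACTION
PRIORITY: outside an explicit transaction it fails with "there is no
transaction in progress". A rough standalone illustration of the probe
(unlike the patch, this sketch does not restore the prior priority):

    package main

    import (
        "context"
        "database/sql"
        "log"
        "strings"

        _ "github.com/lib/pq" // Postgres-wire driver, as used by the logic tests
    )

    // hasOpenTxn reports whether the session on conn has an explicit
    // transaction open. SET TRANSACTION PRIORITY only succeeds inside a
    // transaction; any outcome other than the "no transaction" error is
    // conservatively treated as an open transaction, mirroring the patch.
    func hasOpenTxn(ctx context.Context, conn *sql.Conn) bool {
        _, err := conn.ExecContext(ctx, "SET TRANSACTION PRIORITY NORMAL")
        return err == nil || !strings.Contains(err.Error(), "there is no transaction in progress")
    }

    func main() {
        ctx := context.Background()
        // Placeholder connection string for a local single-node cluster.
        db, err := sql.Open("postgres", "postgresql://root@localhost:26257/defaultdb?sslmode=disable")
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()

        // Session state is per connection, so pin one instead of using the pool.
        conn, err := db.Conn(ctx)
        if err != nil {
            log.Fatal(err)
        }
        defer conn.Close()

        log.Printf("open txn: %v", hasOpenTxn(ctx, conn))
    }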
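
The session round trip rests on the `crdb_internal.serialize_session()`
and `crdb_internal.deserialize_session()` builtins used in the patch. A
sketch of carrying one connection's session state onto another, assuming
a locally running CockroachDB node:

    package main

    import (
        "context"
        "database/sql"
        "fmt"
        "log"

        _ "github.com/lib/pq"
    )

    func main() {
        ctx := context.Background()
        db, err := sql.Open("postgres", "postgresql://root@localhost:26257/defaultdb?sslmode=disable")
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()

        // Use dedicated connections: session state lives per connection,
        // not per pool.
        oldSess, err := db.Conn(ctx)
        if err != nil {
            log.Fatal(err)
        }
        defer oldSess.Close()

        if _, err := oldSess.ExecContext(ctx, "SET application_name = 'logic-test'"); err != nil {
            log.Fatal(err)
        }

        // Capture the session as a hex string, as the patch does. This fails
        // if the session holds prepared statements or portals.
        var hexSession string
        if err := oldSess.QueryRowContext(ctx,
            "SELECT encode(crdb_internal.serialize_session(), 'hex')").Scan(&hexSession); err != nil {
            log.Fatal(err)
        }

        newSess, err := db.Conn(ctx)
        if err != nil {
            log.Fatal(err)
        }
        defer newSess.Close()

        // Replay the captured state onto the new session.
        if _, err := newSess.ExecContext(ctx, fmt.Sprintf(
            "SELECT crdb_internal.deserialize_session(decode('%s', 'hex'))", hexSession)); err != nil {
            log.Fatal(err)
        }

        var appName string
        if err := newSess.QueryRowContext(ctx, "SHOW application_name").Scan(&appName); err != nil {
            log.Fatal(err)
        }
        log.Printf("restored application_name = %q", appName)
    }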
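
The backup and restore themselves are plain SQL, using the same
statements and `nodelocal` naming scheme as the patch. A full-cluster
RESTORE expects an empty target cluster, which is why the test recreates
the cluster in between; this sketch therefore only issues the backup and
prints the matching restore statement:

    package main

    import (
        "context"
        "database/sql"
        "fmt"
        "log"
        "time"

        _ "github.com/lib/pq"
    )

    func main() {
        ctx := context.Background()
        db, err := sql.Open("postgres", "postgresql://root@localhost:26257/defaultdb?sslmode=disable")
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()

        // Unique per-run location, mirroring the patch's naming scheme.
        loc := fmt.Sprintf("nodelocal://1/logic-test-backup-%d", time.Now().UnixNano())

        // Full-cluster backup; runs as an admin user (root here).
        if _, err := db.ExecContext(ctx, fmt.Sprintf("BACKUP TO '%s'", loc)); err != nil {
            log.Fatalf("backing up cluster: %v", err)
        }

        // A full-cluster RESTORE must target an empty cluster; running it
        // here, against the same cluster, would fail.
        fmt.Printf("on a new, empty cluster run: RESTORE FROM '%s'\n", loc)
    }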
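
Finally, the `SHOW ALL` fallback replays each variable with `SET`, quoted
first and bare second, because `SHOW ALL` reports every value as an
untyped string. A condensed sketch of that retry (`restoreVars` is
illustrative, and a real caller would pin one connection, as in the
earlier sketches, since session variables are per session):

    package main

    import (
        "database/sql"
        "fmt"
        "log"

        _ "github.com/lib/pq"
    )

    // restoreVars replays session variables captured from SHOW ALL. Each SET
    // is tried with the value quoted first and bare second; variables that
    // fail both ways are skipped, matching the patch's best-effort behavior.
    func restoreVars(db *sql.DB, vars map[string]string) {
        for key, value := range vars {
            if _, err := db.Exec(fmt.Sprintf("SET %s = '%s'", key, value)); err == nil {
                continue
            }
            if _, err := db.Exec(fmt.Sprintf("SET %s = %s", key, value)); err != nil {
                // Read-only and special variables cannot be set; skip them and
                // let the test surface any state that actually mattered.
                log.Printf("could not restore %s=%s: %v", key, value, err)
            }
        }
    }

    func main() {
        db, err := sql.Open("postgres", "postgresql://root@localhost:26257/defaultdb?sslmode=disable")
        if err != nil {
            log.Fatal(err)
        }
        defer db.Close()
        restoreVars(db, map[string]string{
            "application_name":   "logic-test",
            "extra_float_digits": "3",
        })
    }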