From 5182478a79c55b2e9b33a11dfbf0e1bf8bc195dd Mon Sep 17 00:00:00 2001 From: Raphael 'kena' Poss Date: Tue, 4 Apr 2023 00:49:25 +0200 Subject: [PATCH] kvtenantccl: unskip TestTenantUpgradeInterlock It does take an enormous amount of time to run successfully, more than the previous timeout allowed: ``` --- PASS: TestTenantUpgradeInterlock (316.36s) --- PASS: TestTenantUpgradeInterlock/lagging_binary_version (120.85s) --- PASS: TestTenantUpgradeInterlock/lagging_binary_version/pause_after_first_check_for_instances (9.02s) --- PASS: TestTenantUpgradeInterlock/lagging_binary_version/pause_after_fence_RPC (10.11s) --- PASS: TestTenantUpgradeInterlock/lagging_binary_version/pause_after_fence_write_to_settings_table (20.27s) --- PASS: TestTenantUpgradeInterlock/lagging_binary_version/pause_after_second_check_of_instances (20.20s) --- PASS: TestTenantUpgradeInterlock/lagging_binary_version/pause_after_migration (20.45s) --- PASS: TestTenantUpgradeInterlock/lagging_binary_version/pause_after_version_bump_RPC (20.28s) --- PASS: TestTenantUpgradeInterlock/lagging_binary_version/pause_after_write_to_settings_table (20.51s) --- PASS: TestTenantUpgradeInterlock/current_binary_version (195.51s) --- PASS: TestTenantUpgradeInterlock/current_binary_version/pause_after_first_check_for_instances (20.00s) --- PASS: TestTenantUpgradeInterlock/current_binary_version/pause_after_fence_RPC (30.23s) --- PASS: TestTenantUpgradeInterlock/current_binary_version/pause_after_fence_write_to_settings_table (29.02s) --- PASS: TestTenantUpgradeInterlock/current_binary_version/pause_after_second_check_of_instances (29.14s) --- PASS: TestTenantUpgradeInterlock/current_binary_version/pause_after_migration (29.00s) --- PASS: TestTenantUpgradeInterlock/current_binary_version/pause_after_version_bump_RPC (29.03s) --- PASS: TestTenantUpgradeInterlock/current_binary_version/pause_after_write_to_settings_table (29.08s) PASS INFO: Elapsed time: 318.059s, Critical Path: 317.36s INFO: 2 processes: 1 internal, 
1 processwrapper-sandbox. //pkg/ccl/kvccl/kvtenantccl:kvtenantccl_test PASSED in 317.2s ``` (and this is on a faster machine than in CI) Release note: None --- pkg/BUILD.bazel | 6 + pkg/ccl/kvccl/kvtenantccl/BUILD.bazel | 14 - .../kvccl/kvtenantccl/upgradeccl/BUILD.bazel | 40 ++ .../kvccl/kvtenantccl/upgradeccl/main_test.go | 33 ++ .../upgradeccl/tenant_upgrade_test.go | 387 +++++++++++++++++ .../upgradeinterlockccl/BUILD.bazel | 44 ++ .../upgradeinterlockccl/main_test.go | 33 ++ .../tenant_upgrade_test.go | 403 ++---------------- 8 files changed, 582 insertions(+), 378 deletions(-) create mode 100644 pkg/ccl/kvccl/kvtenantccl/upgradeccl/BUILD.bazel create mode 100644 pkg/ccl/kvccl/kvtenantccl/upgradeccl/main_test.go create mode 100644 pkg/ccl/kvccl/kvtenantccl/upgradeccl/tenant_upgrade_test.go create mode 100644 pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/BUILD.bazel create mode 100644 pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/main_test.go rename pkg/ccl/kvccl/kvtenantccl/{ => upgradeinterlockccl}/tenant_upgrade_test.go (55%) diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index d7b0790c5c4a..59366dd588c3 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -40,6 +40,8 @@ ALL_TESTS = [ "//pkg/ccl/jobsccl/jobsprotectedtsccl:jobsprotectedtsccl_test", "//pkg/ccl/jwtauthccl:jwtauthccl_test", "//pkg/ccl/kvccl/kvfollowerreadsccl:kvfollowerreadsccl_test", + "//pkg/ccl/kvccl/kvtenantccl/upgradeccl:upgradeccl_test", + "//pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl:upgradeinterlockccl_test", "//pkg/ccl/kvccl/kvtenantccl:kvtenantccl_test", "//pkg/ccl/logictestccl/tests/3node-tenant-multiregion:3node-tenant-multiregion_test", "//pkg/ccl/logictestccl/tests/3node-tenant:3node-tenant_test", @@ -775,6 +777,8 @@ GO_TARGETS = [ "//pkg/ccl/jwtauthccl:jwtauthccl_test", "//pkg/ccl/kvccl/kvfollowerreadsccl:kvfollowerreadsccl", "//pkg/ccl/kvccl/kvfollowerreadsccl:kvfollowerreadsccl_test", + "//pkg/ccl/kvccl/kvtenantccl/upgradeccl:upgradeccl_test", + 
"//pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl:upgradeinterlockccl_test", "//pkg/ccl/kvccl/kvtenantccl:kvtenantccl", "//pkg/ccl/kvccl/kvtenantccl:kvtenantccl_test", "//pkg/ccl/kvccl:kvccl", @@ -2417,6 +2421,8 @@ GET_X_DATA_TARGETS = [ "//pkg/ccl/kvccl:get_x_data", "//pkg/ccl/kvccl/kvfollowerreadsccl:get_x_data", "//pkg/ccl/kvccl/kvtenantccl:get_x_data", + "//pkg/ccl/kvccl/kvtenantccl/upgradeccl:get_x_data", + "//pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl:get_x_data", "//pkg/ccl/logictestccl:get_x_data", "//pkg/ccl/logictestccl/tests/3node-tenant:get_x_data", "//pkg/ccl/logictestccl/tests/3node-tenant-multiregion:get_x_data", diff --git a/pkg/ccl/kvccl/kvtenantccl/BUILD.bazel b/pkg/ccl/kvccl/kvtenantccl/BUILD.bazel index ece2b7681113..4fcf239fdbbf 100644 --- a/pkg/ccl/kvccl/kvtenantccl/BUILD.bazel +++ b/pkg/ccl/kvccl/kvtenantccl/BUILD.bazel @@ -54,7 +54,6 @@ go_test( "tenant_range_lookup_test.go", "tenant_scan_range_descriptors_test.go", "tenant_trace_test.go", - "tenant_upgrade_test.go", ], args = ["-test.timeout=895s"], embed = [":kvtenantccl"], @@ -63,10 +62,8 @@ go_test( deps = [ "//pkg/base", "//pkg/ccl", - "//pkg/clusterversion", "//pkg/config", "//pkg/gossip", - "//pkg/jobs", "//pkg/keys", "//pkg/kv/kvclient/kvcoord", "//pkg/kv/kvclient/kvtenant", @@ -80,24 +77,13 @@ go_test( "//pkg/security", "//pkg/security/securityassets", "//pkg/security/securitytest", - "//pkg/security/username", "//pkg/server", "//pkg/settings", - "//pkg/settings/cluster", - "//pkg/spanconfig", "//pkg/sql", - "//pkg/sql/catalog/lease", - "//pkg/sql/sessiondatapb", - "//pkg/sql/sqlinstance/instancestorage", - "//pkg/sql/sqlliveness/slinstance", - "//pkg/sql/stats", "//pkg/testutils", "//pkg/testutils/serverutils", - "//pkg/testutils/skip", "//pkg/testutils/sqlutils", "//pkg/testutils/testcluster", - "//pkg/upgrade", - "//pkg/upgrade/upgradebase", "//pkg/util", "//pkg/util/grpcutil", "//pkg/util/hlc", diff --git a/pkg/ccl/kvccl/kvtenantccl/upgradeccl/BUILD.bazel 
b/pkg/ccl/kvccl/kvtenantccl/upgradeccl/BUILD.bazel new file mode 100644 index 000000000000..86b6a6756adf --- /dev/null +++ b/pkg/ccl/kvccl/kvtenantccl/upgradeccl/BUILD.bazel @@ -0,0 +1,40 @@ +load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data") +load("@io_bazel_rules_go//go:def.bzl", "go_test") + +go_test( + name = "upgradeccl_test", + srcs = [ + "main_test.go", + "tenant_upgrade_test.go", + ], + args = ["-test.timeout=295s"], + tags = ["ccl_test"], + deps = [ + "//pkg/base", + "//pkg/ccl", + "//pkg/clusterversion", + "//pkg/jobs", + "//pkg/roachpb", + "//pkg/security/securityassets", + "//pkg/security/securitytest", + "//pkg/security/username", + "//pkg/server", + "//pkg/settings/cluster", + "//pkg/spanconfig", + "//pkg/sql/sqlinstance/instancestorage", + "//pkg/sql/sqlliveness/slinstance", + "//pkg/testutils/serverutils", + "//pkg/testutils/skip", + "//pkg/testutils/sqlutils", + "//pkg/testutils/testcluster", + "//pkg/upgrade", + "//pkg/upgrade/upgradebase", + "//pkg/util/leaktest", + "//pkg/util/log", + "//pkg/util/randutil", + "//pkg/util/stop", + "@com_github_stretchr_testify//require", + ], +) + +get_x_data(name = "get_x_data") diff --git a/pkg/ccl/kvccl/kvtenantccl/upgradeccl/main_test.go b/pkg/ccl/kvccl/kvtenantccl/upgradeccl/main_test.go new file mode 100644 index 000000000000..0b203745bcc4 --- /dev/null +++ b/pkg/ccl/kvccl/kvtenantccl/upgradeccl/main_test.go @@ -0,0 +1,33 @@ +// Copyright 2023 The Cockroach Authors. +// +// Licensed as a CockroachDB Enterprise file under the Cockroach Community +// License (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt + +package upgradeccl_test + +import ( + "os" + "testing" + + "github.com/cockroachdb/cockroach/pkg/ccl" + "github.com/cockroachdb/cockroach/pkg/security/securityassets" + "github.com/cockroachdb/cockroach/pkg/security/securitytest" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" + "github.com/cockroachdb/cockroach/pkg/util/randutil" +) + +//go:generate ../../../util/leaktest/add-leaktest.sh *_test.go + +func TestMain(m *testing.M) { + defer ccl.TestingEnableEnterprise()() + securityassets.SetLoader(securitytest.EmbeddedAssets) + randutil.SeedForTests() + serverutils.InitTestServerFactory(server.TestServerFactory) + serverutils.InitTestClusterFactory(testcluster.TestClusterFactory) + os.Exit(m.Run()) +} diff --git a/pkg/ccl/kvccl/kvtenantccl/upgradeccl/tenant_upgrade_test.go b/pkg/ccl/kvccl/kvtenantccl/upgradeccl/tenant_upgrade_test.go new file mode 100644 index 000000000000..e4b88af0ab4d --- /dev/null +++ b/pkg/ccl/kvccl/kvtenantccl/upgradeccl/tenant_upgrade_test.go @@ -0,0 +1,387 @@ +// Copyright 2021 The Cockroach Authors. +// +// Licensed as a CockroachDB Enterprise file under the Cockroach Community +// License (the "License"); you may not use this file except in compliance with +// the License. 
You may obtain a copy of the License at +// +// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt + +package upgradeccl_test + +import ( + "context" + gosql "database/sql" + "net/url" + "testing" + "time" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/clusterversion" + "github.com/cockroachdb/cockroach/pkg/jobs" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/security/username" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/settings/cluster" + "github.com/cockroachdb/cockroach/pkg/spanconfig" + "github.com/cockroachdb/cockroach/pkg/sql/sqlinstance/instancestorage" + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness/slinstance" + "github.com/cockroachdb/cockroach/pkg/testutils/skip" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" + "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" + "github.com/cockroachdb/cockroach/pkg/upgrade" + "github.com/cockroachdb/cockroach/pkg/upgrade/upgradebase" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/stop" + "github.com/stretchr/testify/require" +) + +// TestTenantUpgrade exercises the case where a system tenant is in a +// non-finalized version state and creates a tenant. The test ensures +// that the newly created tenant begins in that same version. +// +// The first subtest creates the tenant in the mixed version state, +// then upgrades the system tenant, then upgrades the secondary tenant, +// and ensures everything is happy. It then restarts the tenant and ensures +// that the cluster version is properly set. +// +// The second subtest creates a new tenant after the system tenant has been +// upgraded and ensures that it is created at the final cluster version. 
It +// also verifies that the version is correct after a restart +func TestTenantUpgrade(t *testing.T) { + defer leaktest.AfterTest(t)() + skip.WithIssue(t, 97076, "flaky test") + defer log.Scope(t).Close(t) + ctx := context.Background() + settings := cluster.MakeTestingClusterSettingsWithVersions( + clusterversion.TestingBinaryVersion, + clusterversion.TestingBinaryMinSupportedVersion, + false, // initializeVersion + ) + // Initialize the version to the BinaryMinSupportedVersion. + require.NoError(t, clusterversion.Initialize(ctx, + clusterversion.TestingBinaryMinSupportedVersion, &settings.SV)) + tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ + ServerArgs: base.TestServerArgs{ + // Test validates tenant behavior. No need for the default test + // tenant. + DefaultTestTenant: base.TestTenantDisabled, + Settings: settings, + Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + DisableAutomaticVersionUpgrade: make(chan struct{}), + BinaryVersionOverride: clusterversion.TestingBinaryMinSupportedVersion, + }, + }, + }, + }) + defer tc.Stopper().Stop(ctx) + + connectToTenant := func(t *testing.T, addr string) (_ *gosql.DB, cleanup func()) { + pgURL, cleanupPGUrl := sqlutils.PGUrl(t, addr, "Tenant", url.User(username.RootUser)) + tenantDB, err := gosql.Open("postgres", pgURL.String()) + require.NoError(t, err) + return tenantDB, func() { + tenantDB.Close() + cleanupPGUrl() + } + } + expectedInitialTenantVersion, _, _ := v0v1v2() + mkTenant := func(t *testing.T, id uint64) (tenantDB *gosql.DB, cleanup func()) { + settings := cluster.MakeTestingClusterSettingsWithVersions( + clusterversion.TestingBinaryVersion, + clusterversion.TestingBinaryMinSupportedVersion, + false, // initializeVersion + ) + // Initialize the version to the minimum it could be. 
+ require.NoError(t, clusterversion.Initialize(ctx, + expectedInitialTenantVersion, &settings.SV)) + tenantArgs := base.TestTenantArgs{ + TenantID: roachpb.MustMakeTenantID(id), + TestingKnobs: base.TestingKnobs{}, + Settings: settings, + } + tenant, err := tc.Server(0).StartTenant(ctx, tenantArgs) + require.NoError(t, err) + return connectToTenant(t, tenant.SQLAddr()) + } + + t.Run("upgrade tenant", func(t *testing.T) { + // Create a tenant before upgrading anything and verify its version. + const initialTenantID = 10 + initialTenant, cleanup := mkTenant(t, initialTenantID) + initialTenantRunner := sqlutils.MakeSQLRunner(initialTenant) + + // Ensure that the tenant works. + initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", + [][]string{{expectedInitialTenantVersion.String()}}) + initialTenantRunner.Exec(t, "CREATE TABLE t (i INT PRIMARY KEY)") + initialTenantRunner.Exec(t, "INSERT INTO t VALUES (1), (2)") + + // Upgrade the host cluster. + sqlutils.MakeSQLRunner(tc.ServerConn(0)).Exec(t, + "SET CLUSTER SETTING version = $1", + clusterversion.TestingBinaryVersion.String()) + + // Ensure that the tenant still works. + initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) + + // Upgrade the tenant cluster. + initialTenantRunner.Exec(t, + "SET CLUSTER SETTING version = $1", + clusterversion.TestingBinaryVersion.String()) + + // Ensure that the tenant still works. + initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) + initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", + [][]string{{clusterversion.TestingBinaryVersion.String()}}) + + // Restart the tenant and ensure that the version is correct. 
+ cleanup() + { + tenantServer, err := tc.Server(0).StartTenant(ctx, base.TestTenantArgs{ + TenantID: roachpb.MustMakeTenantID(initialTenantID), + }) + require.NoError(t, err) + initialTenant, cleanup = connectToTenant(t, tenantServer.SQLAddr()) + defer cleanup() + initialTenantRunner = sqlutils.MakeSQLRunner(initialTenant) + } + initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) + initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", + [][]string{{clusterversion.TestingBinaryVersion.String()}}) + }) + + t.Run("post-upgrade tenant", func(t *testing.T) { + // Create a new tenant and ensure it has the right version. + const postUpgradeTenantID = 11 + postUpgradeTenant, cleanup := mkTenant(t, postUpgradeTenantID) + sqlutils.MakeSQLRunner(postUpgradeTenant).CheckQueryResults(t, + "SHOW CLUSTER SETTING version", + [][]string{{clusterversion.TestingBinaryVersion.String()}}) + + // Restart the new tenant and ensure it has the right version. + cleanup() + { + tenantServer, err := tc.Server(0).StartTenant(ctx, base.TestTenantArgs{ + TenantID: roachpb.MustMakeTenantID(postUpgradeTenantID), + }) + require.NoError(t, err) + postUpgradeTenant, cleanup = connectToTenant(t, tenantServer.SQLAddr()) + defer cleanup() + } + sqlutils.MakeSQLRunner(postUpgradeTenant).CheckQueryResults(t, + "SHOW CLUSTER SETTING version", + [][]string{{clusterversion.TestingBinaryVersion.String()}}) + }) + +} + +// Returns three versions : +// - v0 corresponds to the bootstrapped version of the tenant, +// - v1, v2 correspond to adjacent releases. +func v0v1v2() (roachpb.Version, roachpb.Version, roachpb.Version) { + v0 := clusterversion.ByKey(clusterversion.V22_2) + v1 := clusterversion.TestingBinaryVersion + v2 := clusterversion.TestingBinaryVersion + if v1.Internal > 2 { + v1.Internal -= 2 + } else { + v2.Internal += 2 + } + return v0, v1, v2 +} + +// TestTenantUpgradeFailure exercises cases where the tenant dies +// between version upgrades. 
+func TestTenantUpgradeFailure(t *testing.T) { + defer leaktest.AfterTest(t)() + skip.WithIssue(t, 98555, "flaky test") + defer log.Scope(t).Close(t) + // Contains information for starting a tenant + // and maintaining a stopper. + type tenantInfo struct { + v2onMigrationStopper *stop.Stopper + tenantArgs *base.TestTenantArgs + } + v0, v1, v2 := v0v1v2() + ctx := context.Background() + settings := cluster.MakeTestingClusterSettingsWithVersions( + v2, + v0, + false, // initializeVersion + ) + // Initialize the version to the BinaryMinSupportedVersion. + tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ + ServerArgs: base.TestServerArgs{ + // Test validates tenant behavior. No need for the default test + // tenant here. + DefaultTestTenant: base.TestTenantDisabled, + Settings: settings, + Knobs: base.TestingKnobs{ + Server: &server.TestingKnobs{ + DisableAutomaticVersionUpgrade: make(chan struct{}), + BinaryVersionOverride: v0, + }, + }, + }, + }) + defer tc.Stopper().Stop(ctx) + + // Channel for stopping a tenant. + tenantStopperChannel := make(chan struct{}) + startAndConnectToTenant := func(t *testing.T, tenantInfo *tenantInfo) (_ *gosql.DB, cleanup func()) { + tenant, err := tc.Server(0).StartTenant(ctx, *tenantInfo.tenantArgs) + require.NoError(t, err) + pgURL, cleanupPGUrl := sqlutils.PGUrl(t, tenant.SQLAddr(), "Tenant", url.User(username.RootUser)) + tenantDB, err := gosql.Open("postgres", pgURL.String()) + require.NoError(t, err) + return tenantDB, func() { + tenantDB.Close() + cleanupPGUrl() + } + } + mkTenant := func(t *testing.T, id uint64) *tenantInfo { + settings := cluster.MakeTestingClusterSettingsWithVersions( + v2, + v0, + false, // initializeVersion + ) + // Shorten the reclaim loop so that terminated SQL servers don't block + // the upgrade from succeeding. 
+ instancestorage.ReclaimLoopInterval.Override(ctx, &settings.SV, 250*time.Millisecond) + slinstance.DefaultTTL.Override(ctx, &settings.SV, 3*time.Second) + slinstance.DefaultHeartBeat.Override(ctx, &settings.SV, 500*time.Millisecond) + v2onMigrationStopper := stop.NewStopper() + // Initialize the version to the minimum it could be. + require.NoError(t, clusterversion.Initialize(ctx, + v0, &settings.SV)) + tenantArgs := base.TestTenantArgs{ + Stopper: v2onMigrationStopper, + TenantID: roachpb.MustMakeTenantID(id), + TestingKnobs: base.TestingKnobs{ + JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(), + // Disable the span config job so that it doesn't interfere with + // the upgrade interlock. + SpanConfig: &spanconfig.TestingKnobs{ + ManagerDisableJobCreation: true, + }, + UpgradeManager: &upgradebase.TestingKnobs{ + DontUseJobs: true, + ListBetweenOverride: func(from, to roachpb.Version) []roachpb.Version { + return []roachpb.Version{v1, v2} + }, + RegistryOverride: func(v roachpb.Version) (upgradebase.Upgrade, bool) { + switch v { + case v1: + return upgrade.NewTenantUpgrade("testing", + v1, + upgrade.NoPrecondition, + func( + ctx context.Context, version clusterversion.ClusterVersion, deps upgrade.TenantDeps, + ) error { + return nil + }), true + case v2: + return upgrade.NewTenantUpgrade("testing next", + v2, + upgrade.NoPrecondition, + func( + ctx context.Context, version clusterversion.ClusterVersion, deps upgrade.TenantDeps, + ) error { + tenantStopperChannel <- struct{}{} + return nil + }), true + default: + panic("Unexpected version number observed.") + } + }, + }, + }, + Settings: settings, + } + return &tenantInfo{tenantArgs: &tenantArgs, + v2onMigrationStopper: v2onMigrationStopper} + } + + t.Run("upgrade tenant have it crash then resume", func(t *testing.T) { + // Create a tenant before upgrading anything and verify its version. 
+ const initialTenantID = 10 + tenantInfo := mkTenant(t, initialTenantID) + tenant, cleanup := startAndConnectToTenant(t, tenantInfo) + initialTenantRunner := sqlutils.MakeSQLRunner(tenant) + // Ensure that the tenant works. + initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", + [][]string{{v0.String()}}) + initialTenantRunner.Exec(t, "CREATE TABLE t (i INT PRIMARY KEY)") + initialTenantRunner.Exec(t, "INSERT INTO t VALUES (1), (2)") + // Use to wait for tenant crash leading to a clean up. + waitForTenantClose := make(chan struct{}) + // Cause the upgrade to crash on v1. + go func() { + <-tenantStopperChannel + tenant.Close() + tenantInfo.v2onMigrationStopper.Stop(ctx) + waitForTenantClose <- struct{}{} + }() + // Upgrade the host cluster to the latest version. + sqlutils.MakeSQLRunner(tc.ServerConn(0)).Exec(t, + "SET CLUSTER SETTING version = $1", + clusterversion.TestingBinaryVersion.String()) + // Ensure that the tenant still works. + initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) + // Upgrade the tenant cluster, but the upgrade will fail on v1. + initialTenantRunner.ExpectErr(t, + ".*(database is closed|failed to connect|closed network connection|upgrade failed due to transient SQL servers)+", + "SET CLUSTER SETTING version = $1", + v2.String()) + <-waitForTenantClose + cleanup() + tenantInfo = mkTenant(t, initialTenantID) + tenant, cleanup = startAndConnectToTenant(t, tenantInfo) + initialTenantRunner = sqlutils.MakeSQLRunner(tenant) + // Ensure that the tenant still works and the target + // version wasn't reached. + initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) + initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", + [][]string{{v1.String()}}) + + // Restart the tenant and ensure that the version is correct. 
+ cleanup() + tenantInfo.v2onMigrationStopper.Stop(ctx) + { + tenantInfo = mkTenant(t, initialTenantID) + tca, cleanup := startAndConnectToTenant(t, tenantInfo) + defer cleanup() + initialTenantRunner = sqlutils.MakeSQLRunner(tca) + } + // Keep trying to resume the stopper channel until the channel is closed, + // since we may repeatedly wait on it due to transaction retries. In + // the other case the stopper is used, so no such risk exists. + go func() { + for { + _, ok := <-tenantStopperChannel + if !ok { + return + } + } + }() + // Make sure that all shutdown SQL instance are gone before proceeding. + // We need to wait here because if we don't, the upgrade may hit + // errors because it's trying to bump the cluster version for a SQL + // instance which doesn't exist (i.e. the one that was restarted above). + initialTenantRunner.CheckQueryResultsRetry(t, + "SELECT count(*) FROM system.sql_instances WHERE session_id IS NOT NULL", [][]string{{"1"}}) + + // Upgrade the tenant cluster. + initialTenantRunner.Exec(t, + "SET CLUSTER SETTING version = $1", + v2.String()) + close(tenantStopperChannel) + // Validate the target version has been reached. 
+ initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) + initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", + [][]string{{v2.String()}}) + tenantInfo.v2onMigrationStopper.Stop(ctx) + }) +} diff --git a/pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/BUILD.bazel b/pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/BUILD.bazel new file mode 100644 index 000000000000..c8c2f0cc410c --- /dev/null +++ b/pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/BUILD.bazel @@ -0,0 +1,44 @@ +load("//build/bazelutil/unused_checker:unused.bzl", "get_x_data") +load("@io_bazel_rules_go//go:def.bzl", "go_test") + +go_test( + name = "upgradeinterlockccl_test", + size = "enormous", + srcs = [ + "main_test.go", + "tenant_upgrade_test.go", + ], + args = ["-test.timeout=3595s"], + tags = ["ccl_test"], + deps = [ + "//pkg/base", + "//pkg/ccl", + "//pkg/clusterversion", + "//pkg/jobs", + "//pkg/roachpb", + "//pkg/security/securityassets", + "//pkg/security/securitytest", + "//pkg/security/username", + "//pkg/server", + "//pkg/settings/cluster", + "//pkg/spanconfig", + "//pkg/sql", + "//pkg/sql/catalog/lease", + "//pkg/sql/sessiondatapb", + "//pkg/sql/sqlinstance/instancestorage", + "//pkg/sql/sqlliveness/slinstance", + "//pkg/sql/stats", + "//pkg/testutils/serverutils", + "//pkg/testutils/skip", + "//pkg/testutils/sqlutils", + "//pkg/testutils/testcluster", + "//pkg/upgrade/upgradebase", + "//pkg/util/leaktest", + "//pkg/util/log", + "//pkg/util/randutil", + "//pkg/util/stop", + "@com_github_stretchr_testify//require", + ], +) + +get_x_data(name = "get_x_data") diff --git a/pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/main_test.go b/pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/main_test.go new file mode 100644 index 000000000000..10e689bd72c6 --- /dev/null +++ b/pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/main_test.go @@ -0,0 +1,33 @@ +// Copyright 2023 The Cockroach Authors. 
+// +// Licensed as a CockroachDB Enterprise file under the Cockroach Community +// License (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt + +package upgradeinterlockccl_test + +import ( + "os" + "testing" + + "github.com/cockroachdb/cockroach/pkg/ccl" + "github.com/cockroachdb/cockroach/pkg/security/securityassets" + "github.com/cockroachdb/cockroach/pkg/security/securitytest" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" + "github.com/cockroachdb/cockroach/pkg/util/randutil" +) + +//go:generate ../../../util/leaktest/add-leaktest.sh *_test.go + +func TestMain(m *testing.M) { + defer ccl.TestingEnableEnterprise()() + securityassets.SetLoader(securitytest.EmbeddedAssets) + randutil.SeedForTests() + serverutils.InitTestServerFactory(server.TestServerFactory) + serverutils.InitTestClusterFactory(testcluster.TestClusterFactory) + os.Exit(m.Run()) +} diff --git a/pkg/ccl/kvccl/kvtenantccl/tenant_upgrade_test.go b/pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/tenant_upgrade_test.go similarity index 55% rename from pkg/ccl/kvccl/kvtenantccl/tenant_upgrade_test.go rename to pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/tenant_upgrade_test.go index 4c09994af654..6789b36064fc 100644 --- a/pkg/ccl/kvccl/kvtenantccl/tenant_upgrade_test.go +++ b/pkg/ccl/kvccl/kvtenantccl/upgradeinterlockccl/tenant_upgrade_test.go @@ -6,7 +6,7 @@ // // https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt -package kvtenantccl_test +package upgradeinterlockccl_test import ( "context" @@ -35,7 +35,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" - 
"github.com/cockroachdb/cockroach/pkg/upgrade" "github.com/cockroachdb/cockroach/pkg/upgrade/upgradebase" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -43,356 +42,6 @@ import ( "github.com/stretchr/testify/require" ) -// TestTenantUpgrade exercises the case where a system tenant is in a -// non-finalized version state and creates a tenant. The test ensures -// that the newly created tenant begins in that same version. -// -// The first subtest creates the tenant in the mixed version state, -// then upgrades the system tenant, then upgrades the secondary tenant, -// and ensures everything is happy. It then restarts the tenant and ensures -// that the cluster version is properly set. -// -// The second subtest creates a new tenant after the system tenant has been -// upgraded and ensures that it is created at the final cluster version. It -// also verifies that the version is correct after a restart -func TestTenantUpgrade(t *testing.T) { - defer leaktest.AfterTest(t)() - skip.WithIssue(t, 97076, "flaky test") - defer log.Scope(t).Close(t) - ctx := context.Background() - settings := cluster.MakeTestingClusterSettingsWithVersions( - clusterversion.TestingBinaryVersion, - clusterversion.TestingBinaryMinSupportedVersion, - false, // initializeVersion - ) - // Initialize the version to the BinaryMinSupportedVersion. - require.NoError(t, clusterversion.Initialize(ctx, - clusterversion.TestingBinaryMinSupportedVersion, &settings.SV)) - tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ - ServerArgs: base.TestServerArgs{ - // Test validates tenant behavior. No need for the default test - // tenant. 
- DefaultTestTenant: base.TestTenantDisabled, - Settings: settings, - Knobs: base.TestingKnobs{ - Server: &server.TestingKnobs{ - DisableAutomaticVersionUpgrade: make(chan struct{}), - BinaryVersionOverride: clusterversion.TestingBinaryMinSupportedVersion, - }, - }, - }, - }) - defer tc.Stopper().Stop(ctx) - - connectToTenant := func(t *testing.T, addr string) (_ *gosql.DB, cleanup func()) { - pgURL, cleanupPGUrl := sqlutils.PGUrl(t, addr, "Tenant", url.User(username.RootUser)) - tenantDB, err := gosql.Open("postgres", pgURL.String()) - require.NoError(t, err) - return tenantDB, func() { - tenantDB.Close() - cleanupPGUrl() - } - } - expectedInitialTenantVersion, _, _ := v0v1v2() - mkTenant := func(t *testing.T, id uint64) (tenantDB *gosql.DB, cleanup func()) { - settings := cluster.MakeTestingClusterSettingsWithVersions( - clusterversion.TestingBinaryVersion, - clusterversion.TestingBinaryMinSupportedVersion, - false, // initializeVersion - ) - // Initialize the version to the minimum it could be. - require.NoError(t, clusterversion.Initialize(ctx, - expectedInitialTenantVersion, &settings.SV)) - tenantArgs := base.TestTenantArgs{ - TenantID: roachpb.MustMakeTenantID(id), - TestingKnobs: base.TestingKnobs{}, - Settings: settings, - } - tenant, err := tc.Server(0).StartTenant(ctx, tenantArgs) - require.NoError(t, err) - return connectToTenant(t, tenant.SQLAddr()) - } - - t.Run("upgrade tenant", func(t *testing.T) { - // Create a tenant before upgrading anything and verify its version. - const initialTenantID = 10 - initialTenant, cleanup := mkTenant(t, initialTenantID) - initialTenantRunner := sqlutils.MakeSQLRunner(initialTenant) - - // Ensure that the tenant works. - initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", - [][]string{{expectedInitialTenantVersion.String()}}) - initialTenantRunner.Exec(t, "CREATE TABLE t (i INT PRIMARY KEY)") - initialTenantRunner.Exec(t, "INSERT INTO t VALUES (1), (2)") - - // Upgrade the host cluster. 
- sqlutils.MakeSQLRunner(tc.ServerConn(0)).Exec(t, - "SET CLUSTER SETTING version = $1", - clusterversion.TestingBinaryVersion.String()) - - // Ensure that the tenant still works. - initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) - - // Upgrade the tenant cluster. - initialTenantRunner.Exec(t, - "SET CLUSTER SETTING version = $1", - clusterversion.TestingBinaryVersion.String()) - - // Ensure that the tenant still works. - initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) - initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", - [][]string{{clusterversion.TestingBinaryVersion.String()}}) - - // Restart the tenant and ensure that the version is correct. - cleanup() - { - tenantServer, err := tc.Server(0).StartTenant(ctx, base.TestTenantArgs{ - TenantID: roachpb.MustMakeTenantID(initialTenantID), - }) - require.NoError(t, err) - initialTenant, cleanup = connectToTenant(t, tenantServer.SQLAddr()) - defer cleanup() - initialTenantRunner = sqlutils.MakeSQLRunner(initialTenant) - } - initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) - initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", - [][]string{{clusterversion.TestingBinaryVersion.String()}}) - }) - - t.Run("post-upgrade tenant", func(t *testing.T) { - // Create a new tenant and ensure it has the right version. - const postUpgradeTenantID = 11 - postUpgradeTenant, cleanup := mkTenant(t, postUpgradeTenantID) - sqlutils.MakeSQLRunner(postUpgradeTenant).CheckQueryResults(t, - "SHOW CLUSTER SETTING version", - [][]string{{clusterversion.TestingBinaryVersion.String()}}) - - // Restart the new tenant and ensure it has the right version. 
- cleanup() - { - tenantServer, err := tc.Server(0).StartTenant(ctx, base.TestTenantArgs{ - TenantID: roachpb.MustMakeTenantID(postUpgradeTenantID), - }) - require.NoError(t, err) - postUpgradeTenant, cleanup = connectToTenant(t, tenantServer.SQLAddr()) - defer cleanup() - } - sqlutils.MakeSQLRunner(postUpgradeTenant).CheckQueryResults(t, - "SHOW CLUSTER SETTING version", - [][]string{{clusterversion.TestingBinaryVersion.String()}}) - }) - -} - -// Returns three versions : -// - v0 corresponds to the bootstrapped version of the tenant, -// - v1, v2 correspond to adjacent releases. -func v0v1v2() (roachpb.Version, roachpb.Version, roachpb.Version) { - v0 := clusterversion.ByKey(clusterversion.V22_2) - v1 := clusterversion.TestingBinaryVersion - v2 := clusterversion.TestingBinaryVersion - if v1.Internal > 2 { - v1.Internal -= 2 - } else { - v2.Internal += 2 - } - return v0, v1, v2 -} - -// TestTenantUpgradeFailure exercises cases where the tenant dies -// between version upgrades. -func TestTenantUpgradeFailure(t *testing.T) { - defer leaktest.AfterTest(t)() - skip.WithIssue(t, 98555, "flaky test") - defer log.Scope(t).Close(t) - // Contains information for starting a tenant - // and maintaining a stopper. - type tenantInfo struct { - v2onMigrationStopper *stop.Stopper - tenantArgs *base.TestTenantArgs - } - v0, v1, v2 := v0v1v2() - ctx := context.Background() - settings := cluster.MakeTestingClusterSettingsWithVersions( - v2, - v0, - false, // initializeVersion - ) - // Initialize the version to the BinaryMinSupportedVersion. - tc := testcluster.StartTestCluster(t, 1, base.TestClusterArgs{ - ServerArgs: base.TestServerArgs{ - // Test validates tenant behavior. No need for the default test - // tenant here. 
- DefaultTestTenant: base.TestTenantDisabled, - Settings: settings, - Knobs: base.TestingKnobs{ - Server: &server.TestingKnobs{ - DisableAutomaticVersionUpgrade: make(chan struct{}), - BinaryVersionOverride: v0, - }, - }, - }, - }) - defer tc.Stopper().Stop(ctx) - - // Channel for stopping a tenant. - tenantStopperChannel := make(chan struct{}) - startAndConnectToTenant := func(t *testing.T, tenantInfo *tenantInfo) (_ *gosql.DB, cleanup func()) { - tenant, err := tc.Server(0).StartTenant(ctx, *tenantInfo.tenantArgs) - require.NoError(t, err) - pgURL, cleanupPGUrl := sqlutils.PGUrl(t, tenant.SQLAddr(), "Tenant", url.User(username.RootUser)) - tenantDB, err := gosql.Open("postgres", pgURL.String()) - require.NoError(t, err) - return tenantDB, func() { - tenantDB.Close() - cleanupPGUrl() - } - } - mkTenant := func(t *testing.T, id uint64) *tenantInfo { - settings := cluster.MakeTestingClusterSettingsWithVersions( - v2, - v0, - false, // initializeVersion - ) - // Shorten the reclaim loop so that terminated SQL servers don't block - // the upgrade from succeeding. - instancestorage.ReclaimLoopInterval.Override(ctx, &settings.SV, 250*time.Millisecond) - slinstance.DefaultTTL.Override(ctx, &settings.SV, 3*time.Second) - slinstance.DefaultHeartBeat.Override(ctx, &settings.SV, 500*time.Millisecond) - v2onMigrationStopper := stop.NewStopper() - // Initialize the version to the minimum it could be. - require.NoError(t, clusterversion.Initialize(ctx, - v0, &settings.SV)) - tenantArgs := base.TestTenantArgs{ - Stopper: v2onMigrationStopper, - TenantID: roachpb.MustMakeTenantID(id), - TestingKnobs: base.TestingKnobs{ - JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(), - // Disable the span config job so that it doesn't interfere with - // the upgrade interlock. 
- SpanConfig: &spanconfig.TestingKnobs{ - ManagerDisableJobCreation: true, - }, - UpgradeManager: &upgradebase.TestingKnobs{ - DontUseJobs: true, - ListBetweenOverride: func(from, to roachpb.Version) []roachpb.Version { - return []roachpb.Version{v1, v2} - }, - RegistryOverride: func(v roachpb.Version) (upgradebase.Upgrade, bool) { - switch v { - case v1: - return upgrade.NewTenantUpgrade("testing", - v1, - upgrade.NoPrecondition, - func( - ctx context.Context, version clusterversion.ClusterVersion, deps upgrade.TenantDeps, - ) error { - return nil - }), true - case v2: - return upgrade.NewTenantUpgrade("testing next", - v2, - upgrade.NoPrecondition, - func( - ctx context.Context, version clusterversion.ClusterVersion, deps upgrade.TenantDeps, - ) error { - tenantStopperChannel <- struct{}{} - return nil - }), true - default: - panic("Unexpected version number observed.") - } - }, - }, - }, - Settings: settings, - } - return &tenantInfo{tenantArgs: &tenantArgs, - v2onMigrationStopper: v2onMigrationStopper} - } - - t.Run("upgrade tenant have it crash then resume", func(t *testing.T) { - // Create a tenant before upgrading anything and verify its version. - const initialTenantID = 10 - tenantInfo := mkTenant(t, initialTenantID) - tenant, cleanup := startAndConnectToTenant(t, tenantInfo) - initialTenantRunner := sqlutils.MakeSQLRunner(tenant) - // Ensure that the tenant works. - initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", - [][]string{{v0.String()}}) - initialTenantRunner.Exec(t, "CREATE TABLE t (i INT PRIMARY KEY)") - initialTenantRunner.Exec(t, "INSERT INTO t VALUES (1), (2)") - // Use to wait for tenant crash leading to a clean up. - waitForTenantClose := make(chan struct{}) - // Cause the upgrade to crash on v1. - go func() { - <-tenantStopperChannel - tenant.Close() - tenantInfo.v2onMigrationStopper.Stop(ctx) - waitForTenantClose <- struct{}{} - }() - // Upgrade the host cluster to the latest version. 
- sqlutils.MakeSQLRunner(tc.ServerConn(0)).Exec(t, - "SET CLUSTER SETTING version = $1", - clusterversion.TestingBinaryVersion.String()) - // Ensure that the tenant still works. - initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) - // Upgrade the tenant cluster, but the upgrade will fail on v1. - initialTenantRunner.ExpectErr(t, - ".*(database is closed|failed to connect|closed network connection|upgrade failed due to transient SQL servers)+", - "SET CLUSTER SETTING version = $1", - v2.String()) - <-waitForTenantClose - cleanup() - tenantInfo = mkTenant(t, initialTenantID) - tenant, cleanup = startAndConnectToTenant(t, tenantInfo) - initialTenantRunner = sqlutils.MakeSQLRunner(tenant) - // Ensure that the tenant still works and the target - // version wasn't reached. - initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) - initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", - [][]string{{v1.String()}}) - - // Restart the tenant and ensure that the version is correct. - cleanup() - tenantInfo.v2onMigrationStopper.Stop(ctx) - { - tenantInfo = mkTenant(t, initialTenantID) - tca, cleanup := startAndConnectToTenant(t, tenantInfo) - defer cleanup() - initialTenantRunner = sqlutils.MakeSQLRunner(tca) - } - // Keep trying to resume the stopper channel until the channel is closed, - // since we may repeatedly wait on it due to transaction retries. In - // the other case the stopper is used, so no such risk exists. - go func() { - for { - _, ok := <-tenantStopperChannel - if !ok { - return - } - } - }() - // Make sure that all shutdown SQL instance are gone before proceeding. - // We need to wait here because if we don't, the upgrade may hit - // errors because it's trying to bump the cluster version for a SQL - // instance which doesn't exist (i.e. the one that was restarted above). 
- initialTenantRunner.CheckQueryResultsRetry(t, - "SELECT count(*) FROM system.sql_instances WHERE session_id IS NOT NULL", [][]string{{"1"}}) - - // Upgrade the tenant cluster. - initialTenantRunner.Exec(t, - "SET CLUSTER SETTING version = $1", - v2.String()) - close(tenantStopperChannel) - // Validate the target version has been reached. - initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) - initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", - [][]string{{v2.String()}}) - tenantInfo.v2onMigrationStopper.Stop(ctx) - }) -} - // TestTenantUpgradeInterlock validates the interlock between upgrading SQL // servers and other SQL servers which may be running (and/or in the process // of starting up). It runs with two SQL servers for the same tenant and starts @@ -403,13 +52,11 @@ func TestTenantUpgradeFailure(t *testing.T) { // is too low. func TestTenantUpgradeInterlock(t *testing.T) { defer leaktest.AfterTest(t)() - // Times out under stress race + // Times out under stress race. skip.UnderStressRace(t) - // Test takes 30s to run + // Test takes 100s+ to run. skip.UnderShort(t) - skip.WithIssue(t, 99593, "flaky test") - defer log.Scope(t).Close(t) ctx := context.Background() @@ -525,7 +172,14 @@ func TestTenantUpgradeInterlock(t *testing.T) { } runTest := func(t *testing.T, variant interlockTestVariant, test interlockTestConfig) { - t.Logf(`upgrade interlock test: running variant "%s", configuration: "%s"`, variants[variant], test.name) + logf := func(format string, args ...interface{}) { + t.Helper() + newArgs := make([]interface{}, len(args)+1) + newArgs[0] = t.Name() + copy(newArgs[1:], args) + t.Logf("(%s) "+format, newArgs...) + } + logf(`upgrade interlock test: running variant "%s", configuration: "%s"`, variants[variant], test.name) reachedChannel := make(chan struct{}) resumeChannel := make(chan struct{}) @@ -574,7 +228,7 @@ func TestTenantUpgradeInterlock(t *testing.T) { // to fail. 
The changes below in the other server setup will // ensure that the failed SQL server expires quickly. Here we // need to also ensure that it will be reclaimed quickly. - instancestorage.ReclaimLoopInterval.Override(ctx, &s.SV, 1000*time.Millisecond) + instancestorage.ReclaimLoopInterval.Override(ctx, &s.SV, 500*time.Millisecond) } reduceDefaultTTLAndHeartBeat := func(s *cluster.Settings) { @@ -602,6 +256,7 @@ func TestTenantUpgradeInterlock(t *testing.T) { DefaultTestTenant: base.TestTenantDisabled, Settings: settings, Knobs: base.TestingKnobs{ + JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(), SpanConfig: &spanconfig.TestingKnobs{ ManagerDisableJobCreation: true, }, @@ -650,6 +305,7 @@ func TestTenantUpgradeInterlock(t *testing.T) { return connectToTenant(t, tenant.SQLAddr()) } + logf("creating an initial tenant server") // Create a tenant before upgrading anything, and verify its // version. tenantID := serverutils.TestTenantID() @@ -657,23 +313,30 @@ func TestTenantUpgradeInterlock(t *testing.T) { defer cleanup() initialTenantRunner := sqlutils.MakeSQLRunner(tenant) + logf("verifying the tenant version") // Ensure that the tenant works. initialTenantRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", [][]string{{msv.String()}}) + logf("verifying basic SQL functionality") initialTenantRunner.Exec(t, "CREATE TABLE t (i INT PRIMARY KEY)") initialTenantRunner.Exec(t, "INSERT INTO t VALUES (1), (2)") initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) + logf("verifying the version of the storage cluster") // Validate that the host cluster is at the expected version, and // upgrade it to the binary version. 
hostClusterRunner := sqlutils.MakeSQLRunner(tc.ServerConn(0)) hostClusterRunner.CheckQueryResults(t, "SHOW CLUSTER SETTING version", [][]string{{msv.String()}}) + + logf("upgrading the storage cluster") hostClusterRunner.Exec(t, "SET CLUSTER SETTING version = $1", bv.String()) + logf("checking the tenant after the storage cluster upgrade") // Ensure that the tenant still works. initialTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) + logf("start upgrading the tenant") // Start upgrading the tenant. This call will pause at the specified // pause point, and once resumed (below, outside this function), // will complete the upgrade. @@ -728,7 +391,9 @@ func TestTenantUpgradeInterlock(t *testing.T) { // Wait until the upgrader reaches the pause point before // starting. <-reachedChannel + logf("upgrader is ready") + logf("starting another tenant server") // Now start a second SQL server for this tenant to see how the // two SQL servers interact. otherMsv := msv @@ -765,6 +430,7 @@ func TestTenantUpgradeInterlock(t *testing.T) { expectingStartupToFail := test.expStartupErr[variant] != "" numTenantsStr := "1" if expectingStartupToFail { + logf("shutting down the other tenant server") otherServerStopper.Stop(ctx) } else if otherServerStartError == nil { defer otherServer.Stopper().Stop(ctx) @@ -774,18 +440,22 @@ func TestTenantUpgradeInterlock(t *testing.T) { numTenantsStr = "2" } + logf("waiting for the instance table to get in the right state") // Based on the success or failure of the "other" SQL server startup, // we'll either have 1 or 2 SQL servers running at this point. Confirm // that we're in the desired state by querying the sql_instances table // directly. We do this before we continue to ensure that the upgrade // doesn't encounter any stale SQL instances. 
initialTenantRunner.CheckQueryResultsRetry(t, - "SELECT count(*) FROM SYSTEM.SQL_INSTANCES WHERE SESSION_ID IS NOT NULL", [][]string{{numTenantsStr}}) + "SELECT count(*) FROM system.sql_instances WHERE session_id IS NOT NULL", [][]string{{numTenantsStr}}) // With tenant started, resume the upgrade and wait for it to // complete. + logf("resuming upgrade") resumeChannel <- struct{}{} + logf("waiting for upgrade to complete") <-completedChannel + logf("upgrade completed") log.Infof(ctx, "continuing after upgrade completed") // Now that we've resumed the upgrade, process any errors. We must do @@ -804,7 +474,9 @@ func TestTenantUpgradeInterlock(t *testing.T) { if test.expStartupErr[variant] == "" { // Validate that the version is as expected, and that the // second sql pod still works. + logf("check the second server still works") otherTenantRunner.CheckQueryResults(t, "SELECT * FROM t", [][]string{{"1"}, {"2"}}) + logf("waiting for second server to reach target final version") otherTenantRunner.CheckQueryResultsRetry(t, "SHOW CLUSTER SETTING version", [][]string{{finalUpgradeVersion.String()}}) } @@ -812,10 +484,13 @@ func TestTenantUpgradeInterlock(t *testing.T) { for variant := range variants { variantName := variants[variant] - for i := range tests { - test := tests[i] - testName := variantName + "_" + test.name - t.Run(testName, func(t *testing.T) { runTest(t, variant, test) }) - } + t.Run(variantName, func(t *testing.T) { + for i := range tests { + test := tests[i] + t.Run(test.name, func(t *testing.T) { + runTest(t, variant, test) + }) + } + }) } }