Merge pull request #104921 from JeffSwenson/backport23.1-104821
release-23.1: upgrade: checkpoint fence version for multi-tenant clusters
jeffswenson authored Jun 15, 2023
2 parents d7157c2 + e9374af commit d76049d
Showing 3 changed files with 137 additions and 1 deletion.
5 changes: 5 additions & 0 deletions pkg/upgrade/upgrademanager/BUILD.bazel
@@ -49,24 +49,29 @@ go_test(
args = ["-test.timeout=295s"],
deps = [
"//pkg/base",
"//pkg/ccl/kvccl/kvtenantccl",
"//pkg/clusterversion",
"//pkg/jobs",
"//pkg/jobs/jobspb",
"//pkg/kv",
"//pkg/kv/kvserver/batcheval",
"//pkg/kv/kvserver/liveness",
"//pkg/roachpb",
"//pkg/security/securityassets",
"//pkg/security/securitytest",
"//pkg/server",
"//pkg/server/settingswatcher",
"//pkg/settings/cluster",
"//pkg/sql/execinfra",
"//pkg/sql/isql",
"//pkg/sql/sem/eval",
"//pkg/testutils",
"//pkg/testutils/serverutils",
"//pkg/testutils/sqlutils",
"//pkg/testutils/testcluster",
"//pkg/upgrade",
"//pkg/upgrade/upgradebase",
"//pkg/upgrade/upgrades",
"//pkg/util",
"//pkg/util/leaktest",
"//pkg/util/log",
2 changes: 1 addition & 1 deletion pkg/upgrade/upgrademanager/manager.go
@@ -544,7 +544,7 @@ func (m *Manager) Migrate(
if mustPersistFenceVersion {
var err error
for {
-   err = updateSystemVersionSetting(ctx, cv, validate)
+   err = updateSystemVersionSetting(ctx, fenceVersion, validate)
if errors.Is(err, upgradecluster.InconsistentSQLServersError) {
if err := bumpClusterVersion(ctx, m.deps.Cluster, fenceVersion); err != nil {
return err
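
The substance of the backport is the one-line change above: when checkpointing finalization progress for multi-tenant clusters, Migrate was persisting the target version cv rather than the fence version, so the stored version setting could run ahead of the upgrade's actual progress if a migration failed. The sketch below is a minimal, self-contained illustration of the ordering the fix enforces; the helper names (migrateOne, persistVersionSetting, runMigration) are illustrative stand-ins, not the real upgrademanager API.

package main

import "fmt"

type version string

// persistVersionSetting stands in for updateSystemVersionSetting: it records
// the version in the tenant-readable settings storage.
func persistVersionSetting(v version) { fmt.Println("persisted setting:", v) }

// runMigration stands in for the per-version upgrade step; here it always fails.
func runMigration(v version) error { return fmt.Errorf("migration for %s failed", v) }

func migrateOne(target version) error {
	fence := target + "-fence"
	// Checkpoint the fence version first. If the migration below fails or the
	// process restarts, the persisted setting says "finalization in progress",
	// not "finalization complete".
	persistVersionSetting(fence)
	if err := runMigration(target); err != nil {
		// The persisted setting is still the fence version; TestMigrationFailure
		// below asserts that the tenant stays at the fence version after each
		// failed finalization attempt.
		return err
	}
	// Only once the migration succeeds is the final version persisted.
	persistVersionSetting(target)
	return nil
}

func main() {
	if err := migrateOne("v23.1-8"); err != nil {
		fmt.Println("upgrade stopped:", err)
	}
}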
131 changes: 131 additions & 0 deletions pkg/upgrade/upgrademanager/manager_external_test.go
@@ -14,27 +14,34 @@ import (
"context"
gosql "database/sql"
"fmt"
"math/rand"
"sort"
"sync/atomic"
"testing"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
_ "github.com/cockroachdb/cockroach/pkg/ccl/kvccl/kvtenantccl"
"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/kv"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval"
"github.com/cockroachdb/cockroach/pkg/kv/kvserver/liveness"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/server"
"github.com/cockroachdb/cockroach/pkg/server/settingswatcher"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/sql/execinfra"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/sql/sem/eval"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/upgrade"
"github.com/cockroachdb/cockroach/pkg/upgrade/upgradebase"
"github.com/cockroachdb/cockroach/pkg/upgrade/upgrades"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -618,6 +625,7 @@ func TestPrecondition(t *testing.T) {
require.Equalf(t, exp, got, "server %d", i)
}
}

defer tc.Stopper().Stop(ctx)
sqlDB := tc.ServerConn(0)
{
@@ -662,3 +670,126 @@ func TestPrecondition(t *testing.T) {
checkActiveVersion(t, v2)
}
}

func TestMigrationFailure(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ctx := context.Background()

// Configure the range of versions used by the test.
startVersionKey := clusterversion.BinaryMinSupportedVersionKey
startVersion := clusterversion.ByKey(startVersionKey)
endVersionKey := clusterversion.BinaryVersionKey
endVersion := clusterversion.ByKey(endVersionKey)

// Pick a random version to fail at.
versions := clusterversion.ListBetween(startVersion, endVersion)
failVersion := versions[rand.Intn(len(versions))]
fenceVersion := upgrade.FenceVersionFor(ctx, clusterversion.ClusterVersion{Version: failVersion}).Version
t.Logf("test will fail at version: %s", failVersion.String())

// Create a storage cluster for the tenant
testCluster := serverutils.StartNewTestCluster(t, 1, base.TestClusterArgs{
ServerArgs: base.TestServerArgs{
DisableDefaultTestTenant: true,
Knobs: base.TestingKnobs{
SQLEvalContext: &eval.TestingKnobs{
TenantLogicalVersionKeyOverride: startVersionKey,
},
},
},
})
defer testCluster.Stopper().Stop(ctx)

// Set the version override so that the tenant is able to upgrade. If this is
// not set, the tenant treats the storage cluster as if it had the oldest
// supported binary version.
s := testCluster.Server(0)
goDB := serverutils.OpenDBConn(t, s.ServingSQLAddr(), "system", false, s.Stopper())
_, err := goDB.Exec(`ALTER TENANT ALL SET CLUSTER SETTING version = $1`, endVersion.String())
require.NoError(t, err)

// Setting failUpgrade to false disables the upgrade error logic.
var failUpgrade atomic.Bool
failUpgrade.Store(true)

// Create a tenant cluster at the oldest supported binary version. Configure
// the upgrade manager to fail the upgrade at a randomly chosen version.
tenantSettings := cluster.MakeTestingClusterSettingsWithVersions(
endVersion,
startVersion,
false,
)
require.NoError(t, clusterversion.Initialize(ctx, startVersion, &tenantSettings.SV))
tenant, db := serverutils.StartTenant(t, testCluster.Server(0), base.TestTenantArgs{
TenantID: roachpb.MustMakeTenantID(10),
Settings: tenantSettings,
TestingKnobs: base.TestingKnobs{
Server: &server.TestingKnobs{
DisableAutomaticVersionUpgrade: make(chan struct{}),
BootstrapVersionKeyOverride: startVersionKey,
BinaryVersionOverride: startVersion,
},
UpgradeManager: &upgradebase.TestingKnobs{
DontUseJobs: true,
RegistryOverride: func(cv roachpb.Version) (upgradebase.Upgrade, bool) {
if failUpgrade.Load() && cv == failVersion {
errorUpgrade := func(ctx context.Context, version clusterversion.ClusterVersion, deps upgrade.TenantDeps) error {
return errors.New("the upgrade failed with some error!")
}
return upgrade.NewTenantUpgrade("test", cv, nil, errorUpgrade), true
}
return upgrades.GetUpgrade(cv)
},
},
},
})

// Wait for the storage cluster version to propagate.
watcher := tenant.SettingsWatcher().(*settingswatcher.SettingsWatcher)
testutils.SucceedsSoon(t, func() error {
storageVersion := watcher.GetStorageClusterActiveVersion()
if !storageVersion.IsActive(endVersionKey) {
return errors.Newf("expected storage version of at least %s found %s", endVersion.PrettyPrint(), storageVersion.PrettyPrint())
}
return nil
})

checkActiveVersion := func(t *testing.T, exp roachpb.Version) {
got := tenant.ClusterSettings().Version.ActiveVersion(ctx).Version
require.Equal(t, exp, got)
}
checkSettingVersion := func(t *testing.T, exp roachpb.Version) {
var settingVersion clusterversion.ClusterVersion
require.NoError(t, tenant.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
var err error
settingVersion, err = watcher.GetClusterVersionFromStorage(ctx, txn)
return err
}))
require.Equal(t, exp, settingVersion.Version)
}

checkActiveVersion(t, startVersion)
checkSettingVersion(t, startVersion)

// Try to finalize
_, err = db.Exec(`SET CLUSTER SETTING version = $1`, endVersion.String())
require.Error(t, err)
checkActiveVersion(t, fenceVersion)
// Note: we don't check the setting version here because the fence setting
// is only materialized on the first attempt.

// Try to finalize again.
_, err = db.Exec(`SET CLUSTER SETTING version = $1`, endVersion.String())
require.Error(t, err)
checkActiveVersion(t, fenceVersion)
checkSettingVersion(t, fenceVersion)

// Try to finalize one last time, allowing the upgrade to succeed.
failUpgrade.Store(false)
_, err = db.Exec(`SET CLUSTER SETTING version = $1`, endVersion.String())
require.NoError(t, err)
checkActiveVersion(t, endVersion)
checkSettingVersion(t, endVersion)
}
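
The test drives three finalization attempts: the first two fail at the randomly chosen version and leave the tenant's active version pinned at the corresponding fence version (with the persisted setting also checked on the second attempt), and the third succeeds once failUpgrade is cleared, bringing the tenant to the final version. In a checkout where plain Go tooling works (the repository's own dev/Bazel wrappers are the usual route), the test can be targeted with: go test ./pkg/upgrade/upgrademanager/ -run TestMigrationFailure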
