From 6db9546833b24914188013fcf864ebb54c95fd3e Mon Sep 17 00:00:00 2001 From: Xiang Gu Date: Tue, 19 Dec 2023 14:15:47 -0500 Subject: [PATCH] tabledesc_test: deflake TestIndexDoesNotStorePrimaryKeyColumnMixedVersion Previously, this test would flake under `--stress` and the root cause is because the upgrade precondition check that it relies on actually checks for corrupt descriptor at AOST '-10s', and it's possible that the corrupt descriptor was injected within the last 10s to the system, and thus the check produced a "false negative" (meaning it saw there isn't no corruption at AOST '-10s' and mistakenly assumed there isn't corruption now) and thus mistakenly allowed the cluster upgrade to proceed. The AOST '-10s' trick in the upgrade was implemented to help with performance, and, for now, I think that outweighs the possibility of producing such false-negatives, so this fix instead modified the test such that it waits 10s after injecting the corrupt descriptor to ensure it will be seen by the time the upgrade logic runs. The downside that comes with this decision is this test will now take ~10-11s to run and we need to skip it under `--stress` (bc 1000 iterations / # of parallelism * 10s can still be larger than 1h, the default timeout for nightly stresses). Release note: None --- pkg/sql/catalog/tabledesc/BUILD.bazel | 2 +- .../tabledesc/validate_version_gating_test.go | 5 +++-- pkg/upgrade/upgrades/first_upgrade.go | 12 +++++++++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pkg/sql/catalog/tabledesc/BUILD.bazel b/pkg/sql/catalog/tabledesc/BUILD.bazel index 3a6d84eb3dc6..be197b510e64 100644 --- a/pkg/sql/catalog/tabledesc/BUILD.bazel +++ b/pkg/sql/catalog/tabledesc/BUILD.bazel @@ -118,8 +118,8 @@ go_test( "//pkg/sql/types", "//pkg/testutils", "//pkg/testutils/serverutils", - "//pkg/testutils/skip", "//pkg/testutils/sqlutils", + "//pkg/upgrade/upgrades", "//pkg/util/hlc", "//pkg/util/leaktest", "//pkg/util/log", diff --git a/pkg/sql/catalog/tabledesc/validate_version_gating_test.go b/pkg/sql/catalog/tabledesc/validate_version_gating_test.go index ec86e1703d9f..4e771902ce56 100644 --- a/pkg/sql/catalog/tabledesc/validate_version_gating_test.go +++ b/pkg/sql/catalog/tabledesc/validate_version_gating_test.go @@ -25,8 +25,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog/tabledesc" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" - "github.com/cockroachdb/cockroach/pkg/testutils/skip" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" + "github.com/cockroachdb/cockroach/pkg/upgrade/upgrades" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -54,7 +54,6 @@ import ( func TestIndexDoesNotStorePrimaryKeyColumnMixedVersion(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - skip.UnderStressRace(t) // Start a test cluster whose cluster version is MinSupported and auto-upgrade is // disabled. @@ -115,6 +114,8 @@ func TestIndexDoesNotStorePrimaryKeyColumnMixedVersion(t *testing.T) { // Assert cluster version upgrade is blocked. require.Equal(t, [][]string{{"1000023.1"}}, tdb.QueryStr(t, "SHOW CLUSTER SETTING version;")) + // disable AOST for upgrade precondition check to ensure it sees the injected, corrupt descriptor. + upgrades.TestingSetFirstUpgradePreconditionAOST(false) _, err := sqlDB.Exec(`SET CLUSTER SETTING version = $1`, clusterversion.Latest.String()) require.ErrorContains(t, err, `verifying precondition for version 1000023.1-upgrading-to-1000023.2-step-002: "".crdb_internal.invalid_objects is not empty`) } diff --git a/pkg/upgrade/upgrades/first_upgrade.go b/pkg/upgrade/upgrades/first_upgrade.go index 1505eb817023..5eae657305dd 100644 --- a/pkg/upgrade/upgrades/first_upgrade.go +++ b/pkg/upgrade/upgrades/first_upgrade.go @@ -97,6 +97,16 @@ func upgradeDescriptors( }) } +var firstUpgradePreconditionUsesAOST = true + +// TestingSetFirstUpgradePreconditionAOST allows tests to disable withAOST trick in upgrade +// precondition check, which otherwise can produce false negative. +func TestingSetFirstUpgradePreconditionAOST(enabled bool) func() { + old := firstUpgradePreconditionUsesAOST + firstUpgradePreconditionUsesAOST = enabled + return func() { firstUpgradePreconditionUsesAOST = old } +} + // FirstUpgradeFromReleasePrecondition is the precondition check for upgrading // from any supported major release. // @@ -118,7 +128,7 @@ func FirstUpgradeFromReleasePrecondition( // a diagnostic query. If no corruptions were found back then, we assume that // there are no corruptions now. Otherwise, we retry and do everything // without an AOST clause henceforth. - withAOST := true + withAOST := firstUpgradePreconditionUsesAOST diagnose := func(tbl string) (hasRows bool, err error) { q := fmt.Sprintf("SELECT count(*) FROM \"\".crdb_internal.%s", tbl) if withAOST {