From 5cc8f110abafb772434d6b0a2c319a0d4ae0829a Mon Sep 17 00:00:00 2001 From: Raphael 'kena' Poss Date: Fri, 30 Jun 2023 00:10:48 +0200 Subject: [PATCH 1/3] upgrades: fix the write of 'version' to system.tenant_settings Prior to this patch, a bogus value was written to the tenant_id column for the 'version' override. This patch fixes it. We also suspect this will help with the multitenant-upgrade roachtest but this will be checked in a later step. Release note: None Co-authored-by: healthy-pod --- .../logictestccl/testdata/logic_test/tenant_settings | 9 ++++++--- pkg/server/tenantsettingswatcher/BUILD.bazel | 1 + pkg/server/tenantsettingswatcher/watcher_test.go | 4 ++++ pkg/sql/logictest/testdata/logic_test/tenant | 10 ++++++---- pkg/upgrade/upgrades/permanent_upgrades.go | 5 +++-- 5 files changed, 20 insertions(+), 9 deletions(-) diff --git a/pkg/ccl/logictestccl/testdata/logic_test/tenant_settings b/pkg/ccl/logictestccl/testdata/logic_test/tenant_settings index 7f7760235861..adcb30f96693 100644 --- a/pkg/ccl/logictestccl/testdata/logic_test/tenant_settings +++ b/pkg/ccl/logictestccl/testdata/logic_test/tenant_settings @@ -8,10 +8,13 @@ true user host-cluster-root # Ensure that the version is set the same in the system tenant and in the tenant override. -query I -select count(*) from system.settings JOIN system.tenant_settings on tenant_settings.name = settings.name where tenant_settings.name = 'version' and tenant_settings.value = settings.value; +query TI rowsort +SELECT settings.name, tenant_id +FROM system.settings +JOIN system.tenant_settings ON tenant_settings.name = settings.name +WHERE tenant_settings.name = 'version' AND tenant_settings.value = settings.value; ---- -1 +version 0 statement ok ALTER TENANT [10] SET CLUSTER SETTING sql.notices.enabled = false diff --git a/pkg/server/tenantsettingswatcher/BUILD.bazel b/pkg/server/tenantsettingswatcher/BUILD.bazel index 0308514f98fb..85fe7c877e84 100644 --- a/pkg/server/tenantsettingswatcher/BUILD.bazel +++ b/pkg/server/tenantsettingswatcher/BUILD.bazel @@ -47,6 +47,7 @@ go_test( embed = [":tenantsettingswatcher"], deps = [ "//pkg/base", + "//pkg/clusterversion", "//pkg/keys", "//pkg/kv/kvpb", "//pkg/roachpb", diff --git a/pkg/server/tenantsettingswatcher/watcher_test.go b/pkg/server/tenantsettingswatcher/watcher_test.go index 2ad6864073be..60d30228f602 100644 --- a/pkg/server/tenantsettingswatcher/watcher_test.go +++ b/pkg/server/tenantsettingswatcher/watcher_test.go @@ -18,6 +18,7 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/clusterversion" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/server/tenantsettingswatcher" @@ -60,6 +61,9 @@ func TestWatcher(t *testing.T) { t.Helper() var vals []string for _, s := range overrides { + if s.Name == clusterversion.KeyVersionSetting { + continue + } vals = append(vals, fmt.Sprintf("%s=%s", s.Name, s.Value.Value)) } if actual := strings.Join(vals, " "); actual != expected { diff --git a/pkg/sql/logictest/testdata/logic_test/tenant b/pkg/sql/logictest/testdata/logic_test/tenant index 10f25a72452b..80f19e3c5a8c 100644 --- a/pkg/sql/logictest/testdata/logic_test/tenant +++ b/pkg/sql/logictest/testdata/logic_test/tenant @@ -415,11 +415,12 @@ EXCEPT SELECT capability_name, capability_value FROM [SHOW TENANT othertenant WI ---- # Check that the setting overrides were copied. -query TTTT -SELECT variable, value, type, origin FROM [SHOW CLUSTER SETTINGS FOR TENANT othertenant] +query TTTT rowsort +SELECT variable, IF(variable='version','--',value), type, origin FROM [SHOW CLUSTER SETTINGS FOR TENANT othertenant] WHERE origin != 'no-override' ---- trace.debug.enable true b per-tenant-override +version -- m all-tenants-override # Check that the resource usage parameters were copied. query IIRRRRI @@ -463,11 +464,12 @@ EXCEPT SELECT capability_name, capability_value FROM [SHOW TENANT othertenant WI ---- # Check the setting overrides were taken over. -query TTTT -SELECT variable, value, type, origin FROM [SHOW CLUSTER SETTINGS FOR TENANT othertenant] +query TTTT rowsort +SELECT variable, IF(variable='version','--',value), type, origin FROM [SHOW CLUSTER SETTINGS FOR TENANT othertenant] WHERE origin != 'no-override' ---- trace.debug.enable true b per-tenant-override +version -- m all-tenants-override # Check that the resource usage parameters were copied. query IIRRRRI diff --git a/pkg/upgrade/upgrades/permanent_upgrades.go b/pkg/upgrade/upgrades/permanent_upgrades.go index e492cbb20306..853c855d1c77 100644 --- a/pkg/upgrade/upgrades/permanent_upgrades.go +++ b/pkg/upgrade/upgrades/permanent_upgrades.go @@ -135,11 +135,12 @@ func populateVersionSetting( // Tenant ID 0 indicates that we're overriding the value for all // tenants. - tenantID := tree.NewDInt(0) _, err = ie.Exec( ctx, "insert-setting", nil, /* txn */ - fmt.Sprintf(`INSERT INTO system.tenant_settings (tenant_id, name, value, "last_updated", "value_type") VALUES (%d, 'version', x'%x', now(), 'm') ON CONFLICT(tenant_id, name) DO NOTHING`, tenantID, b), + fmt.Sprintf(` +INSERT INTO system.tenant_settings (tenant_id, name, value, last_updated, value_type) +VALUES (0, 'version', x'%x', now(), 'm') ON CONFLICT(tenant_id, name) DO NOTHING`, b), ) if err != nil { return err From 8777bf6d0e346dfb043064117e036d039dedf95f Mon Sep 17 00:00:00 2001 From: Renato Costa Date: Thu, 6 Jul 2023 16:53:50 -0400 Subject: [PATCH 2/3] roachprod: fix leaky goroutine in `SyncedCluster.Monitor` The `Monitor` function would leak a goroutine per node waiting for context cancelation even after the monitor loop had already exited (for example, when a `OneShot` monitor check was requested). This commit updates that function so that we use a cancelable context derived from the context passed as argument; when the monitor loop exits, we cancel that context, which causes the leaky goroutine to terminate appropriately. This leaked goroutine shows up in the newly introduced roachtest `leaktest`, where there is one leaked goroutine per node in the cluster created by the test; the monitor is created during the `assertNoDeadNode` post-test assertion. Epic: none Release note: None --- pkg/roachprod/install/cluster_synced.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/roachprod/install/cluster_synced.go b/pkg/roachprod/install/cluster_synced.go index 1a6e4fa5260c..c9916a9fce7a 100644 --- a/pkg/roachprod/install/cluster_synced.go +++ b/pkg/roachprod/install/cluster_synced.go @@ -638,6 +638,7 @@ func (c *SyncedCluster) Monitor( ch := make(chan NodeMonitorInfo) nodes := c.TargetNodes() var wg sync.WaitGroup + monitorCtx, cancel := context.WithCancel(ctx) for i := range nodes { wg.Add(1) @@ -758,9 +759,10 @@ done return } - // Watch for context cancellation. + // Watch for context cancellation, which can happen either if + // the test fails, or if the monitor loop exits. go func() { - <-ctx.Done() + <-monitorCtx.Done() sess.Close() }() @@ -776,6 +778,7 @@ done } go func() { wg.Wait() + cancel() close(ch) }() From f9d14096192ef166525850329c3d775389806245 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 7 Jul 2023 11:43:14 +0000 Subject: [PATCH 3/3] kvserver: unskip and deflake `TestShowTraceReplica` This test asserts on replica placement. By default, the replicate queue can take up to 10 minutes to execute config changes, which can fail the test. This patch speeds up the replicate queue. It also skips the test under stressrace and deadlock, since it is timing sensitive. Epic: none Release note: None --- pkg/sql/show_trace_replica_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/sql/show_trace_replica_test.go b/pkg/sql/show_trace_replica_test.go index ff44ce3cc59c..7ce0973f4a18 100644 --- a/pkg/sql/show_trace_replica_test.go +++ b/pkg/sql/show_trace_replica_test.go @@ -15,6 +15,7 @@ import ( "fmt" "reflect" "testing" + "time" "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/config/zonepb" @@ -34,7 +35,8 @@ func TestShowTraceReplica(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) - skip.WithIssue(t, 98598) + skip.UnderStressRace(t) // too slow + skip.UnderDeadlock(t) // too slow const numNodes = 4 @@ -44,6 +46,7 @@ func TestShowTraceReplica(t *testing.T) { ctx := context.Background() tsArgs := func(node string) base.TestServerArgs { return base.TestServerArgs{ + ScanMaxIdleTime: 10 * time.Millisecond, // speed up replicate queue Knobs: base.TestingKnobs{ Server: &server.TestingKnobs{ DefaultZoneConfigOverride: &zoneConfig,