From 247f05ee8a866fcd40b1a0e3cf1f713e32892de3 Mon Sep 17 00:00:00 2001
From: Yahor Yuzefovich
Date: Wed, 21 Jun 2023 18:51:36 -0700
Subject: [PATCH 1/4] sqlinstance: remove unused error

The only usages of this error were removed in
ea7fffb138e44b28f9c37fb02a7d13df7fec7782.

Epic: None

Release note: None
---
 pkg/sql/sqlinstance/sqlinstance.go | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pkg/sql/sqlinstance/sqlinstance.go b/pkg/sql/sqlinstance/sqlinstance.go
index e82f2320bd2e..0dbfe248d80d 100644
--- a/pkg/sql/sqlinstance/sqlinstance.go
+++ b/pkg/sql/sqlinstance/sqlinstance.go
@@ -72,7 +72,3 @@ type AddressResolver interface {
 
 // NonExistentInstanceError can be returned if a SQL instance does not exist.
 var NonExistentInstanceError = errors.Errorf("non existent SQL instance")
-
-// NotASQLInstanceError can be returned if a function is is not supported for
-// non-SQL instances.
-var NotASQLInstanceError = errors.Errorf("not supported for non-SQL instance")

From bdda6a0db99e53167314b31fcb36a95175e3e6b3 Mon Sep 17 00:00:00 2001
From: Yahor Yuzefovich
Date: Tue, 20 Jun 2023 20:58:47 -0700
Subject: [PATCH 2/4] sql: adjust DistSQL physical planning in multi-tenant mode

This commit removes the logic that might reassign some spans to the
gateway that can be applied in the multi-tenant deployment mode. This
logic was introduced in #80353 with the following
[rationale](https://github.com/cockroachdb/cockroach/pull/80353#pullrequestreview-961275076):
```
This will probably go away once we support locality-aware distribution,
at least in this form. There's two reasons why I did this right now:
1. It avoids an extra hop (KV->assignee->gateway->client vs
KV->gateway->client), and this is an optimization we sometimes do in the
non-MT code path though at a later stage,
2. It makes the assignments and distribution type deterministic in
testing when we expect to assign to only a single pod.
```
Since then, locality-aware planning has been implemented (addressing the
first half of point 1). Also, the second half of point 1 applies to both
single-tenant and multi-tenant modes (`maybeMoveSingleFlowToGateway`).
Point 2 is a bit unfortunate, and I'm not sure what to do about it yet
(we'll need to figure it out if we ever make separate-process
multi-tenant the default mode for running tests that check DistSQL
planning). For now, only a single test needed an adjustment to make it
deterministic.

Additionally, running TPCH queries experimentally has shown that this
reassigning of a single TableReader can make the query latency
significantly different (sometimes reducing it, sometimes increasing it;
see
[here](https://github.com/cockroachdb/cockroach/issues/104379#issuecomment-1599208200)).

All of these reasons suggest that we should just get rid of this logic
to unify the DistSQL physical planner more between single-tenant and
multi-tenant modes.
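For illustration, here is a minimal self-contained sketch of the
heuristic being removed (toy types and names, not the actual
`maybeReassignToGatewaySQLInstance` implementation shown in the diff
below):

```go
package main

import "fmt"

// instanceID and spanPartition are toy stand-ins for the planner types.
type instanceID int

type spanPartition struct {
	instance instanceID
	spans    []string
}

// reassignToGateway mirrors the removed heuristic: if exactly one partition
// was produced, it is not on the gateway, and its instance lives in the same
// region as the gateway, move the spans to the gateway in order to avoid the
// extra KV->assignee->gateway hop.
func reassignToGateway(parts []spanPartition, gateway instanceID, regionOf map[instanceID]string) {
	if len(parts) != 1 || parts[0].instance == gateway {
		return
	}
	if regionOf[parts[0].instance] == regionOf[gateway] {
		parts[0].instance = gateway
	}
}

func main() {
	regions := map[instanceID]string{1: "us-east1", 2: "us-east1"}
	parts := []spanPartition{{instance: 2, spans: []string{"span-a", "span-b"}}}
	reassignToGateway(parts, 1 /* gateway */, regions)
	fmt.Println(parts) // the single partition now targets the gateway (instance 1)
}
```

With this heuristic gone, the partitioning produced by `partitionSpans`
is no longer special-cased for multi-tenant deployments.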
Release note: None --- pkg/sql/distsql_physical_planner.go | 80 +++---------------- .../exec/execbuilder/testdata/distsql_tenant | 17 ++-- 2 files changed, 21 insertions(+), 76 deletions(-) diff --git a/pkg/sql/distsql_physical_planner.go b/pkg/sql/distsql_physical_planner.go index 36775d996a9e..a8a185bb825a 100644 --- a/pkg/sql/distsql_physical_planner.go +++ b/pkg/sql/distsql_physical_planner.go @@ -1327,7 +1327,7 @@ func (dsp *DistSQLPlanner) deprecatedPartitionSpansSystem( func (dsp *DistSQLPlanner) partitionSpans( ctx context.Context, planCtx *PlanningCtx, spans roachpb.Spans, ) (partitions []SpanPartition, ignoreMisplannedRanges bool, _ error) { - resolver, instances, err := dsp.makeInstanceResolver(ctx, planCtx.localityFilter) + resolver, err := dsp.makeInstanceResolver(ctx, planCtx.localityFilter) if err != nil { return nil, false, err } @@ -1356,11 +1356,6 @@ func (dsp *DistSQLPlanner) partitionSpans( return nil, false, err } } - if planCtx.localityFilter.Empty() { - if err = dsp.maybeReassignToGatewaySQLInstance(partitions, instances); err != nil { - return nil, false, err - } - } return partitions, ignoreMisplannedRanges, nil } @@ -1395,29 +1390,24 @@ func instanceIDForKVNodeHostedInstance(nodeID roachpb.NodeID) base.SQLInstanceID } // makeInstanceResolver returns a function that can choose the SQL instance ID -// for a provided KV node ID. It also returns a list of all healthy instances if -// that list was used in choosing an instance, specifically if the localities of -// those instances were used to decide the assignment, for use by any steps that -// wish to post-process that assignment (such as adjusting based on localities). -// If the instance was assigned statically or the instance list had no locality -// information leading to random assignments then no instance list is returned. +// for a provided KV node ID. func (dsp *DistSQLPlanner) makeInstanceResolver( ctx context.Context, locFilter roachpb.Locality, -) (func(roachpb.NodeID) base.SQLInstanceID, []sqlinstance.InstanceInfo, error) { +) (func(roachpb.NodeID) base.SQLInstanceID, error) { _, mixedProcessMode := dsp.distSQLSrv.NodeID.OptionalNodeID() if mixedProcessMode && locFilter.Empty() { - return instanceIDForKVNodeHostedInstance, nil, nil + return instanceIDForKVNodeHostedInstance, nil } // GetAllInstances only returns healthy instances. // TODO(yuzefovich): confirm that all instances are of compatible version. 
instances, err := dsp.sqlAddressResolver.GetAllInstances(ctx) if err != nil { - return nil, nil, err + return nil, err } if len(instances) == 0 { - return nil, nil, errors.New("no healthy sql instances available for planning") + return nil, errors.New("no healthy sql instances available for planning") } rng, _ := randutil.NewPseudoRand() @@ -1436,7 +1426,7 @@ func (dsp *DistSQLPlanner) makeInstanceResolver( } } if len(eligible) == 0 { - return nil, nil, errors.New("no healthy sql instances available matching locality requirement") + return nil, errors.New("no healthy sql instances available matching locality requirement") } instances = eligible instancesHaveLocality = true @@ -1487,7 +1477,7 @@ func (dsp *DistSQLPlanner) makeInstanceResolver( } return instances[rng.Intn(len(instances))].InstanceID } - return resolver, instances, nil + return resolver, nil } // If no sql instances have locality information, fallback to a naive @@ -1503,7 +1493,7 @@ func (dsp *DistSQLPlanner) makeInstanceResolver( i++ return id } - return resolver, nil, nil + return resolver, nil } // closestInstances returns the subset of instances which are closest to the @@ -1526,56 +1516,6 @@ func closestInstances( return res } -// maybeReassignToGatewaySQLInstance checks whether the span partitioning is -// such that it contains only a single SQL instance that is different from the -// gateway, yet the gateway instance is in the same region as the assigned one. -// If that is the case, then all spans are reassigned to the gateway instance in -// order to avoid an extra hop needed when setting up the distributed plan. If -// the locality information isn't available for the instances, then we assume -// the assigned instance to be in the same region as the gateway. -func (dsp *DistSQLPlanner) maybeReassignToGatewaySQLInstance( - partitions []SpanPartition, instances []sqlinstance.InstanceInfo, -) error { - if len(partitions) != 1 || partitions[0].SQLInstanceID == dsp.gatewaySQLInstanceID { - // Keep the existing partitioning if more than one instance is used or - // the gateway is already used as the single instance. - return nil - } - var gatewayRegion, assignedRegion string - if len(instances) > 0 { - assignedInstance := partitions[0].SQLInstanceID - var ok bool - for _, instance := range instances { - if instance.InstanceID == dsp.gatewaySQLInstanceID { - gatewayRegion, ok = instance.Locality.Find("region") - if !ok { - // If we can't determine the region of the gateway, keep the - // spans assigned to the other instance. - break - } - } else if instance.InstanceID == assignedInstance { - assignedRegion, ok = instance.Locality.Find("region") - if !ok { - // We couldn't determine the region of the assigned instance - // but it shouldn't be possible since we wouldn't have used - // the instance in the planning (since we wouldn't include - // it into regionToSQLInstanceIDs map in - // makeSQLInstanceIDForKVNodeIDTenantResolver). - return errors.AssertionFailedf( - "unexpectedly planned all spans on a SQL instance %s "+ - "which we could not find region for", instance, - ) - } - } - } - } - - if gatewayRegion == assignedRegion { - partitions[0].SQLInstanceID = dsp.gatewaySQLInstanceID - } - return nil -} - // getInstanceIDForScan retrieves the SQL Instance ID where the single table // reader should reside for a limited scan. Ideally this is the lease holder for // the first range in the specified spans. 
But if that node is unhealthy or @@ -1609,7 +1549,7 @@ func (dsp *DistSQLPlanner) getInstanceIDForScan( if dsp.useGossipPlanning(ctx, planCtx) && planCtx.localityFilter.Empty() { return dsp.deprecatedSQLInstanceIDForKVNodeIDSystem(ctx, planCtx, replDesc.NodeID), nil } - resolver, _, err := dsp.makeInstanceResolver(ctx, planCtx.localityFilter) + resolver, err := dsp.makeInstanceResolver(ctx, planCtx.localityFilter) if err != nil { return 0, err } diff --git a/pkg/sql/opt/exec/execbuilder/testdata/distsql_tenant b/pkg/sql/opt/exec/execbuilder/testdata/distsql_tenant index 5abe3ca50d1c..3e77144339bf 100644 --- a/pkg/sql/opt/exec/execbuilder/testdata/distsql_tenant +++ b/pkg/sql/opt/exec/execbuilder/testdata/distsql_tenant @@ -1,5 +1,5 @@ # LogicTest: 3node-tenant -# tenant-cluster-setting-override-opt: sql.zone_configs.allow_for_secondary_tenant.enabled=true sql.multi_region.allow_abstractions_for_secondary_tenants.enabled=true +# tenant-cluster-setting-override-opt: sql.zone_configs.allow_for_secondary_tenant.enabled=true sql.multi_region.allow_abstractions_for_secondary_tenants.enabled=true sql.split_at.allow_for_secondary_tenant.enabled=true statement ok CREATE TABLE tbl1 (a INT PRIMARY KEY, b INT) @@ -22,15 +22,20 @@ CREATE TABLE tbl2 (k INT PRIMARY KEY, v INT, w INT, x INT, FAMILY fam_2 (v, w) ) -# This should be a local query, even though it uses two spans due to the column -# family configuration. +# Currently, the DistSQL planner uses all available SQL instances (when they +# have no locality information) in naive round-robin fashion. In order to make +# the output deterministic, we ensure that 3 ranges need to be scanned by the +# query so that each of the three instances would get 1 range. +statement ok +ALTER TABLE tbl2 SPLIT AT VALUES (2), (3) + query T -EXPLAIN SELECT v, w FROM tbl2 WHERE k = 23 +EXPLAIN SELECT v, w FROM tbl2 WHERE k <= 3 ---- -distribution: local +distribution: full vectorized: true · • scan missing stats table: tbl2@tbl2_pkey - spans: [/23 - /23] + spans: [ - /3] From 75a6f1fa3d08227ed3804152cf57b37ba980ad77 Mon Sep 17 00:00:00 2001 From: Nick Vigilante Date: Thu, 29 Jun 2023 10:57:21 -0400 Subject: [PATCH 3/4] docs-issue-generation: add related PRs link to product change issue body Fixes DEVINF-447 Release note: None --- .../docs_issue_generation.go | 9 ++ .../docs_issue_generation_test.go | 82 +++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/pkg/cmd/docs-issue-generation/docs_issue_generation.go b/pkg/cmd/docs-issue-generation/docs_issue_generation.go index 95c5a3913cc6..d3aa2e19e227 100644 --- a/pkg/cmd/docs-issue-generation/docs_issue_generation.go +++ b/pkg/cmd/docs-issue-generation/docs_issue_generation.go @@ -867,6 +867,15 @@ func formatReleaseNotes( ) rnBodySlice = append(rnBodySlice, strings.TrimSuffix(rnBody, "\n")) } + if len(rnBodySlice) > 1 { + relatedProductChanges := "Related product changes: " + + "https://cockroachlabs.atlassian.net/issues/?jql=project%20%3D%20%22DOC%22%20and%20%22Doc%20Type%5BDropdown%5D" + + "%22%20%3D%20%22Product%20Change%22%20AND%20description%20~%20%22commit%2F" + + crdbSha + "%22%20ORDER%20BY%20created%20DESC\n\n---" + for i, rn := range rnBodySlice { + rnBodySlice[i] = strings.Replace(rn, "\n---", relatedProductChanges, -1) + } + } return rnBodySlice } diff --git a/pkg/cmd/docs-issue-generation/docs_issue_generation_test.go b/pkg/cmd/docs-issue-generation/docs_issue_generation_test.go index 78ec9ea2bf2e..b56848553424 100644 --- a/pkg/cmd/docs-issue-generation/docs_issue_generation_test.go +++ 
b/pkg/cmd/docs-issue-generation/docs_issue_generation_test.go @@ -360,6 +360,88 @@ This increases troubleshootability. Release note: None`, rns: []string{}, }, + { + prNum: "104265", + sha: "d756dec1b9d7245305ab706e68e2ec3de0e61ffc", + commitMessage: `Release note (cli change): The log output formats ` + "`crdb-v1`" + ` and +` + "`crdb-v2`" + ` now support the format option ` + "`timezone`" + `. When specified, +the corresponding time zone is used to produce the timestamp column. + +For example: +` + "```" + `yaml +file-defaults: + format: crdb-v2 + format-options: {timezone: america/new_york} +` + "```" + ` + +Example logging output: +` + "```" + ` +I230606 12:43:01.553407-040000 1 1@cli/start.go:575 ⋮ [n?] 4 soft memory limit of Go runtime is set to 35 GiB +^^^^^^^ indicates GMT-4 was used. +` + "```" + ` + +The timezone offset is also always included in the format if it is not +zero (e.g. for non-UTC time zones). This is necessary to ensure that +the times can be read back precisely. + +Release note (cli change): The command ` + "`cockroach debug merge-log`" + ` was +adapted to understand time zones in input files read with format +` + "`crdb-v1`" + ` or ` + "`crdb-v2`" + `. + +Release note (backward-incompatible change): When a deployment is +configured to use a time zone (new feature) for log file output using +formats ` + "`crdb-v1`" + ` or ` + "`crdb-v2`" + `, it becomes impossible to process the +new output log files using the ` + "`cockroach debug merge-log`" + ` command +from a previous version. The newest ` + "`cockroach debug merge-log`" + ` code +must be used instead.`, + rns: []string{`Related PR: https://github.com/cockroachdb/cockroach/pull/104265 +Commit: https://github.com/cockroachdb/cockroach/commit/d756dec1b9d7245305ab706e68e2ec3de0e61ffc +Related product changes: https://cockroachlabs.atlassian.net/issues/?jql=project%20%3D%20%22DOC%22%20and%20%22Doc%20Type%5BDropdown%5D%22%20%3D%20%22Product%20Change%22%20AND%20description%20~%20%22commit%2Fd756dec1b9d7245305ab706e68e2ec3de0e61ffc%22%20ORDER%20BY%20created%20DESC + +--- + +Release note (cli change): The log output formats ` + "`crdb-v1`" + ` and +` + "`crdb-v2`" + ` now support the format option ` + "`timezone`" + `. When specified, +the corresponding time zone is used to produce the timestamp column. + +For example: +` + "```" + `yaml +file-defaults: + format: crdb-v2 + format-options: {timezone: america/new_york} +` + "```" + ` + +Example logging output: +` + "```" + ` +I230606 12:43:01.553407-040000 1 1@cli/start.go:575 ⋮ [n?] 4 soft memory limit of Go runtime is set to 35 GiB +^^^^^^^ indicates GMT-4 was used. +` + "```" + ` + +The timezone offset is also always included in the format if it is not +zero (e.g. for non-UTC time zones). 
This is necessary to ensure that +the times can be read back precisely.`, + `Related PR: https://github.com/cockroachdb/cockroach/pull/104265 +Commit: https://github.com/cockroachdb/cockroach/commit/d756dec1b9d7245305ab706e68e2ec3de0e61ffc +Related product changes: https://cockroachlabs.atlassian.net/issues/?jql=project%20%3D%20%22DOC%22%20and%20%22Doc%20Type%5BDropdown%5D%22%20%3D%20%22Product%20Change%22%20AND%20description%20~%20%22commit%2Fd756dec1b9d7245305ab706e68e2ec3de0e61ffc%22%20ORDER%20BY%20created%20DESC + +--- + +Release note (cli change): The command ` + "`cockroach debug merge-log`" + ` was +adapted to understand time zones in input files read with format +` + "`crdb-v1`" + ` or ` + "`crdb-v2`" + `.`, + `Related PR: https://github.com/cockroachdb/cockroach/pull/104265 +Commit: https://github.com/cockroachdb/cockroach/commit/d756dec1b9d7245305ab706e68e2ec3de0e61ffc +Related product changes: https://cockroachlabs.atlassian.net/issues/?jql=project%20%3D%20%22DOC%22%20and%20%22Doc%20Type%5BDropdown%5D%22%20%3D%20%22Product%20Change%22%20AND%20description%20~%20%22commit%2Fd756dec1b9d7245305ab706e68e2ec3de0e61ffc%22%20ORDER%20BY%20created%20DESC + +--- + +Release note (backward-incompatible change): When a deployment is +configured to use a time zone (new feature) for log file output using +formats ` + "`crdb-v1`" + ` or ` + "`crdb-v2`" + `, it becomes impossible to process the +new output log files using the ` + "`cockroach debug merge-log`" + ` command +from a previous version. The newest ` + "`cockroach debug merge-log`" + ` code +must be used instead.`}, + }, } for _, tc := range testCases { t.Run(tc.prNum, func(t *testing.T) {

From f80b346584ddf092c2a5b4f6ae81a83141e87128 Mon Sep 17 00:00:00 2001
From: Yahor Yuzefovich
Date: Mon, 26 Jun 2023 11:10:20 -0700
Subject: [PATCH 4/4] roachtest: harden and extend cancel roachtest

This commit hardens the `cancel` roachtest. In particular, this test
involves two goroutines: the runner that executes a long-running TPCH
query and the main goroutine that cancels that query. Previously, in
order to ensure that the main goroutine attempts to cancel the query at
the right moment, we slept for 250ms. Then, we would cancel all running
non-internal queries other than `SHOW CLUSTER QUERIES` itself. This was
problematic for a couple of reasons:
- the TPCH query might not have started yet (due to some Go scheduling
  delays)
- we could actually try to cancel one of the setup queries (the runner
  does `USE tpch;` and `SET distsql = off;` before running the TPCH
  query).

In order to address the first reason, this commit adjusts the runner to
notify the main goroutine only after the setup queries are done and
introduces a polling loop that waits until the TPCH query shows up. Each
polling iteration sleeps for a random duration of up to 1000ms (in order
to improve the test coverage of both the optimizer and the execution
engine). Note that we only check that the cancellation occurred within
3s (it used to be 5s before this commit), so we don't sufficiently
exercise the optimizer cancellation (which isn't the primary goal of
this test anyway).

The second reason is addressed by blocking the main goroutine until the
setup queries are done. Additionally, the test now runs each query five
times in order to increase test coverage.
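The new coordination between the query runner and the canceling
goroutine can be summarized with a short self-contained sketch
(standalone toy code, not the roachtest harness; the helper and channel
names are illustrative):

```go
package main

import (
	"errors"
	"fmt"
	"math/rand"
	"time"
)

// queryIsRunning stands in for polling SHOW CLUSTER QUERIES.
func queryIsRunning() bool { return rand.Intn(4) == 0 }

func main() {
	// sem is closed by the runner once its setup work is done, so the
	// canceling side never races with the setup queries.
	sem := make(chan struct{})
	errCh := make(chan error, 1)

	go func() {
		time.Sleep(50 * time.Millisecond) // setup phase (USE tpch; SET distsql = off;)
		close(sem)
		time.Sleep(10 * time.Second) // long-running query phase
		errCh <- errors.New("query completed before it could be canceled")
	}()

	<-sem // wait for the setup queries to finish

	// Poll until the query shows up, sleeping a random duration up to 1000ms
	// on each iteration, and give up after 10 seconds.
	timeoutCh := time.After(10 * time.Second)
	for {
		time.Sleep(time.Duration(rand.Intn(1001)) * time.Millisecond)
		if queryIsRunning() {
			fmt.Println("query observed; it would be canceled here")
			return
		}
		select {
		case err := <-errCh:
			fmt.Println("runner finished before cancellation:", err)
			return
		case <-timeoutCh:
			fmt.Println("didn't see the query to cancel within 10 seconds")
			return
		default:
		}
	}
}
```

In the actual test, the observed query ID is then canceled with
`CANCEL QUERY $1`, and the runner is expected to report the cancellation
error within 3 seconds.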
Release note: None --- pkg/cmd/roachtest/tests/cancel.go | 144 +++++++++++++++++++----------- 1 file changed, 93 insertions(+), 51 deletions(-) diff --git a/pkg/cmd/roachtest/tests/cancel.go b/pkg/cmd/roachtest/tests/cancel.go index 065ff216134d..fbe1be8d292b 100644 --- a/pkg/cmd/roachtest/tests/cancel.go +++ b/pkg/cmd/roachtest/tests/cancel.go @@ -22,6 +22,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" "github.com/cockroachdb/cockroach/pkg/roachprod/install" "github.com/cockroachdb/cockroach/pkg/util/cancelchecker" + "github.com/cockroachdb/cockroach/pkg/util/randutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/workload/tpch" "github.com/cockroachdb/errors" @@ -58,63 +59,104 @@ func registerCancel(r registry.Registry) { t.Fatal(err) } - queryPrefix := "USE tpch; " - if !useDistsql { - queryPrefix += "SET distsql = off; " - } - t.Status("running queries to cancel") + rng, _ := randutil.NewTestRand() for _, queryNum := range tpchQueriesToRun { - // sem is used to indicate that the query-runner goroutine has - // been spawned up. - sem := make(chan struct{}) - // Any error regarding the cancellation (or of its absence) will - // be sent on errCh. - errCh := make(chan error, 1) - go func(queryNum int) { - defer close(errCh) - query := tpch.QueriesByNumber[queryNum] - t.L().Printf("executing q%d\n", queryNum) - sem <- struct{}{} - close(sem) - _, err := conn.Exec(queryPrefix + query) - if err == nil { - errCh <- errors.New("query completed before it could be canceled") - } else { - fmt.Printf("query failed with error: %s\n", err) - // Note that errors.Is() doesn't work here because - // lib/pq wraps the query canceled error. - if !strings.Contains(err.Error(), cancelchecker.QueryCanceledError.Error()) { - errCh <- errors.Wrap(err, "unexpected error") + // Run each query 5 times to increase the test coverage. + for run := 0; run < 5; run++ { + // sem is used to indicate that the query-runner goroutine + // has been spawned up and has done preliminary setup. + sem := make(chan struct{}) + // An error will always be sent on errCh by the runner + // (either query execution error or an error indicating the + // absence of expected cancellation error). + errCh := make(chan error, 1) + go func(queryNum int) { + runnerConn := c.Conn(ctx, t.L(), 1) + defer runnerConn.Close() + setupQueries := []string{"USE tpch;"} + if !useDistsql { + setupQueries = append(setupQueries, "SET distsql = off;") + } + for _, setupQuery := range setupQueries { + t.L().Printf("executing setup query %q", setupQuery) + if _, err := runnerConn.Exec(setupQuery); err != nil { + errCh <- err + close(sem) + return + } + } + query := tpch.QueriesByNumber[queryNum] + t.L().Printf("executing q%d\n", queryNum) + close(sem) + _, err := runnerConn.Exec(query) + if err == nil { + err = errors.New("query completed before it could be canceled") + } + errCh <- err + }(queryNum) + + // Wait for the query-runner goroutine to start as well as + // to execute setup queries. + <-sem + + // Continuously poll until we get the queryID that we want + // to cancel. We expect it to show up within 10 seconds. + var queryID, query string + timeoutCh := time.After(10 * time.Second) + pollingStartTime := timeutil.Now() + for { + // Sleep for some random duration up to 1000ms. This + // allows us to sometimes find the query when it's in + // the planning stage while in most cases it's in the + // execution stage already. 
+ toSleep := time.Duration(rng.Intn(1001)) * time.Millisecond + t.Status(fmt.Sprintf("sleeping for %s", toSleep)) + time.Sleep(toSleep) + rows, err := conn.Query(`SELECT query_id, query FROM [SHOW CLUSTER QUERIES] WHERE query NOT LIKE '%SHOW CLUSTER QUERIES%'`) + if err != nil { + t.Fatal(err) + } + if rows.Next() { + if err = rows.Scan(&queryID, &query); err != nil { + t.Fatal(err) + } + break + } + if err = rows.Close(); err != nil { + t.Fatal(err) + } + select { + case err = <-errCh: + t.Fatalf("received an error from the runner goroutine before the query could be canceled: %v", err) + case <-timeoutCh: + t.Fatal(errors.New("didn't see the query to cancel within 10 seconds")) + default: } } - }(queryNum) - - // Wait for the query-runner goroutine to start. - <-sem - - // The cancel query races with the execution of the query it's trying to - // cancel, which may result in attempting to cancel the query before it - // has started. To be more confident that the query is executing, wait - // a bit before attempting to cancel it. - time.Sleep(250 * time.Millisecond) - - const cancelQuery = `CANCEL QUERIES - SELECT query_id FROM [SHOW CLUSTER QUERIES] WHERE query not like '%SHOW CLUSTER QUERIES%'` - c.Run(ctx, c.Node(1), `./cockroach sql --insecure -e "`+cancelQuery+`"`) - cancelStartTime := timeutil.Now() - - select { - case err, ok := <-errCh: - if ok { - t.Fatal(err) + + t.Status(fmt.Sprintf("canceling the query after waiting for %s", timeutil.Since(pollingStartTime))) + _, err := conn.Exec(`CANCEL QUERY $1`, queryID) + if err != nil { + t.Status(fmt.Sprintf("%s: %q", queryID, query)) + t.Fatalf("encountered an error when canceling %q with queryID=%s: %v", query, queryID, err) } - // If errCh is closed, then the cancellation was successful. - timeToCancel := timeutil.Since(cancelStartTime) - fmt.Printf("canceling q%d took %s\n", queryNum, timeToCancel) + cancelStartTime := timeutil.Now() - case <-time.After(5 * time.Second): - t.Fatal("query took too long to respond to cancellation") + select { + case err := <-errCh: + t.Status(err) + if !strings.Contains(err.Error(), cancelchecker.QueryCanceledError.Error()) { + // Note that errors.Is() doesn't work here because + // lib/pq wraps the query canceled error. + t.Fatal(errors.Wrap(err, "unexpected error")) + } + timeToCancel := timeutil.Since(cancelStartTime) + t.Status(fmt.Sprintf("canceling q%d took %s\n", queryNum, timeToCancel)) + + case <-time.After(3 * time.Second): + t.Fatal("query took too long to respond to cancellation") + } } }