Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
100121: backupccl: deflake TestFullClusterBackup r=adityamaru a=msbutler

TestFullClusterBackup would flake if the test ran at a specific time during the week due to cockroachdb#100094. This patch prevents this flake.

Fixes cockroachdb#100094

Release note: none

100182: roachprod: set COCKROACH_CONNECT_TIMEOUT to 1200s during start r=renatolabs,jbowens a=msbutler

Previously the sql cmds invoked within roachprod start had an infinite timeout. This patch reduces the timeout to 10 minutes -- if a node can't start within 10 minutes because a roachprod client can't connect to the new crdb node, there's likely a problem.

We decided to make this change after a roachtest with a 10 hr time out hung for 10 hours due to an internal start cmd. See cockroachdb#99280.

Informs cockroachdb#99280

Release note: none

Co-authored-by: Michael Butler <[email protected]>
  • Loading branch information
craig[bot] and msbutler committed Mar 30, 2023
3 parents 2414bba + 0778e24 + 20053d9 commit 3de803f
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 8 deletions.
3 changes: 3 additions & 0 deletions pkg/ccl/backupccl/full_cluster_backup_restore_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,9 @@ CREATE TABLE data2.foo (a int);
case systemschema.CommentsTable.GetName():
query := fmt.Sprintf("SELECT comment FROM system.%s", table)
verificationQueries[i] = query
case systemschema.ScheduledJobsTable.GetName():
query := fmt.Sprintf("SELECT schedule_id, schedule_name FROM system.%s", table)
verificationQueries[i] = query
default:
query := fmt.Sprintf("SELECT * FROM system.%s", table)
verificationQueries[i] = query
Expand Down
20 changes: 12 additions & 8 deletions pkg/roachprod/install/cockroach.go
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,10 @@ type StartOpts struct {
KVCluster *SyncedCluster
}

// startSQLTimeout identifies the COCKROACH_CONNECT_TIMEOUT to use (in seconds)
// for sql cmds within syncedCluster.Start().
const startSQLTimeout = 1200

// StartTarget identifies what flavor of cockroach we are starting.
type StartTarget int

Expand Down Expand Up @@ -693,12 +697,12 @@ func (c *SyncedCluster) generateClusterSettingCmd(l *logger.Logger, node Node) s
// removed in v21.2.
clusterSettingCmd += fmt.Sprintf(`
if ! test -e %s ; then
COCKROACH_CONNECT_TIMEOUT=0 %s sql --url %s -e "SET CLUSTER SETTING server.remote_debugging.mode = 'any'" || true;
COCKROACH_CONNECT_TIMEOUT=0 %s sql --url %s -e "
COCKROACH_CONNECT_TIMEOUT=%d %s sql --url %s -e "SET CLUSTER SETTING server.remote_debugging.mode = 'any'" || true;
COCKROACH_CONNECT_TIMEOUT=%d %s sql --url %s -e "
SET CLUSTER SETTING cluster.organization = 'Cockroach Labs - Production Testing';
SET CLUSTER SETTING enterprise.license = '%s';" \
&& touch %s
fi`, path, binary, url, binary, url, config.CockroachDevLicense, path)
fi`, path, startSQLTimeout, binary, url, startSQLTimeout, binary, url, config.CockroachDevLicense, path)
return clusterSettingCmd
}

Expand All @@ -713,8 +717,8 @@ func (c *SyncedCluster) generateInitCmd(node Node) string {
binary := cockroachNodeBinary(c, node)
initCmd += fmt.Sprintf(`
if ! test -e %[1]s ; then
COCKROACH_CONNECT_TIMEOUT=0 %[2]s init --url %[3]s && touch %[1]s
fi`, path, binary, url)
COCKROACH_CONNECT_TIMEOUT=%[4]d %[2]s init --url %[3]s && touch %[1]s
fi`, path, binary, url, startSQLTimeout)
return initCmd
}

Expand Down Expand Up @@ -829,10 +833,10 @@ func (c *SyncedCluster) createFixedBackupSchedule(
node := c.Nodes[0]
binary := cockroachNodeBinary(c, node)
url := c.NodeURL("localhost", c.NodePort(node), "" /* tenantName */)
fullCmd := fmt.Sprintf(`COCKROACH_CONNECT_TIMEOUT=0 %s sql --url %s -e %q`,
binary, url, createScheduleCmd)
fullCmd := fmt.Sprintf(`COCKROACH_CONNECT_TIMEOUT=%d %s sql --url %s -e %q`,
startSQLTimeout, binary, url, createScheduleCmd)
// Instead of using `c.ExecSQL()`, use the more flexible c.newSession(), which allows us to
// 1) prefix the schedule backup cmd with COCKROACH_CONNECT_TIMEOUT=0.
// 1) prefix the schedule backup cmd with COCKROACH_CONNECT_TIMEOUT.
// 2) run the command against the first node in the cluster target.
sess := c.newSession(l, node, fullCmd, withDebugName("init-backup-schedule"))
defer sess.Close()
Expand Down

0 comments on commit 3de803f

Please sign in to comment.