Skip to content

Commit

Permalink
roachprod: add virtual cluster support to the monitor
Browse files Browse the repository at this point in the history
This makes the roachprod monitor aware of virtual clusters. This means
that the monitor is able to identify scenarios where multiple
instances are running on the same VM, and monitor the lifecycle of sql
instance processes just like it does for non virtual cluster
deployments.

Under the covers, this is achieved by leveraging the
"ROACHPROD_VIRTUAL_CLUSTER" environment variable that is set when
starting cockroach. The monitor then looks for processes that have
that environment variable set, and parses it to identify the virtual
cluster name and instance corresponding to that cockroach process.

This change should go unnoticed by all current monitor callers: tests
that monitor cockroach processes will continue to be notified of
events just like they did before. The only thing that changes is that,
if a test uses roachprod to start and stop sql instances, these events
will be available through the monitor.

Epic: none

Release note: None
  • Loading branch information
renatolabs committed Oct 2, 2023
1 parent 9a42878 commit c8d7695
Show file tree
Hide file tree
Showing 11 changed files with 378 additions and 115 deletions.
9 changes: 9 additions & 0 deletions pkg/cmd/roachprod/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,15 @@ environment variables to the cockroach process.
install.EnvOption(nodeEnv),
install.NumRacksOption(numRacks),
}

// Always pick a random available port when starting virtual
// clusters. We do not expose the functionality of choosing a
// specific port, so this is fine.
//
// TODO(renato): remove this once #111052 is addressed.
startOpts.SQLPort = 0
startOpts.AdminUIPort = 0

return roachprod.StartServiceForVirtualCluster(context.Background(),
config.Logger, targetRoachprodCluster, storageCluster, startOpts, clusterSettingsOpts...)
}),
Expand Down
15 changes: 10 additions & 5 deletions pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1394,17 +1394,22 @@ func (c *clusterImpl) assertNoDeadNode(ctx context.Context, t test.Test) error {
return err
}

deadNodes := 0
deadProcesses := 0
for info := range eventsCh {
t.L().Printf("%s", info)

if _, isDeath := info.Event.(install.MonitorNodeDead); isDeath {
deadNodes++
if _, isDeath := info.Event.(install.MonitorProcessDead); isDeath {
deadProcesses++
}
}

if deadNodes > 0 {
return errors.Newf("%d dead node(s) detected", deadNodes)
var plural string
if deadProcesses > 1 {
plural = "es"
}

if deadProcesses > 0 {
return errors.Newf("%d dead cockroach process%s detected", deadProcesses, plural)
}
return nil
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ func (m *monitorImpl) startNodeMonitor() {
}

for info := range eventsCh {
_, isDeath := info.Event.(install.MonitorNodeDead)
_, isDeath := info.Event.(install.MonitorProcessDead)
isExpectedDeath := isDeath && atomic.AddInt32(&m.expDeaths, -1) >= 0
var expectedDeathStr string
if isExpectedDeath {
Expand Down
1 change: 1 addition & 0 deletions pkg/roachprod/install/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ go_test(
name = "install_test",
srcs = [
"cluster_synced_test.go",
"cockroach_test.go",
"services_test.go",
"staging_test.go",
"start_template_test.go",
Expand Down
Loading

0 comments on commit c8d7695

Please sign in to comment.