diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index f2b448cfce33..fb74f9492867 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1875,7 +1875,7 @@ func cmdLogFileName(t time.Time, nodes option.NodeListOption, args ...string) st } logFile := fmt.Sprintf( "run_%s_n%s_%s", - t.Format(`150405.000`), + t.Format(`150405.000000000`), nodes.String()[1:], s, ) diff --git a/pkg/cmd/roachtest/cluster_test.go b/pkg/cmd/roachtest/cluster_test.go index 4baf600c3dde..b934100726b8 100644 --- a/pkg/cmd/roachtest/cluster_test.go +++ b/pkg/cmd/roachtest/cluster_test.go @@ -352,7 +352,7 @@ func TestClusterMachineType(t *testing.T) { func TestCmdLogFileName(t *testing.T) { ts := time.Date(2000, 1, 1, 15, 4, 12, 0, time.Local) - const exp = `run_150412.000_n1,3-4,9_cockroach_bla` + const exp = `run_150412.000000000_n1,3-4,9_cockroach_bla` nodes := option.NodeListOption{1, 3, 4, 9} assert.Equal(t, exp, diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index 85481c4d63d2..b4f4c633374d 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -247,11 +247,18 @@ func runTPCC(ctx context.Context, t test.Test, c cluster.Cluster, opts tpccOptio // Make a copy of i for the goroutine. i := i m.Go(func(ctx context.Context) error { + // Only prefix stats.json with workload_i_ if we have multiple workloads, + // in case other processes relied on previous behavior. 
+ var statsPrefix string + if len(workloadInstances) > 1 { + statsPrefix = fmt.Sprintf("workload_%d.", i) + } t.WorkerStatus(fmt.Sprintf("running tpcc idx %d on %s", i, pgURLs[i])) cmd := fmt.Sprintf( - "./cockroach workload run tpcc --warehouses=%d --histograms="+t.PerfArtifactsDir()+"/stats.json "+ + "./cockroach workload run tpcc --warehouses=%d --histograms="+t.PerfArtifactsDir()+"/%sstats.json "+ opts.ExtraRunArgs+" --ramp=%s --duration=%s --prometheus-port=%d --pprofport=%d %s %s", opts.Warehouses, + statsPrefix, rampDuration, opts.Duration, workloadInstances[i].prometheusPort, @@ -259,8 +266,7 @@ func runTPCC(ctx context.Context, t test.Test, c cluster.Cluster, opts tpccOptio workloadInstances[i].extraRunArgs, pgURLs[i], ) - c.Run(ctx, workloadNode, cmd) - return nil + return c.RunE(ctx, workloadNode, cmd) }) } if opts.Chaos != nil { @@ -487,11 +493,13 @@ func registerTPCC(r registry.Registry) { zs = append(zs, s.zones) } const nodesPerRegion = 3 + const warehousesPerRegion = 20 multiRegionTests := []struct { - desc string - name string - survivalGoal string + desc string + name string + survivalGoal string + chaosTarget func(iter int) option.NodeListOption workloadInstances []workloadInstance }{ @@ -595,7 +603,7 @@ func registerTPCC(r registry.Registry) { iter := 0 chaosEventCh := make(chan ChaosEvent) runTPCC(ctx, t, c, tpccOptions{ - Warehouses: len(regions) * 20, + Warehouses: len(regions) * warehousesPerRegion, Duration: duration, ExtraSetupArgs: partitionArgs, ExtraRunArgs: `--method=simple --wait=false --tolerate-errors ` + partitionArgs, @@ -641,10 +649,15 @@ func registerTPCC(r registry.Registry) { }, ch: chaosEventCh, promClient: promv1.NewAPI(client), - // We see a slow trickle of errors after a server has been - // force shutdown due to queries before the shutdown not - // fully completing. 
- maxErrorsDuringUptime: 10, + // We see a slow trickle of errors after a server has been force shutdown due + // to queries before the shutdown not + // fully completing. You can inspect this + // by looking at the workload logs and correlating the errors with the + // prometheus graphs. + // The errors seen can be of the form: + // * ERROR: inbox communication error: rpc error: code = Canceled + // desc = context canceled (SQLSTATE 58C01) + // Setting this allows some errors to occur. + maxErrorsDuringUptime: warehousesPerRegion * 5, // "delivery" does not trigger often. allowZeroSuccessDuringUptime: true, }, nil