Skip to content

Commit

Permalink
Merge #67835
Browse files Browse the repository at this point in the history
67835: roachtest: more multi-region TPC-C fine tuning r=ajstorm a=otan

Resolves #67777

See individual commits for details.

Co-authored-by: Oliver Tan <[email protected]>
  • Loading branch information
craig[bot] and otan committed Jul 22, 2021
2 parents f0e2aa6 + 4f28ca2 commit c3049f4
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 13 deletions.
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -1875,7 +1875,7 @@ func cmdLogFileName(t time.Time, nodes option.NodeListOption, args ...string) st
}
logFile := fmt.Sprintf(
"run_%s_n%s_%s",
t.Format(`150405.000`),
t.Format(`150405.000000000`),
nodes.String()[1:],
s,
)
Expand Down
2 changes: 1 addition & 1 deletion pkg/cmd/roachtest/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ func TestClusterMachineType(t *testing.T) {
func TestCmdLogFileName(t *testing.T) {
ts := time.Date(2000, 1, 1, 15, 4, 12, 0, time.Local)

const exp = `run_150412.000_n1,3-4,9_cockroach_bla`
const exp = `run_150412.000000000_n1,3-4,9_cockroach_bla`
nodes := option.NodeListOption{1, 3, 4, 9}
assert.Equal(t,
exp,
Expand Down
35 changes: 24 additions & 11 deletions pkg/cmd/roachtest/tests/tpcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,20 +247,26 @@ func runTPCC(ctx context.Context, t test.Test, c cluster.Cluster, opts tpccOptio
// Make a copy of i for the goroutine.
i := i
m.Go(func(ctx context.Context) error {
// Only prefix stats.json with "workload_<i>." if we have multiple workloads,
// in case other processes relied on previous behavior.
var statsPrefix string
if len(workloadInstances) > 1 {
statsPrefix = fmt.Sprintf("workload_%d.", i)
}
t.WorkerStatus(fmt.Sprintf("running tpcc idx %d on %s", i, pgURLs[i]))
cmd := fmt.Sprintf(
"./cockroach workload run tpcc --warehouses=%d --histograms="+t.PerfArtifactsDir()+"/stats.json "+
"./cockroach workload run tpcc --warehouses=%d --histograms="+t.PerfArtifactsDir()+"/%sstats.json "+
opts.ExtraRunArgs+" --ramp=%s --duration=%s --prometheus-port=%d --pprofport=%d %s %s",
opts.Warehouses,
statsPrefix,
rampDuration,
opts.Duration,
workloadInstances[i].prometheusPort,
workloadPProfStartPort+i,
workloadInstances[i].extraRunArgs,
pgURLs[i],
)
c.Run(ctx, workloadNode, cmd)
return nil
return c.RunE(ctx, workloadNode, cmd)
})
}
if opts.Chaos != nil {
Expand Down Expand Up @@ -487,11 +493,13 @@ func registerTPCC(r registry.Registry) {
zs = append(zs, s.zones)
}
const nodesPerRegion = 3
const warehousesPerRegion = 20

multiRegionTests := []struct {
desc string
name string
survivalGoal string
desc string
name string
survivalGoal string

chaosTarget func(iter int) option.NodeListOption
workloadInstances []workloadInstance
}{
Expand Down Expand Up @@ -595,7 +603,7 @@ func registerTPCC(r registry.Registry) {
iter := 0
chaosEventCh := make(chan ChaosEvent)
runTPCC(ctx, t, c, tpccOptions{
Warehouses: len(regions) * 20,
Warehouses: len(regions) * warehousesPerRegion,
Duration: duration,
ExtraSetupArgs: partitionArgs,
ExtraRunArgs: `--method=simple --wait=false --tolerate-errors ` + partitionArgs,
Expand Down Expand Up @@ -641,10 +649,15 @@ func registerTPCC(r registry.Registry) {
},
ch: chaosEventCh,
promClient: promv1.NewAPI(client),
// We see a slow trickle of errors after a server has been
// force shutdown due to queries before the shutdown not
// fully completing.
maxErrorsDuringUptime: 10,
// We see a slow trickle of errors after a server has been force shutdown due
// to queries before the shutdown not fully completing. You can inspect this
// by looking at the workload logs and correlating the errors with the
// prometheus graphs.
// The errors seen can be of the form:
// * ERROR: inbox communication error: rpc error: code = Canceled
// desc = context canceled (SQLSTATE 58C01)
// Setting this allows some errors to occur.
maxErrorsDuringUptime: warehousesPerRegion * 5,
// "delivery" does not trigger often.
allowZeroSuccessDuringUptime: true,
}, nil
Expand Down

0 comments on commit c3049f4

Please sign in to comment.