From c7fc2c11be117f5ed5f1e2e26339d4f84cf16630 Mon Sep 17 00:00:00 2001 From: Oliver Tan Date: Wed, 21 Jul 2021 09:22:47 +1000 Subject: [PATCH 1/2] roachtest: logging improvements to TPC-C * Increase precision of log file names to nanoseconds. This currently only has a precision of 1ms - but if we start TPC-C concurrently all the log files start within the same millisecond, so only 1 log file gets written (instead of 9). Give ourselves a larger chance by making it nanoseconds. * stats.json all override each other as well. Prefix stats.json with `workload_<i>.` if there are multiple TPC-C instances. Release note: None --- pkg/cmd/roachtest/cluster.go | 2 +- pkg/cmd/roachtest/cluster_test.go | 2 +- pkg/cmd/roachtest/tests/tpcc.go | 12 +++++++++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/pkg/cmd/roachtest/cluster.go b/pkg/cmd/roachtest/cluster.go index f2b448cfce33..fb74f9492867 100644 --- a/pkg/cmd/roachtest/cluster.go +++ b/pkg/cmd/roachtest/cluster.go @@ -1875,7 +1875,7 @@ func cmdLogFileName(t time.Time, nodes option.NodeListOption, args ...string) st } logFile := fmt.Sprintf( "run_%s_n%s_%s", - t.Format(`150405.000`), + t.Format(`150405.000000000`), nodes.String()[1:], s, ) diff --git a/pkg/cmd/roachtest/cluster_test.go b/pkg/cmd/roachtest/cluster_test.go index 4baf600c3dde..b934100726b8 100644 --- a/pkg/cmd/roachtest/cluster_test.go +++ b/pkg/cmd/roachtest/cluster_test.go @@ -352,7 +352,7 @@ func TestClusterMachineType(t *testing.T) { func TestCmdLogFileName(t *testing.T) { ts := time.Date(2000, 1, 1, 15, 4, 12, 0, time.Local) - const exp = `run_150412.000_n1,3-4,9_cockroach_bla` + const exp = `run_150412.000000000_n1,3-4,9_cockroach_bla` nodes := option.NodeListOption{1, 3, 4, 9} assert.Equal(t, exp, diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index 85481c4d63d2..ce280918e406 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -247,11 +247,18 @@ func runTPCC(ctx 
context.Context, t test.Test, c cluster.Cluster, opts tpccOptio // Make a copy of i for the goroutine. i := i m.Go(func(ctx context.Context) error { + // Only prefix stats.json with workload_<i>. if we have multiple workloads, + // in case other processes relied on previous behavior. + var statsPrefix string + if len(workloadInstances) > 1 { + statsPrefix = fmt.Sprintf("workload_%d.", i) + } t.WorkerStatus(fmt.Sprintf("running tpcc idx %d on %s", i, pgURLs[i])) cmd := fmt.Sprintf( - "./cockroach workload run tpcc --warehouses=%d --histograms="+t.PerfArtifactsDir()+"/stats.json "+ + "./cockroach workload run tpcc --warehouses=%d --histograms="+t.PerfArtifactsDir()+"/%sstats.json "+ opts.ExtraRunArgs+" --ramp=%s --duration=%s --prometheus-port=%d --pprofport=%d %s %s", opts.Warehouses, + statsPrefix, rampDuration, opts.Duration, workloadInstances[i].prometheusPort, @@ -259,8 +266,7 @@ func runTPCC(ctx context.Context, t test.Test, c cluster.Cluster, opts tpccOptio workloadInstances[i].extraRunArgs, pgURLs[i], ) - c.Run(ctx, workloadNode, cmd) - return nil + return c.RunE(ctx, workloadNode, cmd) }) } if opts.Chaos != nil { From 4f28ca21f8442bfe3c75c13e4da00753afbee9c0 Mon Sep 17 00:00:00 2001 From: Oliver Tan Date: Wed, 21 Jul 2021 13:45:26 +1000 Subject: [PATCH 2/2] roachtest/tpcc: further fine-tune error rates for multi-region TPC-C Increasing the warehouses increased the number of errors that appear! Added clarifying comments + increased the allowed error rate. 
Release note: None --- pkg/cmd/roachtest/tests/tpcc.go | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index ce280918e406..b4f4c633374d 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -493,11 +493,13 @@ func registerTPCC(r registry.Registry) { zs = append(zs, s.zones) } const nodesPerRegion = 3 + const warehousesPerRegion = 20 multiRegionTests := []struct { - desc string - name string - survivalGoal string + desc string + name string + survivalGoal string + chaosTarget func(iter int) option.NodeListOption workloadInstances []workloadInstance }{ @@ -601,7 +603,7 @@ func registerTPCC(r registry.Registry) { iter := 0 chaosEventCh := make(chan ChaosEvent) runTPCC(ctx, t, c, tpccOptions{ - Warehouses: len(regions) * 20, + Warehouses: len(regions) * warehousesPerRegion, Duration: duration, ExtraSetupArgs: partitionArgs, ExtraRunArgs: `--method=simple --wait=false --tolerate-errors ` + partitionArgs, @@ -647,10 +649,15 @@ func registerTPCC(r registry.Registry) { }, ch: chaosEventCh, promClient: promv1.NewAPI(client), - // We see a slow trickle of errors after a server has been - // force shutdown due to queries before the shutdown not - // fully completing. - maxErrorsDuringUptime: 10, + // We see a slow trickle of errors after a server has been force shutdown due + // to queries before the shutdown not fully completing. You can inspect this + // by looking at the workload logs and correlating the errors with the + // prometheus graphs. + // The errors seen can be of the form: + // * ERROR: inbox communication error: rpc error: code = Canceled + // desc = context canceled (SQLSTATE 58C01) + // Setting this allows some errors to occur. + maxErrorsDuringUptime: warehousesPerRegion * 5, // "delivery" does not trigger often. allowZeroSuccessDuringUptime: true, }, nil