roachtest: harden tpchvec/perf
This commit improves the `tpchvec/perf` roachtest so that it's less likely
to fail due to an intermittent performance flake. In particular, the test
has an assertion that if a query's runtime in the ON config is 1.5x slower
than in the OFF config, then debug bundles are collected and the test is
failed. However, we've seen quite a few cases where those bundles don't
explain the slowness (which is likely intermittent). To prevent these false
positives, this commit changes the test to run the query that was marked as
too slow one more time, and to fail only if it is again significantly slower
in the ON config than in the OFF config.

Release note: None
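
For readers skimming the diff below, here is a minimal standalone sketch of the hardened check in plain Go, outside the roachtest harness; the helper names (medianSeconds, tooSlow) and the sample timings are illustrative, not part of the actual test code:

package main

import (
	"fmt"
	"sort"
)

// medianSeconds returns the median of the recorded per-run query times.
func medianSeconds(times []float64) float64 {
	sort.Float64s(times)
	return times[len(times)/2]
}

// tooSlow reports whether the median ON time exceeds the slowness threshold
// relative to the median OFF time (1.5x in the roachtest).
func tooSlow(onTimes, offTimes []float64, threshold float64) bool {
	return medianSeconds(onTimes) >= threshold*medianSeconds(offTimes)
}

func main() {
	const threshold = 1.5
	// Hypothetical first-pass timings where the ON config looks too slow.
	onTimes := []float64{3.2, 3.4, 3.1}
	offTimes := []float64{1.9, 2.0, 2.1}
	if tooSlow(onTimes, offTimes, threshold) {
		// Instead of failing immediately, the hardened test re-runs the query
		// once more and only fails if the slowness reproduces.
		rerunOn := []float64{2.0, 2.1, 1.9}
		rerunOff := []float64{1.9, 2.0, 2.1}
		if tooSlow(rerunOn, rerunOff, threshold) {
			fmt.Println("slowness reproduced: collect bundles and fail the test")
		} else {
			fmt.Println("slowness did not reproduce: likely a flake, proceed")
		}
	}
}

The second tooSlow check mirrors the diff's newOnTime < p.slownessThreshold*newOffTime condition: a single slow median no longer fails the test on its own.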
yuzefovich committed Jun 2, 2023
1 parent 7ba92b8 commit c5c7ead
Showing 1 changed file with 71 additions and 26 deletions.
97 changes: 71 additions & 26 deletions pkg/cmd/roachtest/tests/tpchvec.go
@@ -140,6 +140,28 @@ func (h *tpchVecPerfHelper) parseQueryOutput(t test.Test, output []byte, setupId
}
}

func (h *tpchVecPerfHelper) getQueryTimes(
	t test.Test, numRunsPerQuery, queryNum int,
) (onTime, offTime float64) {
	findMedian := func(times []float64) float64 {
		sort.Float64s(times)
		return times[len(times)/2]
	}
	onTimes := h.timeByQueryNum[tpchPerfTestOnConfigIdx][queryNum]
	onName := h.setupNames[tpchPerfTestOnConfigIdx]
	offTimes := h.timeByQueryNum[tpchPerfTestOffConfigIdx][queryNum]
	offName := h.setupNames[tpchPerfTestOffConfigIdx]
	if len(onTimes) != numRunsPerQuery {
		t.Fatal(fmt.Sprintf("[q%d] unexpectedly wrong number of run times "+
			"recorded with %s config: %v", queryNum, onName, onTimes))
	}
	if len(offTimes) != numRunsPerQuery {
		t.Fatal(fmt.Sprintf("[q%d] unexpectedly wrong number of run times "+
			"recorded with %s config: %v", queryNum, offName, offTimes))
	}
	return findMedian(onTimes), findMedian(offTimes)
}

// compareSetups compares the runtimes of TPCH queries in different setups and
// logs that comparison. The expectation is that the second "ON" setup should be
// faster, and if that is not the case, then a warning message is included in
@@ -151,24 +173,11 @@ func (h *tpchVecPerfHelper) compareSetups(
) {
	t.Status("comparing the runtimes (only median values for each query are compared)")
	for queryNum := 1; queryNum <= tpch.NumQueries; queryNum++ {
		findMedian := func(times []float64) float64 {
			sort.Float64s(times)
			return times[len(times)/2]
		}
		onTimes := h.timeByQueryNum[tpchPerfTestOnConfigIdx][queryNum]
		onName := h.setupNames[tpchPerfTestOnConfigIdx]
		offTimes := h.timeByQueryNum[tpchPerfTestOffConfigIdx][queryNum]
		offName := h.setupNames[tpchPerfTestOffConfigIdx]
		if len(onTimes) != numRunsPerQuery {
			t.Fatal(fmt.Sprintf("[q%d] unexpectedly wrong number of run times "+
				"recorded with %s config: %v", queryNum, onName, onTimes))
		}
		if len(offTimes) != numRunsPerQuery {
			t.Fatal(fmt.Sprintf("[q%d] unexpectedly wrong number of run times "+
				"recorded with %s config: %v", queryNum, offName, offTimes))
		}
		onTime := findMedian(onTimes)
		offTime := findMedian(offTimes)
		onTime, offTime := h.getQueryTimes(t, numRunsPerQuery, queryNum)
		if offTime < onTime {
			t.L().Printf(
				fmt.Sprintf("[q%d] %s was faster by %.2f%%: "+
@@ -246,9 +255,41 @@ func (p *tpchVecPerfTest) postTestRunHook(
	p.tpchVecPerfHelper.compareSetups(t, runConfig.numRunsPerQuery, func(queryNum int, onTime, offTime float64, onTimes, offTimes []float64) {
		if onTime >= p.slownessThreshold*offTime {
			// For some reason, the ON setup executed the query a lot slower
			// than the OFF setup which is unexpected. In order to understand
			// where the slowness comes from, we will run EXPLAIN ANALYZE
			// (DEBUG) of the query with all setup options
			// than the OFF setup which is unexpected.

			// Check whether we can reproduce this slowness to prevent false
			// positives.
			var helper tpchVecPerfHelper
			for setupIdx, setup := range runConfig.clusterSetups {
				performClusterSetup(t, conn, setup)
				result, err := c.RunWithDetailsSingleNode(
					ctx, t.L(), c.Node(1),
					getTPCHVecWorkloadCmd(runConfig.numRunsPerQuery, queryNum),
				)
				workloadOutput := result.Stdout + result.Stderr
				t.L().Printf(workloadOutput)
				if err != nil {
					// Note: if you see an error like "exit status 1", it is
					// likely caused by the erroneous output of the query.
					t.Fatal(err)
				}
				helper.parseQueryOutput(t, []byte(workloadOutput), setupIdx)
			}
			newOnTime, newOffTime := helper.getQueryTimes(t, runConfig.numRunsPerQuery, queryNum)
			if newOnTime < p.slownessThreshold*newOffTime {
				// This time the slowness threshold was not exceeded, so we
				// don't fail the test.
				t.L().Printf(fmt.Sprintf(
					"[q%d] after re-running: %.2fs ON vs %.2fs OFF (proceeding)", queryNum, newOnTime, newOffTime,
				))
				return
			}
			t.L().Printf(fmt.Sprintf(
				"[q%d] after re-running: %.2fs ON vs %.2fs OFF (failing)", queryNum, newOnTime, newOffTime,
			))

			// In order to understand where the slowness comes from, we will run
			// EXPLAIN ANALYZE (DEBUG) of the query with all setup options
			// tpchPerfTestNumRunsPerQuery times (hoping at least one will
			// "catch" the slowness).
			for setupIdx, setup := range runConfig.clusterSetups {
@@ -438,22 +479,26 @@ func (d tpchVecDiskTest) getRunConfig() tpchVecTestRunConfig {
	return runConfig
}

func getTPCHVecWorkloadCmd(numRunsPerQuery, queryNum int) string {
	// Note that we use --default-vectorize flag which tells tpch workload to
	// use the current cluster setting sql.defaults.vectorize which must have
	// been set correctly in preQueryRunHook.
	return fmt.Sprintf("./workload run tpch --concurrency=1 --db=tpch "+
		"--default-vectorize --max-ops=%d --queries=%d {pgurl:1} --enable-checks=true",
		numRunsPerQuery, queryNum)
}

func baseTestRun(
	ctx context.Context, t test.Test, c cluster.Cluster, conn *gosql.DB, tc tpchVecTestCase,
) {
	firstNode := c.Node(1)
	runConfig := tc.getRunConfig()
	for queryNum := 1; queryNum <= tpch.NumQueries; queryNum++ {
		for setupIdx, setup := range runConfig.clusterSetups {
			tc.preQueryRunHook(t, conn, setup)
			// Note that we use --default-vectorize flag which tells tpch
			// workload to use the current cluster setting
			// sql.defaults.vectorize which must have been set correctly in
			// preQueryRunHook.
			cmd := fmt.Sprintf("./workload run tpch --concurrency=1 --db=tpch "+
				"--default-vectorize --max-ops=%d --queries=%d {pgurl:1} --enable-checks=true",
				runConfig.numRunsPerQuery, queryNum)
			result, err := c.RunWithDetailsSingleNode(ctx, t.L(), firstNode, cmd)
			result, err := c.RunWithDetailsSingleNode(
				ctx, t.L(), c.Node(1),
				getTPCHVecWorkloadCmd(runConfig.numRunsPerQuery, queryNum),
			)
			workloadOutput := result.Stdout + result.Stderr
			t.L().Printf(workloadOutput)
			if err != nil {
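
For reference, a self-contained snippet showing the command string that the new getTPCHVecWorkloadCmd helper builds; only the fmt.Sprintf call is copied from the diff above, while the surrounding main and the argument values (3 runs of query 9) are hypothetical scaffolding:

package main

import "fmt"

func main() {
	numRunsPerQuery, queryNum := 3, 9
	cmd := fmt.Sprintf("./workload run tpch --concurrency=1 --db=tpch "+
		"--default-vectorize --max-ops=%d --queries=%d {pgurl:1} --enable-checks=true",
		numRunsPerQuery, queryNum)
	// Prints:
	// ./workload run tpch --concurrency=1 --db=tpch --default-vectorize --max-ops=3 --queries=9 {pgurl:1} --enable-checks=true
	fmt.Println(cmd)
}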
